### Splitting the existing data we have
Working with the full dataset in one piece is time-consuming, so we will instead use Python to split it into smaller CSV files.
I split it along the lines of:
1. Episode
2. Character
3. Season


### STEP 1:
Import the libraries; pandas and os make it easy to split and save the data.

In [1]:
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import os
from collections import Counter
import re



### STEP 2:
Set `file_path` to the CSV containing the entire script, then read it into a pandas DataFrame.

In [2]:
# Path to the CSV holding the complete script, relative to the working directory.
file_path = 'game_of_thrones_data.csv'

# Read the whole file into one DataFrame; the cells below split this frame.
df = pd.read_csv(filepath_or_buffer=file_path)

# To preview the first 10 records, run:
# df.head(10)

### STEP 3:
Create the output directories with the `os` library; passing `exist_ok` makes it fine if a directory already exists.

In [3]:
# Create one output folder per kind of split. exist_ok is passed as a real
# boolean (not a string) so that re-running this cell is safe when the
# folders already exist.
for _output_dir in ('Seasons', 'Name', 'Episode'):
    os.makedirs(_output_dir, exist_ok=True)

### STEP 4:
1. Seasonal split: split the full df into one df per unique Season value and save each as its own CSV.
2. Same thing for Episodes.
3. Same thing for a selected list of characters.

In [5]:
def split_by_column(frame, column, out_dir):
    """Write one CSV per distinct value of ``column`` into ``out_dir``.

    Parameters
    ----------
    frame : pandas.DataFrame
        The data to split.
    column : str
        Name of the column whose distinct values define the groups.
    out_dir : str
        Directory that receives one ``<value>.csv`` file per group. It is
        created if it does not exist yet.

    Notes
    -----
    Rows are grouped in a single pass with ``groupby`` instead of scanning the
    whole frame once per value. Rows whose ``column`` is missing (NaN) are
    skipped, so no empty ``nan.csv`` is produced. ``sort=False`` keeps groups
    in order of first appearance.
    """
    os.makedirs(out_dir, exist_ok=True)
    for value, subset in frame.groupby(column, sort=False):
        subset.to_csv(f'{out_dir}/{value}.csv', index=False)


# One file per season.
split_by_column(df, 'Season', 'Seasons')

# One file per episode.
# NOTE(review): if the same Episode value occurs in more than one season, those
# rows end up in the same file -- confirm that is intended.
split_by_column(df, 'Episode', 'Episode')

def normalize_name(name):
    """Return ``name`` in a canonical form for comparison.

    Every run of whitespace is collapsed to a single space, leading and
    trailing whitespace is removed, and the result is lower-cased. Any
    non-string input (for example a missing value) yields an empty string.
    """
    if not isinstance(name, str):
        return ''
    collapsed = re.sub(r'\s+', ' ', name)
    return collapsed.strip().lower()

# Display names of the characters that get their own CSV in 'Name/'.
# The code below compares df['Name'] against these names after passing both
# through normalize_name(), so case and whitespace differences in the data do
# not matter; the spelling here is the one used for the output file name.
# NOTE(review): how this list was selected is not shown here -- confirm it is
# still the intended set.
# NOTE(review): 'Maester Lewin' may be a misspelling of 'Luwin'; check the
# spelling used in the dataset before changing it.
top_characters = [
    'Will', 'Rickard Karstark', 'Syrio Forel', 'Kraznys mo Nakloz', 'Lyanna Stark',
    'Maester Lewin', 'Prince Doran Martell', 'Leaf', 'Mirri Maz Duur', 'Janos Slynt',
    'Euron Greyjoy', 'Lady Crane', 'Qhorin Halfhand', 'Robin Arryn', 'Lysa Arryn',
    'Ros', 'Hot Pie', 'Meryn Trant', 'Selyse Baratheon', 'Rickon Stark', 'Jeor Mormont',
    'The Waif', 'Wun Wun', 'Pyat Pree', 'Obara Sand', 'Edd Tollett', 'Three-Eyed Raven',
    'Tommen Baratheon', 'Balon Greyjoy', 'Mance Rayder', 'Craster', 'Maester Aemon',
    'Mace Tyrell', 'Maester Pycelle', 'Lancel Lannister', 'Olly', 'Osha', 'Alliser Thorne',
    'Talisa Maegyr', 'Qyburn', 'Gendry', 'Benjen Stark', 'Barristan Selmy', 'Edmure Tully',
    'Beric Dondarrion', 'Missandei', 'Jojen Reed', 'Thoros of Myr', 'Brynden "The Blackfish" Tully',
    'Shae', 'Hodor', 'Khal Drogo', 'Loras Tyrell', 'Daario Naharis', 'Ellaria Sand',
    'High Sparrow', 'Jaqen H\'ghar', 'Renly Baratheon', 'Shireen Baratheon', 'Meera Reed',
    'Podrick Payne', 'Yara Greyjoy', 'Grey Worm', 'Tormund Giantsbane', 'Gilly',
    'Robert Baratheon', 'Gregor Clegane', 'The Night King', 'Ygritte', 'Samwell Tarly',
    'Walder Frey', 'Oberyn Martell', 'Robb Stark', 'Roose Bolton', 'Ramsay Bolton',
    'Olenna Tyrell', 'Jorah Mormont', 'Stannis Baratheon', 'Bronn', 'Margaery Tyrell',
    'Theon Greyjoy', 'Melisandre', 'Sandor "The Hound" Clegane', 'Bran Stark', 'Joffrey Baratheon',
    'Petyr "Littlefinger" Baelish', 'Brienne of Tarth', 'Davos Seaworth', 'Sansa Stark',
    'Catelyn Stark', 'Varys', 'Tywin Lannister', 'Eddard Stark', 'Jaime Lannister',
    'Arya Stark', 'Cersei Lannister', 'Jon Snow', 'Daenerys Targaryen', 'Tyrion Lannister'
]

# Map each normalized spelling to the display name used for the output file.
normalized_top_characters = {normalize_name(char): char for char in top_characters}

# Make sure the target folder exists so this cell can run on its own.
os.makedirs('Name', exist_ok=True)

# Normalize the speaker of every row once, then group rows by that normalized
# key. Grouping (rather than looping over the raw unique spellings) matters:
# when the data contains several spellings of one character (different case or
# spacing), they all map to the same output file, and writing them one raw
# spelling at a time would overwrite the file and keep only the last variant.
# Non-string names (e.g. missing values) normalize to '' and are never selected.
normalized_names = df['Name'].map(normalize_name)

for normalized_name, name_df in df.groupby(normalized_names, sort=False):
    if normalized_name not in normalized_top_characters:
        continue
    # NOTE(review): some display names contain double quotes, which are not
    # valid in file names on Windows -- confirm the target platform.
    file_name = f'Name/{normalized_top_characters[normalized_name]}.csv'
    name_df.to_csv(file_name, index=False)
