In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from ast import literal_eval ## literal_eval converts into python object
master_dataset = pd.read_csv(
    r"C:\Users\WIN\OneDrive\Desktop\WiDS 5.0\final_movies_df.csv"
)
## cast, crew and keywords are loaded from the CSV as strings and must be parsed
## back into Python objects (presumably lists of dicts - confirm against the CSV).
## Only string cells are parsed: literal_eval raises on non-string values such as
## the NaN pandas uses for a missing cell, so those become an empty list instead.
for column in ('cast', 'crew', 'keywords'):
    master_dataset[column] = master_dataset[column].apply(
        lambda cell: literal_eval(cell) if isinstance(cell, str) else []
    )

def get_director(x):
    """
    Extract the name of the director from a movie's crew list.

    x is expected to be a list of dicts, each carrying a 'job' and a 'name'
    key. The 'name' of the first entry whose 'job' equals 'Director' is
    returned.

    np.nan is returned when no such entry exists, when x is not a list/tuple
    (e.g. NaN from a missing cell), or when the matching entry has no 'name'.
    Entries that are not dicts or that lack a 'job' key are skipped.
    """
    # Missing cells arrive as NaN (a float), not as a list of crew dicts
    if not isinstance(x, (list, tuple)):
        return np.nan
    for member in x:
        if isinstance(member, dict) and member.get('job') == 'Director':
            return member.get('name', np.nan)
    return np.nan

## Keeping only the names of the top 3 cast members so that every movie contributes
## a comparable number of cast tokens; a non-list cell becomes an empty list
master_dataset['cast'] = master_dataset['cast'].apply(
    lambda x: [i['name'] for i in x][:3] if isinstance(x, list) else []
)

## Setting keywords to empty list if it does not exist, otherwise keeping the name of each keyword
master_dataset['keywords'] = master_dataset['keywords'].apply(
    lambda x: [i['name'] for i in x] if isinstance(x, list) else []
)

## Extracting director names from the crew (get_director returns NaN when none is found)
master_dataset['director'] = master_dataset['crew'].apply(get_director)

## For uniqueness, lowercasing the cast names and removing all the spaces in between
master_dataset['cast'] = master_dataset['cast'].apply(
    lambda x: [i.replace(" ", "").lower() for i in x]
)

## Maintaining the original director name as main director
master_dataset['main_director'] = master_dataset['director']

## Repeating the normalised director name 3 times to maintain proportion with the
## (up to) 3 cast names. A missing director yields an empty list: casting NaN to
## str would produce the literal token 'nan', which every director-less movie
## would then appear to share.
master_dataset['director'] = master_dataset['director'].apply(
    lambda x: [x.replace(" ", "").lower()] * 3 if isinstance(x, str) else []
)

## NOTE: disabled step - presumably meant to keep only the keywords whose count is
## greater than 1 (`s` is not defined in this section; confirm before re-enabling)
#s = s[s > 1]

from nltk.stem.snowball import SnowballStemmer

# English Snowball stemmer used to reduce each keyword to its stem
stemmer = SnowballStemmer('english')

# For every keyword longer than one character: stem it, then drop the spaces
# and lowercase the result so the keyword becomes a single token
master_dataset['keywords'] = master_dataset['keywords'].apply(
    lambda words: [
        stemmer.stem(word).replace(" ", "").lower()
        for word in words
        if len(word) > 1
    ]
)

# First three rows of the processed keywords
master_dataset['keywords'].head(3)

# Any cell of the soup's source columns that is not a list is replaced by an empty list
for column in ['keywords', 'cast', 'director']:
    master_dataset[column] = master_dataset[column].apply(
        lambda cell: cell if isinstance(cell, list) else []
    )

# Build the soup: concatenate the three token lists row by row, then join
# the tokens of each row into one space-separated string
soup_tokens = (
    master_dataset['keywords']
    + master_dataset['cast']
    + master_dataset['director']
)
master_dataset['soup'] = soup_tokens.apply(' '.join)

# Progress output confirming that execution reached the save step
print("Saving file...")
print(master_dataset.shape)

master_dataset.to_csv(
    r"C:\Users\WIN\OneDrive\Desktop\WiDS 5.0\master_dataset_new.csv",
    index=False,
)

print("File saved successfully")
