In [17]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer  # Import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import string

### Clean the CSV

In [13]:
# Read the raw movie table from disk.
df = pd.read_csv("imdb_movies.csv")

# Trim the trailing six columns (per the original note these include
# orig_title, budget_x, revenue and country). Selection is positional,
# so it relies on the CSV's column order.
df = df.drop(columns=df.columns[-6:])
# After that trim, the column at position 1 is the release date; drop it too.
df = df.drop(columns=df.columns[1])
df.head()

Unnamed: 0,names,score,genre,overview,crew
0,Creed III,73.0,"Drama, Action","After dominating the boxing world, Adonis Cree...","Michael B. Jordan, Adonis Creed, Tessa Thompso..."
1,Avatar: The Way of Water,78.0,"Science Fiction, Adventure, Action",Set more than a decade after the events of the...,"Sam Worthington, Jake Sully, Zoe Saldaña, Neyt..."
2,The Super Mario Bros. Movie,76.0,"Animation, Adventure, Family, Fantasy, Comedy","While working underground to fix a water main,...","Chris Pratt, Mario (voice), Anya Taylor-Joy, P..."
3,Mummies,70.0,"Animation, Comedy, Family, Adventure, Fantasy","Through a series of unfortunate events, three ...","Óscar Barberán, Thut (voice), Ana Esther Albor..."
4,Supercell,61.0,Action,Good-hearted teenager William always lived in ...,"Skeet Ulrich, Roy Cameron, Anne Heche, Dr Quin..."


In [18]:
# Keep only the rows in which every column has a value, then confirm
# that the 'genre' column has no missing entries left.
df = df.dropna()
print(df['genre'].isnull().sum())


0


In [19]:
# Column-level dtype of 'genre' (pandas reports plain Python strings as 'object').
print(df['genre'].dtype)
# Python type of the first remaining entry, taken by POSITION. The previous
# df['genre'][1] was a label lookup: it returned the row labelled 1 (not the
# first row) and would raise KeyError if dropna() had removed that row.
print(type(df['genre'].iloc[0]))  # Assuming the first entry in the 'genre' column is representative


object
<class 'str'>


### Preprocess data

In [20]:
# Load spaCy's "en_core_web_sm" pipeline once at module level; preprocess()
# calls this object for both tokenizing and lemmatizing.
nlp = spacy.load("en_core_web_sm")

def preprocess(text):
    """Normalize a piece of text into a space-separated string of lemmas.

    Steps:
      1. Tokenize ``text`` with the module-level spaCy pipeline ``nlp`` and
         lowercase every token.
      2. Drop tokens listed in ``STOP_WORDS`` (e.g. "the", "a", "and"),
         tokens made up entirely of punctuation characters, and
         whitespace-only tokens.
      3. Re-parse the surviving words and replace each with its lemma
         (e.g. "boxing" -> "box").

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The lemmas joined by single spaces.
    """
    def is_noise(word):
        # ``word in string.punctuation`` is a *substring* test on a str, so it
        # misses multi-character punctuation tokens such as "..." or "--".
        # Check every character instead, and also reject whitespace-only
        # tokens (e.g. "\n"), which would otherwise end up in the output.
        return not word.strip() or all(ch in string.punctuation for ch in word)

    # Each word becomes a lowercase token.
    lowered = [token.text.lower() for token in nlp(text)]

    # Remove stopwords, punctuation and whitespace tokens.
    kept = [word for word in lowered if word not in STOP_WORDS and not is_noise(word)]

    # Second spaCy pass over the filtered words to obtain their lemmas.
    lemma_tokens = [token.lemma_ for token in nlp(" ".join(kept))]

    # Collapse the lemmas into a single string.
    return " ".join(lemma_tokens)

# Build one cleaned-text column per raw text column by running preprocess()
# over every row: overview -> preprocessed_synopsis, genre -> preprocessed_genre.
for raw_column, cleaned_column in (
    ('overview', 'preprocessed_synopsis'),
    ('genre', 'preprocessed_genre'),
):
    df[cleaned_column] = df[raw_column].apply(preprocess)
df.head()

Unnamed: 0,names,score,genre,overview,crew,preprocessed_synopsis,preprocessed_genre,tokenized_crew
0,Creed III,73.0,"Drama, Action","After dominating the boxing world, Adonis Cree...","Michael B. Jordan, Adonis Creed, Tessa Thompso...",dominate box world adonis creed thrive career ...,drama action,"[Michael B. Jordan, Adonis Creed, Tessa Thomps..."
1,Avatar: The Way of Water,78.0,"Science Fiction, Adventure, Action",Set more than a decade after the events of the...,"Sam Worthington, Jake Sully, Zoe Saldaña, Neyt...",set decade event film learn story sully family...,science fiction adventure action,"[Sam Worthington, Jake Sully, Zoe Saldaña, Ney..."
2,The Super Mario Bros. Movie,76.0,"Animation, Adventure, Family, Fantasy, Comedy","While working underground to fix a water main,...","Chris Pratt, Mario (voice), Anya Taylor-Joy, P...",work underground fix water main brooklyn plumb...,animation adventure family fantasy ...,"[Chris Pratt, Mario (voice), Anya Taylor-Joy, ..."
3,Mummies,70.0,"Animation, Comedy, Family, Adventure, Fantasy","Through a series of unfortunate events, three ...","Óscar Barberán, Thut (voice), Ana Esther Albor...",series unfortunate event mummy end present day...,animation comedy family adventure ...,"[Óscar Barberán, Thut (voice), Ana Esther Albo..."
4,Supercell,61.0,Action,Good-hearted teenager William always lived in ...,"Skeet Ulrich, Roy Cameron, Anne Heche, Dr Quin...",good hearted teenager william live hope follow...,action,"[Skeet Ulrich, Roy Cameron, Anne Heche, Dr Qui..."


In [None]:
# Sanity-check that every score lies inside the closed range [0, 100].
scores = df['score']
lowest_score = scores.min()
assert lowest_score >= 0, "Scores must be non-negative."
highest_score = scores.max()
assert highest_score <= 100, "Scores must not exceed 100."

In [None]:
# Count how many rows repeat a title that already appeared earlier in 'names'.
duplicate_titles = df.duplicated(subset='names', keep='first').sum()

# Keep the first occurrence of every title. drop_duplicates() returns a new
# frame even when nothing is removed, so no separate "no duplicates" branch
# (which only copied the frame) is needed.
df = df.drop_duplicates(subset='names', keep='first')

duplicate_titles, df.shape