In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer  # Import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import string
from sklearn.decomposition import PCA
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.metrics.pairwise import cosine_similarity

### Clean csv

In [2]:
# Load the raw movie table
df = pd.read_csv("imdb_movies.csv")

# Drop the six trailing columns (such as orig_title, budget_x, revenue, country)
trailing_columns = df.columns[-6:]
df = df.drop(columns=trailing_columns)

# Drop the column at position 1 (the date)
df = df.drop(columns=df.columns[1])
df.head()

Unnamed: 0,names,score,genre,overview,crew
0,Creed III,73.0,"Drama, Action","After dominating the boxing world, Adonis Cree...","Michael B. Jordan, Adonis Creed, Tessa Thompso..."
1,Avatar: The Way of Water,78.0,"Science Fiction, Adventure, Action",Set more than a decade after the events of the...,"Sam Worthington, Jake Sully, Zoe Saldaña, Neyt..."
2,The Super Mario Bros. Movie,76.0,"Animation, Adventure, Family, Fantasy, Comedy","While working underground to fix a water main,...","Chris Pratt, Mario (voice), Anya Taylor-Joy, P..."
3,Mummies,70.0,"Animation, Comedy, Family, Adventure, Fantasy","Through a series of unfortunate events, three ...","Óscar Barberán, Thut (voice), Ana Esther Albor..."
4,Supercell,61.0,Action,Good-hearted teenager William always lived in ...,"Skeet Ulrich, Roy Cameron, Anne Heche, Dr Quin..."


In [3]:
# Keep only the rows in which every column has a value
df = df.dropna(how="any")

# Sanity check: number of missing genre values left after the drop
print(df['genre'].isna().sum())


0


### Preprocess data

In [4]:
# Shared spaCy pipeline used by preprocess()
nlp = spacy.load("en_core_web_sm")


def preprocess(text):
    """Normalise a piece of text into a space-separated string of lemmas.

    Steps:
    1. Parse ``text`` with spaCy and lower-case each token's text.
    2. Discard tokens found in ``STOP_WORDS`` (e.g. "the", "a", "and") and
       tokens that occur in ``string.punctuation``.
    3. Join the survivors, parse the joined string again and take each
       token's lemma (its dictionary form).

    Parameters:
    - text: The raw string to clean.

    Returns:
    - A single string with the lemmas joined by spaces.
    """
    kept_words = []
    for tok in nlp(text):
        word = tok.text.lower()
        if word in STOP_WORDS or word in string.punctuation:
            continue
        kept_words.append(word)

    # Second parse: lemmas are taken from the filtered, lower-cased text
    reparsed = nlp(" ".join(kept_words))
    return " ".join(tok.lemma_ for tok in reparsed)

# Derive a cleaned text column from each raw text column
for raw_col, clean_col in (
    ('overview', 'preprocessed_synopsis'),
    ('genre', 'preprocessed_genre'),
):
    df[clean_col] = df[raw_col].apply(preprocess)
df.head()

Unnamed: 0,names,score,genre,overview,crew,preprocessed_synopsis,preprocessed_genre
0,Creed III,73.0,"Drama, Action","After dominating the boxing world, Adonis Cree...","Michael B. Jordan, Adonis Creed, Tessa Thompso...",dominate box world adonis creed thrive career ...,drama action
1,Avatar: The Way of Water,78.0,"Science Fiction, Adventure, Action",Set more than a decade after the events of the...,"Sam Worthington, Jake Sully, Zoe Saldaña, Neyt...",set decade event film learn story sully family...,science fiction adventure action
2,The Super Mario Bros. Movie,76.0,"Animation, Adventure, Family, Fantasy, Comedy","While working underground to fix a water main,...","Chris Pratt, Mario (voice), Anya Taylor-Joy, P...",work underground fix water main brooklyn plumb...,animation adventure family fantasy ...
3,Mummies,70.0,"Animation, Comedy, Family, Adventure, Fantasy","Through a series of unfortunate events, three ...","Óscar Barberán, Thut (voice), Ana Esther Albor...",series unfortunate event mummy end present day...,animation comedy family adventure ...
4,Supercell,61.0,Action,Good-hearted teenager William always lived in ...,"Skeet Ulrich, Roy Cameron, Anne Heche, Dr Quin...",good hearted teenager william live hope follow...,action


### Vectorize synopses (TF-IDF)

In [5]:
# Learn a TF-IDF vocabulary from the cleaned synopses and encode every
# synopsis as one row of the resulting matrix
synopsis_corpus = df['preprocessed_synopsis']
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(synopsis_corpus)

### KMeans

In [6]:
# Cluster the TF-IDF synopsis vectors with k-means.
N_CLUSTERS = 5    # try different values
KMEANS_SEED = 42  # fixed seed so cluster labels are reproducible on re-run

kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=KMEANS_SEED)
kmeans.fit(tfidf_matrix)

# Record the cluster assigned to each movie (labels_ is in tfidf_matrix row order)
df['cluster_label'] = kmeans.labels_

### PCA

In [7]:
# Reduce the TF-IDF vectors to two principal components.
# random_state is fixed so the projection is reproducible when a randomized
# SVD solver is selected.
# NOTE: .toarray() materialises the full dense matrix, which can be
# memory-hungry for a large vocabulary.
pca = PCA(n_components=2, random_state=42)  # change n_components
reduced_features = pca.fit_transform(tfidf_matrix.toarray())

### LDA

In [8]:
# Fit a 10-topic LDA model on the synopsis matrix.
# random_state is fixed so the learned topics are reproducible on re-run.
# NOTE(review): LDA is conventionally fit on raw term counts rather than
# tf-idf weights — consider a CountVectorizer input; confirm before relying
# on these topics.
lda = LDA(n_components=10, random_state=42)  # change n_components to see
lda.fit(tfidf_matrix)

LatentDirichletAllocation()

### Movie Recommender Function

In [9]:
def recommend_movies(keyword, top_n=5):
    """Return the movies whose synopses best match a free-text keyword.

    The keyword is cleaned with ``preprocess``, encoded with the fitted
    ``vectorizer`` and compared with every row of ``tfidf_matrix`` by cosine
    similarity. Ranking is done over the whole corpus; the k-means cluster
    labels are not consulted.

    Parameters:
    - keyword: Free-text query string.
    - top_n: Maximum number of movies to return (default 5). Values below 1
      yield an empty result.

    Returns:
    - DataFrame with the columns 'names', 'score', 'genre' and 'overview',
      ordered from most to least similar. Movies with zero similarity are
      never returned, so the result may hold fewer than ``top_n`` rows (or
      none when no synopsis shares a term with the query).
    """
    result_columns = ['names', 'score', 'genre', 'overview']

    # Clean the query the same way the synopses were cleaned, then encode it
    keyword_vector = vectorizer.transform([preprocess(keyword)])

    # One similarity value per movie, in tfidf_matrix row order
    cosine_similarities = cosine_similarity(keyword_vector, tfidf_matrix).flatten()

    # Highest similarity first, capped at top_n
    top_n = max(int(top_n), 0)
    ranked = cosine_similarities.argsort()[::-1][:top_n]

    # A zero score means no shared vocabulary; such rows would be arbitrary picks
    ranked = ranked[cosine_similarities[ranked] > 0]

    # Positional lookup: tfidf_matrix rows follow df's row order, not its index labels
    return df.iloc[ranked][result_columns]



In [14]:
# Example query: show the recommendations returned for the keyword 'space'
recommend_movies('space')

Unnamed: 0,names,score,genre,overview
2798,Space Pirate Captain Harlock,66.0,"Animation, Science Fiction",Space Pirate Captain Harlock and his fearless ...
8828,Space Chimps,48.0,"Animation, Family, Adventure, Comedy, Science ...",Three chimps are sent into space to explore th...
9940,High Life,58.0,"Science Fiction, Drama, Mystery",A father and his daughter struggle to survive ...
6510,Lifeforce,63.0,"Horror, Science Fiction, Thriller",A space shuttle mission investigating Halley's...
3719,Gattaca,75.0,"Thriller, Science Fiction, Mystery, Romance",In a future society in the era of indefinite e...


### Evaluation

In [11]:
def evaluate_recommendation_system(query, ground_truth, recommend_function, k=5):
    """
    Evaluate the recommendation system for a given query using Precision@k and Recall@k.
    
    Only the first k rows returned by ``recommend_function`` are scored, so the
    metrics stay consistent with ``k`` even when the recommender returns more
    rows. The rows are assumed to be ordered best-first.
    
    Parameters:
    - query: The query keyword used for generating recommendations.
    - ground_truth: A dictionary mapping queries to lists of relevant movie titles.
    - recommend_function: Callable taking the query and returning a DataFrame
      with a 'names' column.
    - k: The number of top recommendations to consider for evaluation.
    
    Returns:
    - precision_k: Precision@k for the given query.
    - recall_k: Recall@k for the given query (0.0 when no relevant titles are listed).
    
    Raises:
    - ValueError: If k is not positive.
    - KeyError: If the query has no entry in ground_truth.
    """
    if k <= 0:
        raise ValueError("k must be a positive integer")

    # Generate recommendations and keep only the top k titles
    recommended_movies = recommend_function(query)
    recommended_titles = set(recommended_movies['names'].head(k).tolist())

    # Get the ground truth relevant movies for the query
    relevant_movies = set(ground_truth[query])

    # Count the recommended titles that are actually relevant
    num_relevant_recommendations = len(recommended_titles & relevant_movies)

    # Calculate Precision@k and Recall@k
    precision_k = num_relevant_recommendations / k
    recall_k = num_relevant_recommendations / len(relevant_movies) if relevant_movies else 0.0

    return precision_k, recall_k

# Example: score the recommender on one hand-labelled query
query = 'basketball'
ground_truth = {
    'basketball': [
        'Above the Rim',
        'The Way Back',
        'He Got Game',
        "Kuroko's Basketball the Movie: Last Game",
        'Triple Standard',
    ],  # Example ground truth
}

precision_k, recall_k = evaluate_recommendation_system(
    query, ground_truth, recommend_movies, k=5
)
print(f"Precision@5: {precision_k}")
print(f"Recall@5: {recall_k}")


Precision@5: 1.0
Recall@5: 1.0


In [12]:
# NOTE(review): this cell redefines evaluate_recommendation_system, which is
# already defined in an earlier evaluation cell; consider keeping one copy.
def evaluate_recommendation_system(query, ground_truth, recommend_function, k=5):
    """
    Evaluate the recommendation system for a given query using Precision@k and Recall@k.
    
    Only the first k rows returned by ``recommend_function`` are scored, so the
    metrics stay consistent with ``k`` even when the recommender returns more
    rows. The rows are assumed to be ordered best-first.
    
    Parameters:
    - query: The query keyword used for generating recommendations.
    - ground_truth: A dictionary mapping queries to lists of relevant movie titles.
    - recommend_function: Callable taking the query and returning a DataFrame
      with a 'names' column.
    - k: The number of top recommendations to consider for evaluation.
    
    Returns:
    - precision_k: Precision@k for the given query.
    - recall_k: Recall@k for the given query (0.0 when no relevant titles are listed).
    
    Raises:
    - ValueError: If k is not positive.
    - KeyError: If the query has no entry in ground_truth.
    """
    if k <= 0:
        raise ValueError("k must be a positive integer")

    # Generate recommendations and keep only the top k titles
    recommended_movies = recommend_function(query)
    recommended_titles = set(recommended_movies['names'].head(k).tolist())

    # Get the ground truth relevant movies for the query
    relevant_movies = set(ground_truth[query])

    # Count the recommended titles that are actually relevant
    num_relevant_recommendations = len(recommended_titles & relevant_movies)

    # Calculate Precision@k and Recall@k
    precision_k = num_relevant_recommendations / k
    recall_k = num_relevant_recommendations / len(relevant_movies) if relevant_movies else 0.0

    return precision_k, recall_k

# Example with a different label set: the first three titles differ from the
# other 'basketball' example in this notebook — presumably to check that the
# scores drop when the labels do not match the recommendations; confirm intent.
query = 'basketball'
ground_truth = {
    'basketball': [
        'Creed III',
        'Mummies',
        'Supercell',
        "Kuroko's Basketball the Movie: Last Game",
        'Triple Standard',
    ],  # Example ground truth
}

precision_k, recall_k = evaluate_recommendation_system(
    query, ground_truth, recommend_movies, k=5
)
print(f"Precision@5: {precision_k}")
print(f"Recall@5: {recall_k}")


Precision@5: 0.4
Recall@5: 0.4
