In [179]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import joblib
import ast

In [143]:
# Read the movie metadata table that every later cell works on.
# NOTE(review): the preview further down shows an 'Unnamed: 0' column, presumably
# a row index that was written into the CSV — confirm whether it is needed.
movies = pd.read_csv(filepath_or_buffer='../data/movies_dataset.csv')

In [144]:
# Convert the list-like columns from their CSV string form back to Python lists.
# Only strings are parsed, so re-running this cell on columns that were already
# converted is a no-op instead of a ValueError from ast.literal_eval, and missing
# values become an empty list so later cells can always iterate over the cell.
def _parse_list_cell(value):
    """Return `value` as a list: parse strings, copy lists/tuples, otherwise []."""
    if isinstance(value, str):
        # literal_eval only evaluates Python literals, never arbitrary code
        return ast.literal_eval(value)
    if isinstance(value, (list, tuple)):
        return list(value)
    return []

for col in ['genres', 'top_cast', 'keywords']:
    movies[col] = movies[col].apply(_parse_list_cell)

In [145]:
# Show the first record as a quick look at the loaded columns
movies.head(n=1)

Unnamed: 0,budget,id,original_language,revenue,runtime,title,vote_average,vote_count,top_cast,director,genres,keywords,release_year,profit,decade
0,30000000.0,862,en,373554033.0,81.0,Toy Story,7.7,5415.0,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...",John Lasseter,"[Animation, Comedy, Family]","[jealousy, toy, boy, friendship, friends, riva...",1995,343554033.0,1990


In [170]:
def _normalize_token(value):
    """Lower-case one value and join its words with underscores ('' if missing)."""
    if isinstance(value, str):
        return value.lower().replace(' ', '_')
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return ''
    # Non-string scalars (e.g. a numeric title) are stringified instead of crashing
    return str(value).lower().replace(' ', '_')


def _normalize_list(values, limit=None):
    """Normalise the first `limit` items of a list into one space-separated string."""
    if not isinstance(values, (list, tuple)):
        # A missing cell (NaN/None) contributes no tokens
        return ''
    return ' '.join(_normalize_token(item) for item in values[:limit])


def preprocess_movie_text(row, max_cast=3, max_keywords=5):
    """Combine a movie's text features into a single string for vectorisation.

    Every value is lower-cased and its spaces are replaced with underscores so
    that a multi-word name such as "Tom Hanks" stays together as "tom_hanks".

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys 'title', 'genres', 'top_cast', 'keywords' and
        'director'. The three list-valued fields may be missing (NaN/None).
    max_cast : int, default 3
        Number of leading entries of 'top_cast' to include.
    max_keywords : int, default 5
        Number of leading entries of 'keywords' to include.

    Returns
    -------
    str
        "<title> <genres> <cast> <keywords> <director>", each part separated by
        a single space; a missing part is left as an empty string.
    """
    title = _normalize_token(row['title'])
    genres = _normalize_list(row['genres'])
    cast = _normalize_list(row['top_cast'], max_cast)
    keywords = _normalize_list(row['keywords'], max_keywords)
    director = _normalize_token(row['director'])

    return f"{title} {genres} {cast} {keywords} {director}"

# Build the combined text feature for every movie, one row at a time
movies['nlp'] = movies.apply(func=preprocess_movie_text, axis='columns')

In [171]:
# Show the first record again, now including the new 'nlp' column
movies.head(n=1)

Unnamed: 0,budget,id,original_language,revenue,runtime,title,vote_average,vote_count,top_cast,director,genres,keywords,release_year,profit,decade,nlp
0,30000000.0,862,en,373554033.0,81.0,Toy Story,7.7,5415.0,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...",John Lasseter,"[Animation, Comedy, Family]","[jealousy, toy, boy, friendship, friends, riva...",1995,343554033.0,1990,toy_story animation comedy family tom_hanks ti...


In [172]:
# Print each of the first three titles next to its combined text feature
print("Example movie nlp:")
for position in range(3):
    sample = movies.iloc[position]
    print(f"\nMovie: {sample['title']}")
    print(f"NLP: {sample['nlp']}")

Example movie nlp:

Movie: Toy Story
NLP: toy_story animation comedy family tom_hanks tim_allen don_rickles jealousy toy boy friendship friends john_lasseter

Movie: Jumanji
NLP: jumanji adventure fantasy family robin_williams jonathan_hyde kirsten_dunst board_game disappearance based_on_children's_book new_home recluse joe_johnston

Movie: Waiting to Exhale
NLP: waiting_to_exhale comedy drama romance whitney_houston angela_bassett loretta_devine based_on_novel interracial_relationship single_mother divorce chick_flick forest_whitaker


In [173]:
# Learn the TF-IDF vocabulary from the combined text (English stop words
# removed) and vectorise every movie in the same pass
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(raw_documents=movies['nlp'])


In [174]:
# Report the matrix dimensions and the size of the learned vocabulary
matrix_shape = tfidf_matrix.shape
vocabulary_size = len(tfidf.get_feature_names_out())
print(f"TF-IDF Matrix shape: {matrix_shape}")
print(f"Number of features: {vocabulary_size}")

TF-IDF Matrix shape: (4940, 18151)
Number of features: 18151


In [151]:
# Rank vocabulary terms by their TF-IDF weight summed over all movies
feature_names = tfidf.get_feature_names_out()
# Column sums come back as a 1 x n_terms matrix; .A1 flattens it to a 1-D array
tfidf_scores = tfidf_matrix.sum(axis=0).A1
feature_importance = sorted(
    zip(feature_names, tfidf_scores),
    key=lambda pair: pair[1],
    reverse=True,
)

print("Top 20 TF-IDF features:")
for feature, score in feature_importance[:20]:
    print(f"{feature}: {score:.2f}")

Top 20 TF-IDF features:
drama: 188.19
comedy: 165.89
thriller: 147.52
action: 140.50
romance: 113.54
adventure: 110.86
crime: 105.81
science_fiction: 84.86
horror: 81.35
family: 78.56
fantasy: 74.22
mystery: 67.10
animation: 52.14
based_on_novel: 46.68
music: 36.23
war: 36.22
history: 34.15
independent_film: 29.12
sex: 26.83
duringcreditsstinger: 25.23


In [175]:
# Similarity of every movie's TF-IDF vector to every other movie's vector
cosine_sim = cosine_similarity(X=tfidf_matrix, Y=tfidf_matrix)

In [176]:
movies = movies.reset_index(drop=True)

# Title -> row position lookup. Series.drop_duplicates() compares the VALUES
# (row positions, which are always unique), so it never removed repeated titles
# and a duplicated title made indices[title] return a Series instead of one
# position. Deduplicate on the index labels instead, keeping the first movie
# that carries each title.
indices = pd.Series(movies.index, index=movies['title'])
indices = indices[~indices.index.duplicated(keep='first')]

def get_recommendations(title, cosine_sim=cosine_sim, n=5):
    """Return the `n` movies whose text features are most similar to `title`.

    Parameters
    ----------
    title : str
        Exact title to look up in `indices`; when several movies share a
        title, the first one in `movies` is used.
    cosine_sim : 2-D array
        Pairwise similarity matrix addressed by row position in `movies`.
        The default is bound to the global matrix when this cell is run.
    n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame or str
        The 'title', 'genres' and 'vote_average' columns of the recommended
        movies plus a 'similarity' column, most similar first; or a message
        string when `title` is not in the dataset.
    """
    if title not in indices:
        return f"Movie '{title}' not found in dataset."
    idx = indices[title]
    # Exclude the queried movie by position rather than assuming it sorts
    # first: another movie with an identical text profile also scores 1.0 and
    # could otherwise push the query itself into its own recommendations.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)
    sim_scores = sim_scores[:max(n, 0)]
    movie_indices = [position for position, _ in sim_scores]
    scores = [score for _, score in sim_scores]
    # Build a DataFrame with the results and similarity scores
    result = movies[['title', 'genres', 'vote_average']].iloc[movie_indices].copy()
    result['similarity'] = scores
    return result

In [178]:
# Smoke-test the recommender on one well-known title
test_movie = "The Dark Knight"
recommendations = get_recommendations(title=test_movie)
print(f"NLP-based recommendations for '{test_movie}':")
print(recommendations)

NLP-based recommendations for 'The Dark Knight':
               title                                         genres  \
2426   Batman Begins                         [Action, Crime, Drama]   
2689    The Prestige                     [Drama, Mystery, Thriller]   
442   Batman Returns                              [Action, Fantasy]   
202           Batman                              [Fantasy, Action]   
905         Superman  [Action, Adventure, Fantasy, Science Fiction]   

      vote_average  similarity  
2426           7.5    0.549407  
2689           8.0    0.296543  
442            6.6    0.179020  
202            7.0    0.169642  
905            6.9    0.167064  


In [180]:
# Persist the fitted TF-IDF vectorizer to disk
joblib.dump(value=tfidf, filename='../models/tfidf_vectorizer.joblib')

['../models/tfidf_vectorizer.joblib']

In [181]:
# Write the movie table, now including the combined 'nlp' column, to a new CSV
# (the row index is not written)
movies.to_csv(path_or_buf='../data/movies_with_nlp.csv', index=False)