In [14]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack
import pickle
import gzip



In [15]:
df = pd.read_csv(r'C:\Personal\Uni\CS\Programming with Python\ML-Based\Cine-API\data\df_v4.csv',encoding='utf-8')
df.head()

Unnamed: 0,id,original_title,cast,director,keywords,runtime,genres,vote_average,release_year
0,135397,jurassic world,Chris Pratt Bryce Dallas Howard Irrfan Khan Vi...,Colin Trevorrow,monster dna tyrannosaurus rex velociraptor island,124,Action Adventure Science Fiction Thriller,6.5,2015
1,76341,mad max fury road,Tom Hardy Charlize Theron Hugh Keays-Byrne Nic...,George Miller,future chase post-apocalyptic dystopia australia,120,Action Adventure Science Fiction Thriller,7.1,2015
2,262500,insurgent,Shailene Woodley Theo James Kate Winslet Ansel...,Robert Schwentke,based on novel revolution dystopia sequel dyst...,119,Adventure Science Fiction Thriller,6.3,2015
3,140607,star wars the force awakens,Harrison Ford Mark Hamill Carrie Fisher Adam D...,J.J. Abrams,android spaceship jedi space opera 3d,136,Action Adventure Science Fiction Fantasy,7.5,2015
4,168259,furious 7,Vin Diesel Paul Walker Jason Statham Michelle ...,James Wan,car race speed revenge suspense car,137,Action Crime Thriller,7.3,2015


MODEL v1.0.0


In [16]:
# Combine the selected text columns into a single 'content' column
df['content'] = df['original_title'] + ' ' + df['keywords'] + ' ' + df['genres'] + ' ' + df['director'] + ' ' + df['cast']


In [17]:
# Use TF-IDF Vectorizer to convert 'content' text into numerical vectors
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df['content'])

# Calculate the cosine similarity between movies based on their content
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)


In [18]:
# Save the model (TF-IDF vectorizer and cosine similarities) as a pickle file
model_data = {
    'cosine_similarities': cosine_similarities
}

with gzip.open('model/content_based_model_v1.1.pkl.gz', 'wb') as model_file:
    pickle.dump(model_data, model_file)

MODEL v2.0.0


In [19]:
# Combine the selected text columns into a single 'content2' column
df['content2'] = df['original_title'] + ' ' + df['keywords'] + ' ' + df['genres'] + ' ' + df['director']+ ' ' + df['cast']

# Use TF-IDF Vectorizer to convert 'content2' text into numerical vectors
tfidf_vectorizer2 = TfidfVectorizer(stop_words='english')
tfidf_matrix_text2 = tfidf_vectorizer2.fit_transform(df['content2'])


In [20]:
# Scale and include all numerical features


numerical_features = df[['runtime', 'vote_average', 'release_year']]
scaler = StandardScaler()
numerical_features_scaled = scaler.fit_transform(numerical_features)
tfidf_matrix_combined = hstack([tfidf_matrix_text2, numerical_features_scaled])



In [21]:
# Calculate the cosine similarity between movies based on combined features
cosine_similarities2 = linear_kernel(tfidf_matrix_combined, tfidf_matrix_combined)


In [22]:
# Save the model (TF-IDF vectorizer, scaler, and cosine similarities) as a pickle file
model_data = {
    'cosine_similarities': cosine_similarities2
}

with gzip.open('model/content_based_model_v2.pkl.gz', 'wb') as model_file:
    pickle.dump(model_data, model_file)

MODEL v2.1.0

In [23]:
# Combine the selected text columns into a single 'content2' column
df['content2'] = df['original_title'] + ' ' + df['keywords'] + ' ' + df['genres'] + ' ' + df['director'] + ' ' + df['cast']

# Use TF-IDF Vectorizer to convert 'content2' text into numerical vectors
tfidf_vectorizer2 = TfidfVectorizer(stop_words='english')
tfidf_matrix_text2 = tfidf_vectorizer2.fit_transform(df['content2'])

# Scale numerical features
numerical_features = df[['runtime', 'vote_average', 'release_year']]
scaler = StandardScaler()
numerical_features_scaled = scaler.fit_transform(numerical_features)

# Adjust weights for combining features
text_weight = 3  # Can adjust this weight based on the importance you want to give to text features




In [24]:
# Multiply the tf-idf matrix by the weight before combining
tfidf_matrix_combined = hstack([tfidf_matrix_text2 * text_weight, numerical_features_scaled])

# Calculate the cosine similarity between movies based on combined features
cosine_similarities2 = linear_kernel(tfidf_matrix_combined, tfidf_matrix_combined)


MODAL v2.2.0

In [25]:

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack
import pickle
import gzip

df = pd.read_csv('data/df_v3.csv')

# Combine the selected text columns into a single 'content2' column
df['content2'] = df['original_title'] + ' ' + df['keywords'] + ' ' + df['genres'] + ' ' + df['director'] + ' ' + df['cast']

# Use TF-IDF Vectorizer to convert 'content2' text into numerical vectors
tfidf_vectorizer2 = TfidfVectorizer(stop_words='english')
tfidf_matrix_text2 = tfidf_vectorizer2.fit_transform(df['content2'])

# Include 'original_title' in the TF-IDF features
original_title_tfidf = tfidf_vectorizer2.transform(df['original_title'])

# Scale numerical features
numerical_features = df[['runtime', 'vote_average', 'release_year']]
scaler = StandardScaler()
numerical_features_scaled = scaler.fit_transform(numerical_features)

# Define weights for each column
column_weights = {
    'text': 2,  # Weight for text features
    'original_title': 1.8,  # Weight for 'original_title'
    'keywords': 1,  # Weight for 'keywords'
    'genres': 1,  # Weight for 'genres'
    'director': 1,  # Weight for 'director'
    'cast': 1,  # Weight for 'cast'
    'runtime': 0.5,  # Weight for 'runtime'
    'vote_average': 0.5,  # Weight for 'vote_average'
    'release_year': 0.5  # Weight for 'release_year'
}

# Apply weights to each text feature
weighted_text_matrices = {
    'original_title': original_title_tfidf * column_weights['original_title'],
    'keywords': tfidf_matrix_text2[:, tfidf_vectorizer2.vocabulary_.get('keywords', 0)] * column_weights['text'],
    'genres': tfidf_matrix_text2[:, tfidf_vectorizer2.vocabulary_.get('genres', 0)] * column_weights['text'],
    'director': tfidf_matrix_text2[:, tfidf_vectorizer2.vocabulary_.get('director', 0)] * column_weights['text'],
    'cast': tfidf_matrix_text2[:, tfidf_vectorizer2.vocabulary_.get('cast', 0)] * column_weights['text'],
}

# Combine the text features
weighted_text_matrix = hstack([weighted_text_matrices[col] for col in weighted_text_matrices])

# Apply weights to each numerical feature
weighted_numerical_matrix = numerical_features_scaled * column_weights['runtime']

# Combine all features
tfidf_matrix_combined = hstack([weighted_text_matrix, weighted_numerical_matrix])

# Calculate the cosine similarity between movies based on combined features
cosine_similarities2 = linear_kernel(tfidf_matrix_combined, tfidf_matrix_combined)

# Save the model (TF-IDF vectorizer, scaler, and cosine similarities) as a pickle file
model_data = {
    'cosine_similarities': cosine_similarities2,
    'tfidf_vectorizer': tfidf_vectorizer2,
    'scaler': scaler,
    'column_weights': column_weights
}

with gzip.open('model/content_based_model_v2.pkl.gz', 'wb') as model_file:
    pickle.dump(model_data, model_file)


FileNotFoundError: [Errno 2] No such file or directory: 'data/df_v3.csv'

In [None]:
# Combine the selected text columns into a single 'content2' column
df['content2'] = df['original_title'] + ' ' + df['keywords'] + ' ' + df['genres'] + ' ' + df['director'] + ' ' + df['cast']

# Use TF-IDF Vectorizer to convert 'content2' text into numerical vectors
tfidf_vectorizer2 = TfidfVectorizer(stop_words='english')
tfidf_matrix_text2 = tfidf_vectorizer2.fit_transform(df['content2'])

# Include 'original_title' in the TF-IDF features
original_title_tfidf = tfidf_vectorizer2.transform(df['original_title'])

# Scale numerical features
numerical_features = df[['runtime', 'vote_average', 'release_year']]
scaler = StandardScaler()
numerical_features_scaled = scaler.fit_transform(numerical_features)



In [None]:
# Print the vocabulary
print(tfidf_vectorizer2.vocabulary_)

{'jurassic': 6175, 'world': 12844, 'monster': 8047, 'dna': 3260, 'tyrannosaurus': 12157, 'rex': 9829, 'velociraptor': 12332, 'island': 5837, 'action': 128, 'adventure': 182, 'science': 10438, 'fiction': 4097, 'thriller': 11806, 'colin': 2390, 'trevorrow': 12054, 'chris': 2218, 'pratt': 9296, 'bryce': 1685, 'dallas': 2843, 'howard': 5532, 'irrfan': 5819, 'khan': 6367, 'vincent': 12404, 'onofrio': 8634, 'nick': 8402, 'robinson': 9962, 'mad': 7233, 'max': 7561, 'fury': 4418, 'road': 9945, 'future': 4422, 'chase': 2120, 'post': 9256, 'apocalyptic': 549, 'dystopia': 3522, 'australia': 773, 'george': 4584, 'miller': 7891, 'tom': 11903, 'hardy': 5112, 'charlize': 2115, 'theron': 11769, 'hugh': 5562, 'keays': 6291, 'byrne': 1792, 'nicholas': 8398, 'hoult': 5520, 'josh': 6115, 'helman': 5265, 'insurgent': 5762, 'based': 1007, 'novel': 8508, 'revolution': 9825, 'sequel': 10550, 'dystopic': 3523, 'robert': 9956, 'schwentke': 10434, 'shailene': 10611, 'woodley': 12820, 'theo': 11761, 'james': 5911

In [None]:
# Define weights for each column
column_weights = {
    'text': 2,  # Weight for text features
    'original_title': 1.8,  # Weight for 'original_title'
    'keywords': 1.5,  # Weight for 'keywords'
    'genres': 1.5,  # Weight for 'genres'
    'director': 0.8,  # Weight for 'director'
    'cast': 3,  # Weight for 'cast'
    'runtime': 0.5,  # Weight for 'runtime'
    'vote_average': 0.5,  # Weight for 'vote_average'
    'release_year': 0.3  # Weight for 'release_year'
}

# Apply weights to each text feature
weighted_text_matrices = {
    col: tfidf_matrix_text2[:, tfidf_vectorizer2.vocabulary_[col]] * column_weights['text']
    for col in ['original_title', 'keywords', 'genres', 'director', 'cast']
}



KeyError: 'original_title'

In [None]:
# Combine the text features
weighted_text_matrix = hstack([weighted_text_matrices[col] for col in weighted_text_matrices])

# Apply weights to each numerical feature
weighted_numerical_matrix = numerical_features_scaled * column_weights['runtime']

# Combine all features
tfidf_matrix_combined = hstack([weighted_text_matrix, weighted_numerical_matrix])

# Calculate the cosine similarity between movies based on combined features
cosine_similarities2 = linear_kernel(tfidf_matrix_combined, tfidf_matrix_combined)


Saving model to disk

In [None]:
# Save the model (TF-IDF vectorizer, scaler, and cosine similarities) as a pickle file
model_data = {
    'cosine_similarities': cosine_similarities2,
}

with gzip.open('model/content_based_model_v2.1.pkl.gz', 'wb') as model_file:
    pickle.dump(model_data, model_file)


Functions


In [None]:
from fuzzywuzzy import process, fuzz

In [None]:
def search_movie_title(query, movie_titles):
    # Extract all matches without a score cutoff
    matches = process.extract(query, movie_titles, scorer=fuzz.ratio)

    # Filter matches based on score cutoff
    matches = [match for match in matches if match[1] >= 60]

    # Return the best match or None
    if matches:
        return matches[0][0]  # Return only the match string
    else:
        return None



In [None]:
# TESTING SEARCH MOVIE TITLE
title = 'spd man'
matched_title = search_movie_title(title, df['original_title'])
print(matched_title)

In [None]:
def get_content_based_recommendations(movie_title, model_data, df):
    # making movie title to be lowercase
    movie_title = movie_title.lower()

    cosine_similarities = model_data['cosine_similarities']

    # Use fuzzy matching to find the closest match to the input movie title
    matched_title = search_movie_title(movie_title, df['original_title'])

    # Find the index of the given movie title in the dataframe
    movie_index = df[df['original_title'] == matched_title].index[0]
    # Calculate cosine similarities with other movies
    similar_movies = list(enumerate(cosine_similarities[movie_index]))
    similar_movies = sorted(similar_movies, key=lambda x: x[1], reverse=True)
    similar_movies = similar_movies[:5]  # Get the top 5 recommendations

    # Get movie names
    movie_names = [df.iloc[i[0]]['original_title'] for i in similar_movies]

    return movie_names

TESTING MODAL 1

In [None]:
with gzip.open('model/content_based_model_v1.1.pkl.gz', 'rb') as f:
    cosine_similarity_matrix = pickle.load(f)
    


In [None]:
# Assuming 'df' is your dataset, replace 'Inception' with the movie title you want recommendations for
movie_title = 'spiderman'
recommendations = get_content_based_recommendations(movie_title, cosine_similarity_matrix, df)

# Display the recommendations
print(f"Recommendations for '{movie_title}':")
for movie in recommendations:
    print(movie)

TESTING MODAL 2

In [None]:
with gzip.open('model/content_based_model_v2.pkl.gz', 'rb') as f:
    cosine_similarity_matrix2 = pickle.load(f)
    


In [None]:
# Assuming 'df' is your dataset, replace 'Inception' with the movie title you want recommendations for
movie_title = 'spiderman'
recommendations = get_content_based_recommendations(movie_title, cosine_similarity_matrix2, df)

# Display the recommendations
print(f"Recommendations for '{movie_title}':")
for movie in recommendations:
    print(movie)

NameError: name 'get_content_based_recommendations' is not defined

In [None]:
# # Replace 'content_based_model.pkl' with the actual path to your saved model file
# model_file_path = "model/content_based_model_v2.pkl"
# model_data = load_content_based_model(model_file_path)

# # Assuming 'df' is your dataset, replace 'Inception' with the movie title you want recommendations for
# movie_title = 'Titanic'
# recommendations = get_content_based_recommendations(movie_title, model_data, df)

# # Display the recommendations
# print(f"Recommendations for '{movie_title}':")
# if not recommendations:
#     print('No recommendations found')
# else:
#     for movie in recommendations:
#         print(movie)


In [None]:
"""
model_file_path = "model/content_based_model_v2.pkl"

seems to give overall better results.
i think it is because the model is trained on just the text while the other model is trained on the text and the numerical features. outlieres seem to be affecting the model.


"""

In [None]:
# Reading data
df = pd.read_csv('data/df_v2.csv')

# Importing model
with open('model\content_based_model_v1.pkl', 'rb') as f:
    cosine_similarity_matrix = pickle.load(f)


def search_movie_title(query, movie_titles):
    # Extract all matches without a score cutoff
    matches = process.extract(query, movie_titles, scorer=fuzz.ratio)

    # Filter matches based on score cutoff
    matches = [match for match in matches if match[1] >= 60]

    # Return the best match or None
    if matches:
        return matches[0][0]  # Return only the match string
    else:
        raise Exception('No matches found')

def get_content_based_recommendations(movie_title, model_data, df):
    # making movie title to be lowercase
    movie_title = movie_title.lower()

    cosine_similarities = model_data['cosine_similarities']

    # Use fuzzy matching to find the closest match to the input movie title
    matched_title = search_movie_title(movie_title, df['original_title'])
    
    print(matched_title)

    # Find the index of the given movie title in the dataframe
    movie_index = df[df['original_title'] == matched_title].index[0]
    # Calculate cosine similarities with other movies
    similar_movies = list(enumerate(cosine_similarities[movie_index]))
    similar_movies = sorted(similar_movies, key=lambda x: x[1], reverse=True)
    similar_movies = similar_movies[:5]  # Get the top 5 recommendations

    # Get movie names
    movie_names = [df.iloc[i[0]]['original_title'] for i in similar_movies]

    return movie_names



In [None]:

movie_title = 'spiderman'

recommendations = get_content_based_recommendations(movie_title, cosine_similarity_matrix, df)

# Display the recommendations
print(f"Recommendations for '{movie_title}':")
for movie in recommendations:
    print(movie)