In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity

### V4 - Final

In [2]:
movies_df_raw = pd.read_csv('../datasets/movies_metadata.csv')
keywords_df = pd.read_csv('../datasets/keywords.csv')

  movies_df_raw = pd.read_csv('../datasets/movies_metadata.csv')


In [3]:
movies_columns = ['id', 'title', 'genres', 'popularity', 'vote_average', 'vote_count']
movies_df = movies_df_raw[movies_columns].copy()

In [4]:
keywords_df['id'] = keywords_df['id'].astype(str)
movies_keywords_df = movies_df.merge(keywords_df[['id', 'keywords']], on='id', how='left')
movies_keywords_df.head()

Unnamed: 0,id,title,genres,popularity,vote_average,vote_count,keywords
0,862,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",21.946943,7.7,5415.0,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",17.015539,6.9,2413.0,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",11.7129,6.5,92.0,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",3.859495,6.1,34.0,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]",8.387519,5.7,173.0,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [5]:
def normalize_list_column(df, column_name):
    df[column_name] = df[column_name].apply(lambda x: '{}' if pd.isna(x) else x)
    df[column_name] = movies_keywords_df[column_name].apply(lambda x: ' '.join([i['name'] for i in eval(x)]))

    return df

In [6]:
movies_keywords_df.dropna(inplace=True)
movies_keywords_df = normalize_list_column(movies_keywords_df, 'genres')
movies_keywords_df = normalize_list_column(movies_keywords_df, 'keywords')
movies_keywords_df.head()

Unnamed: 0,id,title,genres,popularity,vote_average,vote_count,keywords
0,862,Toy Story,Animation Comedy Family,21.946943,7.7,5415.0,jealousy toy boy friendship friends rivalry bo...
1,8844,Jumanji,Adventure Fantasy Family,17.015539,6.9,2413.0,board game disappearance based on children's b...
2,15602,Grumpier Old Men,Romance Comedy,11.7129,6.5,92.0,fishing best friend duringcreditsstinger old men
3,31357,Waiting to Exhale,Comedy Drama Romance,3.859495,6.1,34.0,based on novel interracial relationship single...
4,11862,Father of the Bride Part II,Comedy,8.387519,5.7,173.0,baby midlife crisis confidence aging daughter ...


In [7]:
movies_keywords_df = movies_keywords_df[(movies_df['vote_average'] > 0) & (movies_keywords_df['vote_count'] > 100)]
movies_keywords_df.shape

  movies_keywords_df = movies_keywords_df[(movies_df['vote_average'] > 0) & (movies_keywords_df['vote_count'] > 100)]


(5785, 7)

In [8]:
def calc_vote_weight(df):
    C = movies_df['vote_average'].mean()
    m = movies_df['vote_count'].quantile(0.80)

    R = movies_df['vote_average']
    v = movies_df['vote_count']

    return (R * v + C * m) / (v + m)

def scale_features(df, columns):
    scaler = MinMaxScaler()
    return scaler.fit_transform(df[columns])

def calc_score(df):
    df['vote_weight'] = calc_vote_weight(df)
    df[['popularity', 'vote_weight']] = scale_features(df, ['popularity', 'vote_weight'])
    df['score'] = df['popularity'] * 0.6 + df['vote_weight'] * 0.4
    return df

In [9]:
movies_keywords_df = calc_score(movies_keywords_df)

In [10]:
movies_keywords_df['text'] = movies_keywords_df['title'] + ' ' + movies_keywords_df['genres'] + ' ' + movies_keywords_df['keywords']
movies_keywords_df.head()

Unnamed: 0,id,title,genres,popularity,vote_average,vote_count,keywords,vote_weight,score,text
0,862,Toy Story,Animation Comedy Family,0.040078,7.7,5415.0,jealousy toy boy friendship friends rivalry bo...,0.849378,0.363798,Toy Story Animation Comedy Family jealousy toy...
1,8844,Jumanji,Adventure Fantasy Family,0.03107,6.9,2413.0,board game disappearance based on children's b...,0.697805,0.297764,Jumanji Adventure Fantasy Family board game di...
4,11862,Father of the Bride Part II,Comedy,0.015311,5.7,173.0,baby midlife crisis confidence aging daughter ...,0.473852,0.198728,Father of the Bride Part II Comedy baby midlif...
5,949,Heat,Action Crime Drama Thriller,0.032731,7.7,1886.0,robbery detective bank obsession chase shootin...,0.842857,0.356782,Heat Action Crime Drama Thriller robbery detec...
6,11860,Sabrina,Comedy Romance,0.012187,6.2,141.0,paris brother brother relationship chauffeur l...,0.542605,0.224354,Sabrina Comedy Romance paris brother brother r...


In [11]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = ' '.join(text.split())

    return text

In [12]:
movies_keywords_df['text'] = movies_keywords_df['text'].apply(preprocess_text)
movies_keywords_df.head()

Unnamed: 0,id,title,genres,popularity,vote_average,vote_count,keywords,vote_weight,score,text
0,862,Toy Story,Animation Comedy Family,0.040078,7.7,5415.0,jealousy toy boy friendship friends rivalry bo...,0.849378,0.363798,toy story animation comedy family jealousy toy...
1,8844,Jumanji,Adventure Fantasy Family,0.03107,6.9,2413.0,board game disappearance based on children's b...,0.697805,0.297764,jumanji adventure fantasy family board game di...
4,11862,Father of the Bride Part II,Comedy,0.015311,5.7,173.0,baby midlife crisis confidence aging daughter ...,0.473852,0.198728,father of the bride part ii comedy baby midlif...
5,949,Heat,Action Crime Drama Thriller,0.032731,7.7,1886.0,robbery detective bank obsession chase shootin...,0.842857,0.356782,heat action crime drama thriller robbery detec...
6,11860,Sabrina,Comedy Romance,0.012187,6.2,141.0,paris brother brother relationship chauffeur l...,0.542605,0.224354,sabrina comedy romance paris brother brother r...


In [13]:
movies_keywords_df.reset_index(drop=True, inplace=True)

In [14]:
stop_words = set(stopwords.words('english'))
tfidf_vectorizer = TfidfVectorizer(stop_words=list(stop_words), ngram_range=(1, 2), max_features=10000)
tfidf_vectorizer_matrix = tfidf_vectorizer.fit_transform(movies_keywords_df['text'])

feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_vectorizer_df = pd.DataFrame.sparse.from_spmatrix(tfidf_vectorizer_matrix, columns=feature_names, index=movies_keywords_df['id'])
tfidf_vectorizer_df.head()

Unnamed: 0_level_0,aaron,aaron swartz,abandoned,abandoned mine,abduction,ability,abortion,abraham,abraham lincoln,abroad,...,zombie virus,zombie werewolf,zombie zombie,zombies,zombies romance,zombification,zone,zoo,zookeeper,zulu
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
862,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8844,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11862,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
949,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11860,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
similarity_scores = cosine_similarity(tfidf_vectorizer_matrix)

In [16]:
def find_similar_movies_score(movie_id, sim_scores, similarity_weight=0.7, max_results=5):
    movie_index = movies_keywords_df[movies_keywords_df['id'] == movie_id].index[0]
    similarity = sim_scores[movie_index]
    
    similarity_df = pd.DataFrame(similarity, index=movies_keywords_df.index, columns=['similarity'])
    
    final_df = movies_keywords_df.loc[similarity_df.index].copy()
    final_df['similarity'] = similarity_df['similarity']
    
    final_df['final_score'] = (final_df['score'] * (1 - similarity_weight) + final_df['similarity'] * similarity_weight)
    
    results = final_df.sort_values(by='final_score', ascending=False)
    results = results.iloc[1:max_results+1]
        
    return results[['id', 'title', 'similarity', 'score', 'final_score']]

### Resultados

In [17]:
movies_df[movies_df['id'] == '862']

Unnamed: 0,id,title,genres,popularity,vote_average,vote_count
0,862,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",21.946943,7.7,5415.0


In [18]:
find_similar_movies_score('862', similarity_scores)

Unnamed: 0,id,title,similarity,score,final_score
4933,256835,Toy Story That Time Forgot,0.576465,0.190972,0.460817
3680,10193,Toy Story 3,0.54156,0.224166,0.446342
1213,863,Toy Story 2,0.410603,0.234223,0.357689
4523,213121,Toy Story of Terror!,0.402396,0.189466,0.338517
705,11551,Small Soldiers,0.418589,0.128717,0.331627


In [19]:
movies_df[movies_df['id'] == '1726']

Unnamed: 0,id,title,genres,popularity,vote_average,vote_count
12588,1726,Iron Man,"[{'id': 28, 'name': 'Action'}, {'id': 878, 'na...",22.073099,7.4,8951.0


In [20]:
find_similar_movies_score('1726', similarity_scores)

Unnamed: 0,id,title,similarity,score,final_score
4388,68721,Iron Man 3,0.79598,0.220823,0.623433
3657,10138,Iron Man 2,0.694992,0.221311,0.552888
5660,76122,Marvel One-Shot: The Consultant,0.603672,0.189298,0.479359
4962,102899,Ant-Man,0.49162,0.213888,0.4083
5659,253980,Marvel One-Shot: All Hail the King,0.460199,0.190217,0.379204


In [21]:
# similarity_df = pd.DataFrame(
#     similarity_scores,
#     index=movies_keywords_df['id'],
#     columns=movies_keywords_df['id']
# )
# similarity_df.to_csv('../results/similarity_scores.csv')

In [22]:
# score_df = movies_keywords_df[['id', 'score']].copy()
# score_df.to_csv('../results/rating_scores.csv', index=False)