In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# Other cells in this notebook read and write files under '/content/drive/My Drive/'.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import json

# NOTE(review): this is the same path the export cells further down write to,
# while the feature-engineering cells expect columns such as 'title' and
# 'popularity'. Confirm this should not point at the movie metadata file instead.
file_path = '/content/drive/My Drive/similar_movies.json'

try:
    with open(file_path, 'r', encoding='utf8') as f:
        # json.load reads and parses the stream in one step.
        data = json.load(f)
    # Flatten the parsed JSON into a DataFrame (nested keys become dotted columns).
    df = pd.json_normalize(data)
    print(df)
except json.JSONDecodeError as e:
    # Must come before ValueError: JSONDecodeError is a ValueError subclass, so
    # with the handlers the other way round this branch could never run.
    print(f"JSONDecodeError: {e}")
except ValueError as e:
    print(f"ValueError: {e}")

                                                2062  \
0  [9806, 260513, 920, 236493, 164945, 1036996, 6...   

                                              214756  \
0  [72105, 659613, 655797, 312100, 361472, 375848...   

                                              677179  \
0  [1206994, 480530, 312221, 1246, 79129, 15017, ...   

                                              324786  \
0  [227306, 755529, 480465, 101859, 1222891, 2732...   

                                                1865  \
0  [143335, 43721, 274857, 166426, 81005, 285, 97...   

                                             1311031  \
0  [1311035, 1311034, 724334, 587873, 1054116, 38...   

                                              475557  \
0  [889737, 398978, 629542, 1175942, 1301259, 432...   

                                             1172648  \
0  [1086967, 899792, 1042232, 253283, 567735, 254...   

                                               27205  \
0  [64956, 334931, 173931, 49026, 272, 4

In [None]:
# Columns whose missing values are blanked out before text features are built.
features = [
    'title',
    'release_year',
    'overview',
    'genres',
    'keywords',
    'actors',
    'directors',
]

In [None]:
# Does the release_year column contain at least one missing value?
# (isna is the same check as isnull under its primary name.)
df['release_year'].isna().values.any()

True

In [None]:
# Replace missing values in every feature column with an empty string.
# Assigning the filled block back to the frame avoids the chained
# `df[col].fillna(..., inplace=True)` pattern, which operates on a temporary
# Series: pandas warns about it and, with Copy-on-Write enabled, the original
# frame is left unchanged.
df[features] = df[features].fillna("")

  df[feature].fillna("", inplace=True)


In [None]:
def combine_features(row):
    """Build one weighted text blob from a movie row.

    Each listed field is suffixed with a space and repeated
    ``int(weight * 10)`` times; the repeated chunks are concatenated in the
    order listed. The notebook later feeds the result to ``TfidfVectorizer``,
    so a field repeated more often contributes more term occurrences.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide string values under 'title', 'language',
        'origin_country', 'genres', 'keywords', 'actors' and 'directors'.

    Returns
    -------
    str
        The concatenated, weighted text.
    """
    # (field, weight) pairs; the repeat count is the weight scaled by 10.
    weighted_fields = (
        ('title', 0.5),
        ('language', 1.0),
        ('origin_country', 2.0),
        ('genres', 3.0),
        ('keywords', 3.0),
        ('actors', 3.0),
        ('directors', 2.0),
    )

    chunks = [
        (row[field] + " ") * int(weight * 10)
        for field, weight in weighted_fields
    ]
    return "".join(chunks)

In [None]:
# Every column read by combine_features. 'language' and 'origin_country' are
# not in the `features` list that was blank-filled earlier, so any NaN left in
# them would be turned into the literal text "nan" by astype(str) and then be
# repeated many times in the combined string. Blank them out first.
text_columns = ['title', 'language', 'origin_country', 'genres', 'keywords', 'actors', 'directors']
df[text_columns] = df[text_columns].fillna("")

# combine_features concatenates with "+", so every value must be a str.
df = df.astype(str)
df['combined_features'] = df.apply(combine_features, axis=1)

In [None]:
# Keep the 50,000 most popular movies.
# The frame was cast to str just before this cell, so a plain sort on
# 'popularity' would compare text ("99.5" > "1000.0"). Sort on the numeric
# value instead; unparseable values become NaN and are placed last.
df = df.sort_values(
    by='popularity',
    key=lambda col: pd.to_numeric(col, errors='coerce'),
    ascending=False,
)

# .copy() makes the truncated frame independent of the sorted one, so adding a
# column below does not raise SettingWithCopyWarning.
df = df.head(50000).copy()

# Positional id (0..n-1) in popularity order; the lookup helpers in this
# notebook key on it to address rows of the similarity matrix.
df['index2'] = range(len(df))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['index2'] = range(0, len(df))


In [None]:
# Show the column labels as a plain NumPy array.
print(df.columns.to_numpy())

['id' 'title' 'popularity' 'release_year' 'runtime' 'overview' 'language'
 'origin_country' 'genres' 'keywords' 'actors' 'directors'
 'background_url' 'image_url' 'combined_features' 'index2']


In [None]:
# Vectorise the weighted text of every movie with default TF-IDF settings.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(raw_documents=df['combined_features'])

In [None]:
# Pairwise cosine similarity between every pair of movies (rows of tfidf_matrix).
# NOTE(review): the result is a dense n x n array; at the recorded shape of
# (50000, 50000) that is presumably ~20 GB as float64, so confirm the runtime has
# enough RAM.
cosine_sim = cosine_similarity(X=tfidf_matrix)
print(cosine_sim.shape)

(50000, 50000)


In [None]:
def get_title_from_index(index):
    """Return the 'title' of the first row of the global ``df`` whose 'index2' equals ``index``.

    Raises IndexError when no row matches.
    """
    matching_titles = df.loc[df["index2"] == index, "title"]
    return matching_titles.values[0]
def get_index_from_title(title):
    """Return the 'index2' of the first row of the global ``df`` whose 'title' equals ``title``.

    If several rows share the title, the first one in frame order wins.
    Raises IndexError when no row matches.
    """
    matching_positions = df.loc[df["title"] == title, "index2"]
    return matching_positions.values[0]
def get_id_from_index(index2):
    """Return the 'id' of the first row of the global ``df`` whose 'index2' equals ``index2``.

    Raises IndexError when no row matches.
    """
    matching_ids = df.loc[df["index2"] == index2, "id"]
    return matching_ids.values[0]

In [None]:
movie_user_likes = "Inception"
movie_index = get_index_from_title(movie_user_likes)

# Row `movie_index` of the similarity matrix holds this movie's score against
# every movie; enumerate pairs each score with the other movie's index2.
similar_movies = list(enumerate(cosine_sim[movie_index]))

# Preview only: the full list has one tuple per movie in the frame.
print(similar_movies[:10])

[(0, 0.02130114663007396), (1, 0.02213246288197022), (2, 0.016899073325033574), (3, 0.02094310413031818), (4, 0.010791841320934114), (5, 0.0), (6, 0.032256160570389804), (7, 0.0), (8, 1.0000000000000002), (9, 0.02195390393386933), (10, 0.00852605061441699), (11, 0.029952583604135337), (12, 0.02662924996914462), (13, 0.01897255476974685), (14, 0.023175643837021514), (15, 0.010143124559107342), (16, 0.005231958564560581), (17, 0.02222505466711336), (18, 0.017168104123567063), (19, 0.026068388418954355), (20, 0.029656575382794807), (21, 0.021972654333702223), (22, 0.009230400141352754), (23, 0.03249505982317126), (24, 0.028093048726948722), (25, 0.020119875083557538), (26, 0.015337135241064101), (27, 0.005839620347032955), (28, 0.019768393272643302), (29, 0.02941871865694807), (30, 0.04571503150090607), (31, 0.014032148534376598), (32, 0.0), (33, 0.0498137227918167), (34, 0.0), (35, 0.030677061500409237), (36, 0.018312264009214447), (37, 0.009916053414531495), (38, 0.0425772436212606), (3

In [None]:
# Rank the other movies by similarity, best first.
# The query movie is removed by its index rather than by dropping the first
# sorted entry: a movie with identical features and a lower index ties with the
# query and would sort ahead of it, so slicing off position 0 could discard the
# wrong row.
sorted_similar_movies = sorted(
    (pair for pair in similar_movies if pair[0] != movie_index),
    key=lambda pair: pair[1],
    reverse=True,
)

# Preview only: the full list has one tuple per remaining movie.
print(sorted_similar_movies[:10])

[(1748, 0.17995616417377616), (2048, 0.1641969944183512), (38098, 0.15270124455526954), (8912, 0.12326906544903547), (8949, 0.10787722109433089), (11237, 0.10606949407488739), (30619, 0.09933908461039774), (49006, 0.09658705318239479), (29782, 0.09351295571113638), (4001, 0.08637827676220591), (1832, 0.08346539456034771), (3644, 0.083456650176185), (39523, 0.08123435849770988), (28106, 0.08036437944664508), (47715, 0.07874837064046022), (3242, 0.07848909961345178), (42610, 0.07778510878702777), (32375, 0.0770674475511054), (23243, 0.07698346828474019), (8771, 0.07550413988806753), (40992, 0.07488801205382567), (46356, 0.07428622960220342), (46971, 0.07409205059621014), (46833, 0.0734987283841385), (23292, 0.0731091596885282), (44921, 0.07243917481744827), (39943, 0.0717048497297589), (24616, 0.0714598206184408), (35002, 0.07089622294179704), (26618, 0.07074407089791901), (39807, 0.07062897970130534), (48593, 0.0706037301364955), (40352, 0.07030986655171624), (40212, 0.06969560404207568

In [None]:
# Number of recommendations to list. The header and the loop now share this one
# value; previously the header said "Top 10" while the loop printed 51 entries.
top_n = 50

print(f"Top {top_n} similar movies to {movie_user_likes} are:\n")
for similar_index, _score in sorted_similar_movies[:top_n]:
    print(get_title_from_index(similar_index))
    print(get_id_from_index(similar_index))

Top 10 similar movies to Inception are:

Inception: The Cobol Job
64956
Dreams: Cinema of the Subconscious
334931
兔侠传奇
173931
The Dark Knight Rises
49026
Batman Begins
272
Shackleton
429451
Untitled Peaky Blinders Film
875828
War Party
447436
Dream Demon
49343
The Matrix
603
iCarly: iGo to Japan
62459
Mon Clown
1038901
Finish Line
18787
Shelter
763762
Batman Begins - Behind the Story
1212370
TalhotBlond
159389
Inside Christopher Nolan's Oppenheimer
1152711
An American Dream
131861
REM
623183
Burn Your Maps
352504
Macbeth
133448
Graduation Night
729248
Ice
36979
Tsunami LA
773689
The Grasshopper
47646
Paint It Black
394692
Rush Hour 3
5174
Turbulence 2: Fear of Flying
76171
Tarantella
804706
7th Cavalry
43254
The Forgotten
10145
Joint Body
117944
Dare to Be Wild
348896
1917
530915
Count Yorga, Vampire
28733
The Illustrated Man
25874
Young Eagles
295848
Teenage Mutant Ninja Turtles: Enter Shredder
286102
Transformers: Rise of the Beasts 3
939347
Stranger Than Fiction: The True Story of W

In [None]:
# Number of neighbours stored per movie.
top_k = 50

# Build the index2 -> id lookup once. Calling get_id_from_index for every
# neighbour would scan the whole frame 50 times per movie.
positions = df['index2'].astype(int).tolist()
movie_ids = df['id'].tolist()
id_by_position = dict(zip(positions, movie_ids))

# Dictionary to store movie ID and top 50 similar movie IDs
similar_movies_dict = {}

# Find top 50 similar movies for each movie
for position, movie_id in zip(positions, movie_ids):
    scores = np.asarray(cosine_sim[position])

    # Stable argsort on the negated scores gives descending similarity with ties
    # kept in ascending index order, the same order a stable reverse sort of
    # enumerate(scores) produces.
    ranked = np.argsort(-scores, kind='stable')

    # Remove the movie itself by index; it is not guaranteed to be ranked first
    # (ties with identical rows, or an all-zero feature vector).
    neighbours = ranked[ranked != position][:top_k]

    similar_movies_dict[movie_id] = [id_by_position[j] for j in neighbours.tolist()]

    # Progress marker every 1000 movies.
    if position % 1000 == 0:
        print(position)

# Save the dictionary as a JSON file
json_file_path = '/content/drive/My Drive/similar_movies.json'
with open(json_file_path, 'w') as json_file:
    json.dump(similar_movies_dict, json_file)

print(f"Top {top_k} similar movies for each movie saved to {json_file_path}")

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
Top 50 similar movies for each movie saved to /content/drive/My Drive/similar_movies.json


In [None]:
# Re-shape the exported {movie id: [similar ids]} mapping into one JSON
# document per movie: {"ID": <movie id>, "similar_movies": [<similar ids>]}.
#
# The mapping is read back from the export file instead of melting the global
# `df`: when the notebook runs top to bottom, `df` is the movie metadata frame,
# and melting it would overwrite the export with column-name/value pairs.
json_file_path = '/content/drive/My Drive/similar_movies.json'

with open(json_file_path, 'r', encoding='utf8') as json_file:
    exported = json.load(json_file)

if isinstance(exported, dict):
    records = [
        {'ID': movie_id, 'similar_movies': neighbours}
        for movie_id, neighbours in exported.items()
    ]
else:
    # The file is already a list of records (this cell has been run before),
    # so re-running leaves the content unchanged.
    records = exported

df_melted2 = pd.DataFrame(records, columns=['ID', 'similar_movies'])

# Display the DataFrame
print(df_melted2.head())

# Convert DataFrame to list of dictionaries (JSON documents)
similar_movies_dict = df_melted2.to_dict(orient='records')

# Save the records as a JSON file
with open(json_file_path, 'w') as json_file:
    json.dump(similar_movies_dict, json_file, indent=2)