In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import pandas as pd
import json

file_path = '/content/drive/My Drive/complete-movies.json'

try:
    with open(file_path, 'r', encoding='utf8') as f:
        data = json.loads(f.read())
        df = pd.json_normalize(data)
        print(df)
except ValueError as e:
    print(f"ValueError: {e}")
except json.JSONDecodeError as e:
    print(f"JSONDecodeError: {e}")

             id                               title  popularity  release_year  \
0          3924                             Blondie       5.636        1938.0   
1          6124                 Der Mann ohne Namen       1.434        1921.0   
2          8773                 L'Amour à vingt ans       5.028        1962.0   
3         25449  New World Disorder 9: Never Enough       1.550        2008.0   
4         31975      Sesame Street: Elmo Loves You!       0.004        2010.0   
...         ...                                 ...         ...           ...   
947684  1048544        '70 Remembering a Revolution       0.039        2010.0   
947685  1048545                               המרחק       0.889        1994.0   
947686  1048546                  A Significant Name       0.080        2021.0   
947687  1048547                         Abiturienti       1.452        2012.0   
947688  1048548                               日月潭之戀       0.590        1956.0   

        runtime            

In [5]:
print(df.columns.values)

['id' 'title' 'popularity' 'release_year' 'runtime' 'overview' 'language'
 'origin_country' 'genres' 'keywords' 'actors' 'directors'
 'background_url' 'image_url']


In [6]:
features = ['title','release_year','overview','genres','keywords','actors','directors']

In [7]:
df['release_year'].isnull().values.any()

True

In [8]:
for feature in features:
    df[feature].fillna("", inplace=True)

  df[feature].fillna("", inplace=True)


In [9]:
def combine_features(row):
    title_weight = 0.5
    language_weight = 1.0
    origin_country_weight = 1.0
    genres_weight = 3.0
    keywords_weight = 3.0
    actors_weight = 3.0
    directors_weight = 2.0

    combined_features = (
        (row['title'] + " ") * int(title_weight * 10) +
        (row['language'] + " ") * int(language_weight * 10) +
        (row['origin_country'] + " ") * int(origin_country_weight * 10) +
        (row['genres'] + " ") * int(genres_weight * 10) +
        (row['keywords'] + " ") * int(keywords_weight * 10) +
        (row['actors'] + " ") * int(actors_weight * 10) +
        (row['directors'] + " ") * int(directors_weight * 10)
    )
    return combined_features

In [10]:
df = df.astype(str)
df['combined_features'] = df.apply(combine_features, axis = 1)

In [11]:
df = df.sort_values(by=['popularity'], ascending=False)

df = df[:50000]

df['index2'] = range(0, len(df))

In [12]:
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df['combined_features'])

In [13]:
cosine_sim = cosine_similarity(tfidf_matrix)
print(cosine_sim.shape)

(50000, 50000)


In [18]:
def get_title_from_index(index):
    return df[df.index2 == index]["title"].values[0]
def get_index_from_title(title):
    return df[df.title == title]["index2"].values[0]

In [19]:
movie_user_likes = "Inception"
movie_index = get_index_from_title(movie_user_likes)
#print(cosine_sim[0])
similar_movies = list(enumerate(cosine_sim[movie_index])) #accessing the row corresponding to given movie to find all the similarity scores for that movie and then enumerating over it
print(similar_movies)

[(0, 0.021149408414485597), (1, 0.02204686217274435), (2, 0.016784939711454165), (3, 0.020343294420856448), (4, 0.010689550920969057), (5, 0.0), (6, 0.032163427153382246), (7, 0.0), (8, 0.9999999999999998), (9, 0.02172103436338975), (10, 0.008357713662798914), (11, 0.029874241253431452), (12, 0.026506481721527815), (13, 0.018798713232274722), (14, 0.023067333714159165), (15, 0.009626374776932619), (16, 0.005239082758356784), (17, 0.022040498725837427), (18, 0.017047742859537366), (19, 0.02593654409215405), (20, 0.02951520398901907), (21, 0.021884268147767914), (22, 0.009067187215919498), (23, 0.03235012134562222), (24, 0.02791311848898923), (25, 0.01999980169752623), (26, 0.015189941509625242), (27, 0.005716376929860287), (28, 0.019643033767076164), (29, 0.0293143748033387), (30, 0.04564322613563369), (31, 0.013794587686107262), (32, 0.0), (33, 0.04973187221110189), (34, 0.0), (35, 0.030546930352782047), (36, 0.018159834424514527), (37, 0.009749721767066415), (38, 0.04248788977819685),

In [20]:
sorted_similar_movies = sorted(similar_movies,key=lambda x:x[1],reverse=True)[1:]
print(sorted_similar_movies)

[(1748, 0.1782535836268089), (2048, 0.16396766400739285), (38098, 0.15341185014739409), (8912, 0.12290299665958038), (8949, 0.10782328849554404), (11237, 0.10551484386650888), (30619, 0.09648519751418497), (49006, 0.09601278928016146), (29782, 0.09262257984236343), (4001, 0.08626835616188114), (3644, 0.08352455127359742), (1832, 0.083208455574475), (39523, 0.08084386039461063), (28106, 0.08049172126911362), (3242, 0.07829546040249886), (47715, 0.07823879272423415), (23243, 0.07714083971195537), (32375, 0.07685137718712697), (42610, 0.07662926370855151), (8771, 0.07533909295836508), (46356, 0.07410093781647403), (46971, 0.07392081847552931), (40992, 0.07336564894226988), (46833, 0.07305259009900575), (23292, 0.07291490733075562), (44921, 0.07229208794020822), (39943, 0.07159057690907085), (24616, 0.07133014849434119), (40352, 0.07080842619517302), (35002, 0.07058953699538839), (26618, 0.07048827237203066), (39807, 0.07048521696693398), (48593, 0.070252678228574), (40212, 0.0695860207599

In [21]:
i=0
print("Top 10 similar movies to "+movie_user_likes+" are:\n")
for element in sorted_similar_movies:
    print(get_title_from_index(element[0]))
    i=i+1
    if i>50:
        break

Top 10 similar movies to Inception are:

Inception: The Cobol Job
Dreams: Cinema of the Subconscious
兔侠传奇
The Dark Knight Rises
Batman Begins
Shackleton
Untitled Peaky Blinders Film
War Party
Dream Demon
The Matrix
Mon Clown
iCarly: iGo to Japan
Finish Line
Shelter
TalhotBlond
Batman Begins - Behind the Story
REM
An American Dream
Inside Christopher Nolan's Oppenheimer
Burn Your Maps
Graduation Night
Ice
Macbeth
Tsunami LA
The Grasshopper
Paint It Black
Rush Hour 3
Turbulence 2: Fear of Flying
Dare to Be Wild
Tarantella
7th Cavalry
The Forgotten
Joint Body
1917
Count Yorga, Vampire
The Illustrated Man
Young Eagles
Teenage Mutant Ninja Turtles: Enter Shredder
Stranger Than Fiction: The True Story of Whitey Bulger, Southie and 'The Departed'
Transformers: Rise of the Beasts 3
The Ganzfeld Haunting
Cabin Pressure
Transformers: Rise of the Beasts 2
Hulk vs. Wolverine
De Outro Sítio
Resonances
Galaxis
Easy Living
Live-Action Skibidi Toilet Film
London Road
Twice Upon a Time
