In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import json

file_path = '/content/drive/My Drive/complete-tv.json'

try:
    with open(file_path, 'r', encoding='utf8') as f:
        data = json.loads(f.read())
        df = pd.json_normalize(data)
        print(df)
except ValueError as e:
    print(f"ValueError: {e}")
except json.JSONDecodeError as e:
    print(f"JSONDecodeError: {e}")

            id                                              title  popularity  \
0            1                                               プライド      70.679   
1            2                                             Clerks     150.847   
2            3                                        The Message      21.368   
3            4                          The Amazing Mrs Pritchard      68.590   
4            5                                             La Job      17.123   
...        ...                                                ...         ...   
176844  260182        Nom d'un chien ! Le plus beau c'est le mien       0.680   
176845  260183  Looney Tunes Presents: Sports Talk With Bugs B...       1.400   
176846  260184                                              大汉的天空       1.021   
176847  260185                                   Schiphol Airport       1.400   
176848  260186                                       우당탕탕 하찮은 동창회       0.600   

                           

In [6]:
df.tail()

Unnamed: 0,id,title,popularity,overview,genres,language,origin_country,keywords,actors,directors,background_url,image_url,seasons,first_aired.$date,last_aired.$date,first_aired,last_aired
176844,260182,Nom d'un chien ! Le plus beau c'est le mien,0.68,,"[Reality, Family]",fr,[BE],[],"[Ludovic Daxhelet, Sandrine Dans, Jean-Françoi...",[],http://image.tmdb.org/t/p/original/ApZeLJuSqsy...,http://image.tmdb.org/t/p/original/mbIUbI5npIp...,"[{'season_number': 1, 'episodes': [{'title': '...",2022-10-15T00:00:00,2022-11-05T00:00:00,,
176845,260183,Looney Tunes Presents: Sports Talk With Bugs B...,1.4,Bugs Bunny meets his favorite athletes to lear...,"[Animation, Talk]",en,[US],[],[Eric Bauza],[],http://image.tmdb.org/t/p/original/1TenNT3NAbc...,http://image.tmdb.org/t/p/original/svXzRFIwmp2...,"[{'season_number': 1, 'episodes': [{'title': '...",2024-07-26T00:00:00,2024-07-27T00:00:00,,
176846,260184,大汉的天空,1.021,,[Documentary],zh,[CN],[],[],[],,http://image.tmdb.org/t/p/original/oAlGZNDDQTs...,"[{'season_number': 1, 'episodes': [{'title': '...",2024-06-26T00:00:00,2024-06-26T00:00:00,,
176847,260185,Schiphol Airport,1.4,,,nl,[NL],[],[Ewout Genemans],[],http://image.tmdb.org/t/p/original/2nKPBM20vLE...,,"[{'season_number': 1, 'episodes': [{'title': '...",2024-08-27T00:00:00,,,
176848,260186,우당탕탕 하찮은 동창회,0.6,"""The crazy reunion of friends who haven't seen...",,ko,[KR],[],[],[],,http://image.tmdb.org/t/p/original/sRTAWTAXYD0...,"[{'season_number': 1, 'episodes': [{'title': '...",2020-04-17T00:00:00,2020-04-17T00:00:00,,


In [1]:
features = ['title','genres','keywords','actors','directors','first_aired','language']

In [20]:
df['language'].isnull().values.any()

False

In [4]:
for feature in features:
    df[feature].fillna("", inplace=True)

  df[feature].fillna("", inplace=True)


In [5]:
def combine_features(row):
    title_weight = 0.5
    language_weight = 1.0
    first_aired_weight = 2.0
    genres_weight = 3.0
    keywords_weight = 3.0
    actors_weight = 3.0
    directors_weight = 2.0

    combined_features = (
        (row['title'] + " ") * int(title_weight * 10) +
        (row['language'] + " ") * int(language_weight * 10) +
        (row['first_aired'] + " ") * int(first_aired_weight * 10) +
        (row['genres'] + " ") * int(genres_weight * 10) +
        (row['keywords'] + " ") * int(keywords_weight * 10) +
        (row['actors'] + " ") * int(actors_weight * 10) +
        (row['directors'] + " ") * int(directors_weight * 10)
    )
    return combined_features

In [6]:
df = df.astype(str)
df['combined_features'] = df.apply(combine_features, axis = 1)

In [29]:
df.head()

Unnamed: 0,id,title,popularity,overview,genres,language,origin_country,keywords,actors,directors,background_url,image_url,seasons,first_aired.$date,last_aired.$date,first_aired,last_aired,combined_features,index2
29667,32058,Great British Railway Journeys,997.411,Michael Portillo takes to the tracks with a co...,['Documentary'],en,['GB'],['railroad'],['Michael Portillo'],['Ben Rowland'],http://image.tmdb.org/t/p/original/zetjn4dAGs3...,http://image.tmdb.org/t/p/original/h9XrDjo4y8W...,"[{'season_number': 1, 'episodes': [{'title': '...",2010-01-04T00:00:00,2024-04-05T00:00:00,,,Great British Railway Journeys Great British R...,0
36295,39297,Last Man Standing,994.219,A married father of three tries to maintain hi...,['Comedy'],en,['US'],"['marketing', 'family', 'sitcom', 'family come...","['Tim Allen', 'Nancy Travis', 'Hector Elizondo...",[],http://image.tmdb.org/t/p/original/3xU0vpr9QJY...,http://image.tmdb.org/t/p/original/ofUvbcwxcA0...,"[{'season_number': 1, 'episodes': [{'title': '...",2011-10-11T00:00:00,2021-05-20T00:00:00,,,Last Man Standing Last Man Standing Last Man S...,1
71433,82873,The Kelly Clarkson Show,992.534,Kelly Clarkson presents the biggest newsmakers...,['Talk'],en,['US'],"['talk show', 'singer']",['Kelly Clarkson'],[],http://image.tmdb.org/t/p/original/6BGQ39cH2hf...,http://image.tmdb.org/t/p/original/9Gg1oM8Us8g...,"[{'season_number': 1, 'episodes': [{'title': '...",2019-09-09T00:00:00,2024-06-21T00:00:00,,,The Kelly Clarkson Show The Kelly Clarkson Sho...,2
7371,7897,Cuéntame cómo pasó,992.233,Recounts the experiences of a middle-class fam...,"['Comedy', 'Drama']",es,['ES'],"['spain', 'madrid, spain', 'dictatorship', 'fa...","['Imanol Arias', 'Ana Duato', 'María Galiana',...",[],http://image.tmdb.org/t/p/original/gS67p2XWTpg...,http://image.tmdb.org/t/p/original/1OjgVLawOZz...,"[{'season_number': 1, 'episodes': [{'title': '...",2001-09-13T00:00:00,2023-11-29T00:00:00,,,Cuéntame cómo pasó Cuéntame cómo pasó Cuéntame...,3
2913,3051,Lost in Space,991.052,The space family Robinson is sent on a five-ye...,"['Sci-Fi & Fantasy', 'Comedy']",en,['US'],"['spacecraft', 'future', 'space', 'alien planet']","['Guy Williams', 'June Lockhart', 'Mark Goddar...",[],http://image.tmdb.org/t/p/original/upqfQF1LBPf...,http://image.tmdb.org/t/p/original/dGhtCpYZutu...,"[{'season_number': 1, 'episodes': [{'title': '...",1965-09-15T00:00:00,1968-03-06T00:00:00,,,Lost in Space Lost in Space Lost in Space Lost...,4


In [7]:
df = df.sort_values(by=['popularity'], ascending=False)

df = df[:50000]

df['index2'] = range(0, len(df))

In [25]:
print(df.columns.values)

['id' 'title' 'popularity' 'overview' 'genres' 'language' 'origin_country'
 'keywords' 'actors' 'directors' 'background_url' 'image_url' 'seasons'
 'first_aired.$date' 'last_aired.$date' 'first_aired' 'last_aired'
 'combined_features' 'index2']


In [12]:
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df['combined_features'])

In [13]:
cosine_sim = cosine_similarity(tfidf_matrix)
print(cosine_sim.shape)

(50000, 50000)


In [14]:
def get_title_from_index(index):
    return df[df.index2 == index]["title"].values[0]
def get_index_from_title(title):
    return df[df.title == title]["index2"].values[0]
def get_id_from_index(index2):
    return df[df.index2==index2]["id"].values[0]

In [15]:
tv_user_likes = "Lost in Space"
tv_index = get_index_from_title(tv_user_likes)
#print(cosine_sim[0])
similar_tvs = list(enumerate(cosine_sim[tv_index])) #accessing the row corresponding to given movie to find all the similarity scores for that movie and then enumerating over it
print(similar_tvs)

[(0, 0.0007917667976621143), (1, 0.011134782752442727), (2, 0.0007129244821236228), (3, 0.0038607145957263314), (4, 1.0000000000000004), (5, 0.05048278852036599), (6, 0.0077035187639989495), (7, 0.0), (8, 0.0), (9, 0.006483219100920593), (10, 0.004002777178961475), (11, 0.01533249710046999), (12, 0.0), (13, 0.01058885628408386), (14, 0.006492588099201234), (15, 0.02579632763673247), (16, 0.003699195247461184), (17, 0.0), (18, 0.0), (19, 0.006542782541299322), (20, 0.0052338655850944025), (21, 0.013826522458047245), (22, 0.04009958247368121), (23, 0.028613993965299882), (24, 0.0), (25, 0.03344668769352769), (26, 0.0016667321604049575), (27, 0.009684521207440676), (28, 0.006034772191860992), (29, 0.006032094854334051), (30, 0.0), (31, 0.0025387686264909167), (32, 0.0), (33, 0.0), (34, 0.0003780176014374229), (35, 0.00598335647665132), (36, 0.005707479666131581), (37, 0.0002583103801556059), (38, 0.0), (39, 0.0), (40, 0.0034430148186959573), (41, 0.0002985525447889906), (42, 0.0), (43, 0.

In [16]:
sorted_similar_tvs = sorted(similar_tvs,key=lambda x:x[1],reverse=True)[1:]
print(sorted_similar_tvs)

[(35314, 0.29251491314647005), (604, 0.2645877942070238), (28940, 0.2597846809479647), (12139, 0.23902500636764942), (20548, 0.23501418853150038), (45118, 0.22989340787737628), (1483, 0.20632781572091413), (8386, 0.2051792492098893), (33682, 0.2030924864518733), (17385, 0.20301799070583737), (18338, 0.19999722849916732), (3866, 0.19964440665012181), (36310, 0.19729481448333702), (49578, 0.19397548609305784), (30892, 0.1923432589179921), (32411, 0.19078016452923946), (212, 0.1888976774974504), (30848, 0.18232204729923746), (43508, 0.18232204729923746), (15496, 0.18051352579756993), (34833, 0.17941115497897434), (19227, 0.1781735189047752), (15541, 0.17615286624748014), (28560, 0.17604625359047016), (43474, 0.17434354406375982), (49815, 0.17427601780573368), (1913, 0.17305802635239606), (26992, 0.17275948867503743), (44906, 0.17262195950869183), (411, 0.17197625765398564), (20549, 0.1717825752705884), (24024, 0.17062914286373734), (23311, 0.16960265617623932), (44786, 0.169477328306197),

In [17]:
i=0
print("Top 50 similar tvs to "+tv_user_likes+" are:\n")
for element in sorted_similar_tvs:
    print(get_title_from_index(element[0]))
    print(get_id_from_index(element[0]))
    i=i+1
    if i>50:
        break

Top 50 similar tvs to Lost in Space are:

Far Out Space Nuts
3042
Star Trek: Voyager
1855
Dans une galaxie près de chez vous
13706
The Expanse
63639
Space Precinct
3129
TVography
36768
Star Trek: Enterprise
314
Born Naughty?
222317
Mark Williams' Big Bangs
28857
Aurora
203144
Mysteries of the Universe: Our Solar System
109312
Red Planet
16910
Futurama
615
Ｖ
243072
Other Space
62383
The Changes
9498
Star Trek: Deep Space Nine
580
白狐的人生
95421
特派幸福
118983
Star Wars Blips
105106
Duck Dodgers
216
Babylon 5
3137
Something Wicked
248350
The Fantastic Four
2373
Alien Highway
90278
Bobcat Goldthwait's Misfits & Monsters
80286
The Planets
6335
Star Trek
253
Haunted USA
123853
Earth 2
1473
The Whistler
22851
Silent Eye
117346
Under the Skin
116325
Captain Disillusion
221726
UFO
2560
Stranger in a Strange Land
71911
跨越千年来爱你
246554
Tales of the Tardis
238486
A Thousand Suns
251882
Trisha Goddard
359
Troopers: Animated
245017
Battlestar Galactica: Blood & Chrome
33240
美少女遊撃隊バトルスキッパー
96292
Beasts
520

In [18]:
# Dictionary to store movie ID and top 50 similar movie IDs
similar_tvs_dict = {}
i = 0
# Find top 50 similar movies for each movie
for index, row in df.iterrows():
    tv_id = row['id']
    tv_index = int(row['index2'])
    similar_tvs = list(enumerate(cosine_sim[tv_index]))
    sorted_similar_tvs= sorted(similar_tvs, key=lambda x: x[1], reverse=True)[1:51]
    similar_tv_ids = [get_id_from_index(element[0]) for element in sorted_similar_tvs]
    similar_tvs_dict[tv_id] = similar_tv_ids
    print(similar_tv_ids)
    if i == 5:
      break
    i+=1

    if tv_index%1000 == 0:
      print(tv_index)

# Save the dictionary as a JSON file
json_file_path = '/content/drive/My Drive/similar_tvs.json'
with open(json_file_path, 'w') as json_file:
    json.dump(similar_tvs_dict, json_file)

print(f"Top 50 similar tvs for each movie saved to {json_file_path}")

['251852', '157259', '102509', '156342', '73785', '93408', '37287', '74542', '15857', '76745', '18438', '128430', '58535', '111836', '92224', '195687', '195678', '64629', '194592', '87406', '86523', '85472', '65081', '30872', '17431', '130234', '194574', '195680', '20979', '195689', '216195', '157197', '98730', '117242', '18624', '4994', '195677', '255407', '135821', '14140', '137846', '105256', '235834', '215408', '52498', '73543', '109536', '47888', '100159', '229240']
0
['3791', '46796', '26690', '93470', '214620', '93708', '83038', '92172', '233351', '208063', '1146', '118733', '212999', '207136', '7995', '107422', '97193', '3392', '209941', '24137', '81841', '83136', '214720', '68345', '12403', '24401', '82679', '87296', '11618', '649', '46410', '100008', '10447', '20810', '70089', '25219', '14729', '225875', '595', '98770', '109056', '93670', '108971', '237391', '122438', '41633', '79786', '91701', '72592', '61714']
['213512', '59021', '70890', '26422', '212853', '42576', '64189'

In [40]:
# Convert DataFrame to the desired format
df = pd.DataFrame(similar_tvs_dict)
df_melted2 = df.melt(var_name='ID', value_name='similar_tvshows')

# Display the DataFrame
print(df_melted2.head())

# Convert DataFrame to list of dictionaries (JSON documents)
similar_tvs_dict = df_melted2.to_dict(orient='records')

# Save the dictionary as a JSON file
json_file_path = '/content/drive/My Drive/similar_tvs.json'
with open(json_file_path, 'w') as json_file:
    json.dump(similar_tvs_dict, json_file, indent=2)

      ID similar_tvshows
0  32058          251852
1  32058          157259
2  32058          102509
3  32058          156342
4  32058           73785


In [21]:
import json

# Load the original JSON file
with open('/content/drive/My Drive/similar_tvs (1).json', 'r') as json_file:
    similar_tvs_dict = json.load(json_file)

# Convert to the desired format
formatted_data = [
    {
        "_id": key,
        "similar_content": value
    } for key, value in similar_tvs_dict.items()
]

# Save the formatted data to a new JSON file
with open('/content/drive/My Drive/formatted_similar_tvs.json', 'w') as json_file:
    json.dump(formatted_data, json_file, indent=4)