In [2]:
!pip install pandas
!pip install tqdm

Collecting pandas
  Downloading pandas-2.2.0-cp39-cp39-win_amd64.whl (11.6 MB)
Collecting tzdata>=2022.7
  Using cached tzdata-2024.1-py2.py3-none-any.whl (345 kB)
Collecting numpy<2,>=1.22.4; python_version < "3.11"
  Downloading numpy-1.26.4-cp39-cp39-win_amd64.whl (15.8 MB)
Collecting pytz>=2020.1
  Using cached pytz-2024.1-py2.py3-none-any.whl (505 kB)
Installing collected packages: tzdata, numpy, pytz, pandas
Successfully installed numpy-1.26.4 pandas-2.2.0 pytz-2024.1 tzdata-2024.1


You should consider upgrading via the 'c:\users\admin\appdata\local\programs\python\python39\python.exe -m pip install --upgrade pip' command.


Collecting tqdm
  Using cached tqdm-4.66.2-py3-none-any.whl (78 kB)
Installing collected packages: tqdm
Successfully installed tqdm-4.66.2


You should consider upgrading via the 'c:\users\admin\appdata\local\programs\python\python39\python.exe -m pip install --upgrade pip' command.


In [66]:
import pandas as pd
import ast
from tqdm import tqdm

In [67]:
# Load the datasets
movies_df = pd.read_csv('tmdb_5000_movies.csv')
credits_df = pd.read_csv('tmdb_5000_credits.csv')

# Display the first few rows of each dataframe to understand their structure
movies_df.shape, credits_df.shape

((4803, 20), (4803, 4))

In [68]:
merged_df = pd.merge(movies_df, credits_df, left_on='id', right_on='movie_id')

In [69]:
# Load the newly uploaded datasets
keywords_df = pd.read_csv('keywords.csv')
links_df = pd.read_csv('links.csv')
movies_metadata_df = pd.read_csv('movies_metadata.csv', low_memory=False) # low_memory to avoid dtypes warning

# Display the first few rows of each new dataframe to understand their structure
keywords_df.shape, links_df.shape, movies_metadata_df.shape


((46419, 2), (45843, 3), (45466, 24))

In [70]:
# Merge the keywords_df with the previously merged dataset
merged_with_keywords_df = pd.merge(merged_df, keywords_df, on='id', how='left')

# Now merge the movies_metadata_df with the result, aligning on 'id'
# We need to ensure the 'id' columns are of the same type
merged_with_keywords_df['id'] = merged_with_keywords_df['id'].astype(str)
movies_metadata_df['id'] = movies_metadata_df['id'].astype(str)

final_merged_df = pd.merge(merged_with_keywords_df, movies_metadata_df, on='id', how='left', suffixes=('_left', '_right'))

# Display the shape of the final merged dataframe to understand the extent of merging
final_merged_df.shape, final_merged_df.columns

((4852, 48),
 Index(['budget_left', 'genres_left', 'homepage_left', 'id', 'keywords_x',
        'original_language_left', 'original_title_left', 'overview_left',
        'popularity_left', 'production_companies_left',
        'production_countries_left', 'release_date_left', 'revenue_left',
        'runtime_left', 'spoken_languages_left', 'status_left', 'tagline_left',
        'title_x', 'vote_average_left', 'vote_count_left', 'movie_id',
        'title_y', 'cast', 'crew', 'keywords_y', 'adult',
        'belongs_to_collection', 'budget_right', 'genres_right',
        'homepage_right', 'imdb_id', 'original_language_right',
        'original_title_right', 'overview_right', 'popularity_right',
        'poster_path', 'production_companies_right',
        'production_countries_right', 'release_date_right', 'revenue_right',
        'runtime_right', 'spoken_languages_right', 'status_right',
        'tagline_right', 'title', 'video', 'vote_average_right',
        'vote_count_right'],
       dt

In [6]:
# final_merged_df.to_csv('merged_movies.csv',index=False)

In [71]:
# Identify columns with '_left' and '_right' suffixes and prepare for comparison and potential consolidation
duplicate_columns = [col for col in final_merged_df.columns if '_left' in col or '_right' in col]
unique_columns = [col.replace('_left', '').replace('_right', '') for col in duplicate_columns]

# Create a dictionary to map unique column names to their '_left' and '_right' counterparts
column_mapping = {}
for unique_col in set(unique_columns):
    left_version = f"{unique_col}_left"
    right_version = f"{unique_col}_right"
    column_mapping[unique_col] = [left_version, right_version]

# Determine strategy for each pair: either keep one (if identical or one is preferable) or merge (if complementary)
# This will be a manual process based on column content. For simplicity, we'll initially assume to keep '_left' versions
# and drop '_right', unless inspection reveals a need for a different approach.

# Columns to keep without change (initially all '_left' versions and unique ones without such suffixes)
columns_to_keep = [col for col in final_merged_df.columns if '_right' not in col]

# Drop '_right' columns from the dataframe
cleaned_df = final_merged_df[columns_to_keep].copy()

# For demonstration, let's rename '_left' suffixes to remove them and clean up column names
cleaned_df.columns = [col.replace('_left', '') if '_left' in col else col for col in cleaned_df.columns]

# Display the cleaned dataframe structure
# cleaned_df.head(), cleaned_df.columns.tolist()


In [72]:
cleaned_df.columns,cleaned_df.shape

(Index(['budget', 'genres', 'homepage', 'id', 'keywords_x', 'original_language',
        'original_title', 'overview', 'popularity', 'production_companies',
        'production_countries', 'release_date', 'revenue', 'runtime',
        'spoken_languages', 'status', 'tagline', 'title_x', 'vote_average',
        'vote_count', 'movie_id', 'title_y', 'cast', 'crew', 'keywords_y',
        'adult', 'belongs_to_collection', 'imdb_id', 'poster_path', 'title',
        'video'],
       dtype='object'),
 (4852, 31))

In [73]:
remove_columns = ['keywords_y','title_y']
cleaned_df = cleaned_df.drop(remove_columns,axis=1)
cleaned_df.shape

(4852, 29)

In [50]:
# list(cleaned_df.columns)

In [74]:
def extract_data(col, df):
    genres_list = []
    for index,row in tqdm(df.iterrows()):
        genres_info = row[col]
        genres_list_row = ast.literal_eval(genres_info)
        all_genres_types = [i.get('name') for i in genres_list_row]
        genres_str = ','.join([str(elem) for elem in all_genres_types])
        genres_list.append(genres_str)
    return genres_list

In [75]:
cleaned_genres = extract_data('genres', cleaned_df)
cleaned_df['cleaned_genres'] = cleaned_genres

4852it [00:00, 10371.08it/s]


In [76]:
cleaned_keywords = extract_data('keywords_x', cleaned_df)
cleaned_df['cleaned_keywords'] = cleaned_keywords

4852it [00:00, 6450.09it/s]


In [77]:
cleaned_prod_comp = extract_data('production_companies', cleaned_df)
cleaned_df['cleaned_production_companies'] = cleaned_prod_comp

4852it [00:00, 9444.07it/s] 


In [78]:
cleaned_prod_countries = extract_data('production_countries', cleaned_df)
cleaned_df['cleaned_production_countries'] = cleaned_prod_countries

4852it [00:00, 11389.01it/s]


In [79]:
cleaned_spoken_languages = extract_data('spoken_languages', cleaned_df)
cleaned_df['cleaned_spoken_languages'] = cleaned_spoken_languages

4852it [00:00, 10711.02it/s]


In [80]:
get_character_name = extract_data('cast', cleaned_df)
cleaned_df['cleaned_cast'] = get_character_name

4852it [00:03, 1297.34it/s]


In [81]:
get_crew_name = extract_data('crew', cleaned_df)
cleaned_df['cleaned_crew'] = get_crew_name

4852it [00:04, 1172.53it/s]


In [21]:
# cleaned_df['cleaned_genres'] + ' ' + cleaned_df['cleaned_keywords'] + ' ' + cleaned_df['original_language']+ ' ' + cleaned_df['overview'] + ' ' + cleaned_df['cleaned_production_companies']

In [82]:
cleaned_df['combined_text'] = cleaned_df['cleaned_genres'] + ' ' + cleaned_df['cleaned_keywords'] + ' ' + cleaned_df['original_language']+ ' ' + cleaned_df['overview'] + ' ' + cleaned_df['cleaned_production_companies'] + ' ' +  cleaned_df['cleaned_production_countries'] + ' ' + cleaned_df['cleaned_spoken_languages'] + ' ' + cleaned_df['tagline'] + ' ' + cleaned_df['cleaned_cast'] + ' ' + cleaned_df['cleaned_crew']

In [21]:
# cleaned_df['cleaned_production_countries'].iloc[11]

In [22]:
# cleaned_df['combined_text'].iloc[11]

In [60]:
# from sentence_transformers import SentenceTransformer
# # model = SentenceTransformer('bert-base-nli-mean-tokens')
# multi_qa_model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

In [22]:
# # from sentence_transformers import SentenceTransformer
# model = SentenceTransformer('bert-base-nli-mean-tokens')

In [33]:
# cleaned_df['combined_text_str'] = cleaned_df['combined_text'].apply(lambda x: str(x))

In [35]:
# cleaned_df['combined_embeddings'] = cleaned_df['combined_text_str'].apply(lambda x: model.encode(x))

In [None]:
# embeddings_df = pd.DataFrame()
# embeddings_df = cleaned_df[['combined_embeddings','title']]

In [27]:
# embeddings_df.to_pickle("embeddings_df_bert.pkl")

In [83]:
unpickled_df = pd.read_pickle("embeddings_df_bert.pkl")
unpickled_df.shape

(4852, 2)

In [84]:
unpickled_df

Unnamed: 0,combined_embeddings,title
0,"[-0.5951478, 0.97025007, 0.03234572, 0.0785742...",Avatar
1,"[-0.3901741, 0.7123572, 0.723126, 0.5925892, 0...",Pirates of the Caribbean: At World's End
2,"[-0.08191218, 0.6235143, 0.40724906, -0.005524...",Spectre
3,"[-0.39101967, 0.26441473, 0.37008652, 0.158049...",The Dark Knight Rises
4,"[-0.69936043, 0.8608328, 0.40289634, 0.3395809...",John Carter
...,...,...
4847,"[-0.03972727, -0.07696694, 0.54768676, 0.19541...",El Mariachi
4848,"[-0.51017284, 0.97050416, 1.3249298, 0.3194456...",Newlyweds
4849,"[0.029933913, -0.15620214, 1.2514101, -0.29579...","Signed, Sealed, Delivered"
4850,"[-1.0525036, 0.90318936, 0.08746941, 0.2358224...",Shanghai Calling


In [85]:
# Function to get similar items based on combined name and category
def get_similar_items(combined_input, df, model, top_n=6 ):
    
    combined_embedding = model.encode(combined_input)
    
    # Calculate cosine similarity between the combined input and all other combined texts
    similarities = util.pytorch_cos_sim(combined_embedding, df['combined_embeddings'])
    
    #print(similarities)
    # Get the indices of top N similar items
    similar_indices = similarities.argsort(descending=True, axis=1)[0][:top_n]
    #print(similar_indices)
    # Retrieve the similar items from the DataFrame
    similar_items_df = df.iloc[similar_indices][['title']]
    #print(similar_items)
    return similar_items_df

In [86]:
cleaned_df

Unnamed: 0,budget,genres,homepage,id,keywords_x,original_language,original_title,overview,popularity,production_companies,...,title,video,cleaned_genres,cleaned_keywords,cleaned_production_companies,cleaned_production_countries,cleaned_spoken_languages,cleaned_cast,cleaned_crew,combined_text
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,Avatar,False,"Action,Adventure,Fantasy,Science Fiction","culture clash,future,space war,space colony,so...","Ingenious Film Partners,Twentieth Century Fox ...","United States of America,United Kingdom","English,Español","Sam Worthington,Zoe Saldana,Sigourney Weaver,S...","Stephen E. Rivkin,Rick Carter,Christopher Boye...","Action,Adventure,Fantasy,Science Fiction cultu..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,Pirates of the Caribbean: At World's End,False,"Adventure,Fantasy,Action","ocean,drug abuse,exotic island,east india trad...","Walt Disney Pictures,Jerry Bruckheimer Films,S...",United States of America,English,"Johnny Depp,Orlando Bloom,Keira Knightley,Stel...","Dariusz Wolski,Gore Verbinski,Jerry Bruckheime...","Adventure,Fantasy,Action ocean,drug abuse,exot..."
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,Spectre,False,"Action,Adventure,Crime","spy,based on novel,secret agent,sequel,mi6,bri...","Columbia Pictures,Danjaq,B24","United Kingdom,United States of America","Français,English,Español,Italiano,Deutsch","Daniel Craig,Christoph Waltz,Léa Seydoux,Ralph...","Thomas Newman,Sam Mendes,Anna Pinnock,John Log...","Action,Adventure,Crime spy,based on novel,secr..."
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.312950,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",...,The Dark Knight Rises,False,"Action,Crime,Drama,Thriller","dc comics,crime fighter,terrorist,secret ident...","Legendary Pictures,Warner Bros.,DC Entertainme...",United States of America,English,"Christian Bale,Michael Caine,Gary Oldman,Anne ...","Hans Zimmer,Charles Roven,Christopher Nolan,Ch...","Action,Crime,Drama,Thriller dc comics,crime fi..."
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",...,John Carter,False,"Action,Adventure,Science Fiction","based on novel,mars,medallion,space travel,pri...",Walt Disney Pictures,United States of America,English,"Taylor Kitsch,Lynn Collins,Samantha Morton,Wil...","Andrew Stanton,Andrew Stanton,John Lasseter,Co...","Action,Adventure,Science Fiction based on nove..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4847,220000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",,9367,"[{""id"": 5616, ""name"": ""united states\u2013mexi...",es,El Mariachi,El Mariachi just wants to play his guitar and ...,14.269792,"[{""name"": ""Columbia Pictures"", ""id"": 5}]",...,El Mariachi,False,"Action,Crime,Thriller","united states–mexico barrier,legs,arms,paper k...",Columbia Pictures,"Mexico,United States of America",Español,"Carlos Gallardo,Jaime de Hoyos,Peter Marquardt...","Robert Rodriguez,Robert Rodriguez,Robert Rodri...","Action,Crime,Thriller united states–mexico bar..."
4848,9000,"[{""id"": 35, ""name"": ""Comedy""}, {""id"": 10749, ""...",,72766,[],en,Newlyweds,A newlywed couple's honeymoon is upended by th...,0.642552,[],...,Newlyweds,False,"Comedy,Romance",,,,,"Edward Burns,Kerry Bishé,Marsha Dietlein,Caitl...","Edward Burns,Edward Burns,Edward Burns,William...","Comedy,Romance en A newlywed couple's honeymo..."
4849,0,"[{""id"": 35, ""name"": ""Comedy""}, {""id"": 18, ""nam...",http://www.hallmarkchannel.com/signedsealeddel...,231617,"[{""id"": 248, ""name"": ""date""}, {""id"": 699, ""nam...",en,"Signed, Sealed, Delivered","""Signed, Sealed, Delivered"" introduces a dedic...",1.444476,"[{""name"": ""Front Street Pictures"", ""id"": 3958}...",...,"Signed, Sealed, Delivered",False,"Comedy,Drama,Romance,TV Movie","date,love at first sight,narration,investigati...","Front Street Pictures,Muse Entertainment Enter...",United States of America,English,"Eric Mabius,Kristin Booth,Crystal Lowe,Geoff G...","Carla Hetland,Harvey Kahn,Adam Sliwinski,Marth...",
4850,0,[],http://shanghaicalling.com/,126186,[],en,Shanghai Calling,When ambitious New York attorney Sam is sent t...,0.857008,[],...,Shanghai Calling,False,,,,"United States of America,China",English,"Daniel Henney,Eliza Coupe,Bill Paxton,Alan Ruc...","Daniel Hsia,Daniel Hsia",en When ambitious New York attorney Sam is s...


In [87]:
# Inference is done

import warnings
warnings.filterwarnings('ignore')
infer_df = cleaned_df.head(1)

infer_cleaned_genres = extract_data('genres', infer_df)
infer_df['cleaned_genres'] = infer_cleaned_genres

infer_cleaned_keywords = extract_data('keywords_x', infer_df)
infer_df['cleaned_keywords'] = infer_cleaned_keywords

infer_cleaned_prod_comp = extract_data('production_companies', infer_df)
infer_df['cleaned_production_companies'] = infer_cleaned_prod_comp

infer_prod_countries = extract_data('production_countries', infer_df)
infer_df['cleaned_production_countries'] = infer_prod_countries

infer_spoken_languages = extract_data('spoken_languages', infer_df)
infer_df['cleaned_spoken_languages'] = infer_spoken_languages

infer_get_character_name = extract_data('cast', infer_df)
infer_df['cleaned_cast'] = infer_get_character_name

infer_get_crew_name = extract_data('crew', infer_df)
infer_df['cleaned_crew'] = infer_get_crew_name

infer_df['combined_text'] = infer_df['cleaned_genres'] + ' ' + infer_df['cleaned_keywords'] + ' ' + infer_df['original_language']+ ' ' + infer_df['overview'] + ' ' + infer_df['cleaned_production_companies'] + ' ' +  infer_df['cleaned_production_countries'] + ' ' + infer_df['cleaned_spoken_languages'] + ' ' + infer_df['tagline'] + ' ' + infer_df['cleaned_cast'] + ' ' + infer_df['cleaned_crew']

infer_df['combined_text_str'] = infer_df['combined_text'].apply(lambda x: str(x))

1it [00:00, ?it/s]
1it [00:00, ?it/s]
1it [00:00, ?it/s]
1it [00:00, 996.04it/s]
1it [00:00, ?it/s]
1it [00:00, 196.05it/s]
1it [00:00, 199.87it/s]


In [36]:
# infer_df['combined_text_str'].values[0]

"Action,Adventure,Fantasy,Science Fiction culture clash,future,space war,space colony,society,space travel,futuristic,romance,space,alien,tribe,alien planet,cgi,marine,soldier,battle,love affair,anti war,power relations,mind and soul,3d en In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. Ingenious Film Partners,Twentieth Century Fox Film Corporation,Dune Entertainment,Lightstorm Entertainment United States of America,United Kingdom English,Español Enter the World of Pandora. Sam Worthington,Zoe Saldana,Sigourney Weaver,Stephen Lang,Michelle Rodriguez,Giovanni Ribisi,Joel David Moore,CCH Pounder,Wes Studi,Laz Alonso,Dileep Rao,Matt Gerald,Sean Anthony Moran,Jason Whyte,Scott Lawrence,Kelly Kilgour,James Patrick Pitt,Sean Patrick Murphy,Peter Dillon,Kevin Dorman,Kelson Henderson,David Van Horn,Jacob Tomuri,Michael Blain-Rozgay,Jon Curry,Luke Hawker,Woody Schultz,

In [50]:
!pip install sentence_transformers



You should consider upgrading via the 'c:\users\admin\appdata\local\programs\python\python39\python.exe -m pip install --upgrade pip' command.


In [38]:
# # # Making Simillar Product Serach based on Input

# from sentence_transformers import util
# # Get similar items based on combined name and category
# product_info_to_search = infer_df['combined_text_str'].values[0]

# product_info_to_search

# # similar_items = get_similar_items(product_info_to_search, unpickled_df, multi_qa_model)


# # # Print the formatted output
# # # print(f"Product: {product_info_to_search}")
# # print("\nSimilar Movies:")
# # for idx, row in similar_items.iterrows():
# #     print(f"Product : {row['title']}")

"Action,Adventure,Fantasy,Science Fiction culture clash,future,space war,space colony,society,space travel,futuristic,romance,space,alien,tribe,alien planet,cgi,marine,soldier,battle,love affair,anti war,power relations,mind and soul,3d en In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. Ingenious Film Partners,Twentieth Century Fox Film Corporation,Dune Entertainment,Lightstorm Entertainment United States of America,United Kingdom English,Español Enter the World of Pandora. Sam Worthington,Zoe Saldana,Sigourney Weaver,Stephen Lang,Michelle Rodriguez,Giovanni Ribisi,Joel David Moore,CCH Pounder,Wes Studi,Laz Alonso,Dileep Rao,Matt Gerald,Sean Anthony Moran,Jason Whyte,Scott Lawrence,Kelly Kilgour,James Patrick Pitt,Sean Patrick Murphy,Peter Dillon,Kevin Dorman,Kelson Henderson,David Van Horn,Jacob Tomuri,Michael Blain-Rozgay,Jon Curry,Luke Hawker,Woody Schultz,

In [40]:
# # # Making Simillar Product Serach based on Input

# from sentence_transformers import util
# # Get similar items based on combined name and category
# product_info_to_search = infer_df['combined_text_str'].values[0]

# similar_items = get_similar_items(product_info_to_search, unpickled_df)


# # Print the formatted output
# # print(f"Product: {product_info_to_search}")
# print("\nSimilar Movies:")
# for idx, row in similar_items.iterrows():
#     print(f"Title : {row['title']}")

In [90]:
# # Making Simillar Product Serach based on Input

from sentence_transformers import util
# Get similar items based on combined name and category
product_info_to_search = infer_df['combined_text_str'].values[0]

similar_items = get_similar_items(product_info_to_search, unpickled_df,model)


# Print the formatted output
# print(f"Product: {product_info_to_search}")
print("\nSimilar Movies:")
for idx, row in similar_items.iterrows():
    print(f"Product : {row['title']}")


Similar Movies:
Product : Avatar
Product : Escape from Planet Earth
Product : Mission to Mars
Product : Battleship
Product : Galaxy Quest
Product : Planet 51
