In [1]:
import pandas as pd
import numpy as np
import os
import nltk
import string
import ast
import pickle
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Fetch the NLTK resources this notebook relies on: the Punkt tokenizer models
# required by nltk.tokenize.word_tokenize and the English stop-word list.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead of
# 'punkt' — confirm against the installed version.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/german/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Load the preprocessed viewing history and the multi-platform title catalogue
history_path = '../data/netflix_history_preprocessed.csv'
titles_path = '../data/all_plattforms_titles_preprocessed.csv'
history_df = pd.read_csv(history_path)
titles_df = pd.read_csv(titles_path)

In [4]:
# Display the loaded viewing history for a quick visual check
history_df

Unnamed: 0,Title,Start Time,Total Duration
0,tote mädchen lügen nicht,2018-06-04 17:39:00,154323.0
1,haus des geldes,2018-06-17 13:45:00,196086.0
2,naruto,2018-06-28 06:09:00,280744.0
3,death note,2018-06-28 10:51:00,5999.0
4,stranger things,2018-06-29 08:39:00,25353.0
...,...,...,...
270,tiny house nation usa,2022-12-15 17:29:00,33778.0
271,falling for christmas,2022-12-24 20:52:00,5555.0
272,your place or mine,2023-02-13 20:54:00,6257.0
273,anatomie eines skandals,2023-03-05 20:45:00,14850.0


In [5]:
# The list-valued columns are stored as text in the CSV; parse them back into
# real Python lists.
for list_column in ('director', 'cast', 'country', 'listed_in'):
    titles_df[list_column] = titles_df[list_column].apply(ast.literal_eval)

In [6]:
# De-duplicate the catalogue by title (the first occurrence wins) and renumber
# the rows so that index labels match row positions again.
deduplicated_titles = titles_df.drop_duplicates(subset=['title'], keep='first')
titles_df = deduplicated_titles.reset_index(drop=True)

In [7]:
# Display the de-duplicated title catalogue
titles_df

Unnamed: 0,title,director,cast,country,date_added,release_year,listed_in,description,platform
0,dick johnson is dead,[kirsten johnson],[],[united states],"september 25, 2021",2020,[documentaries],"as her father nears the end of his life, filmm...",netflix
1,blood & water,[],"[ama qamata, khosi ngema, gail mabalane, th...",[south africa],"september 24, 2021",2021,"[international tv shows, tv dramas, tv myste...","after crossing paths at a party, a cape town t...",netflix
2,ganglands,[julien leclercq],"[sami bouajila, tracy gotoas, samuel jouy, ...",[],"september 24, 2021",2021,"[crime tv shows, international tv shows, tv ...",to protect his family from a powerful drug lor...,netflix
3,jailbirds new orleans,[],[],[],"september 24, 2021",2021,"[docuseries, reality tv]","feuds, flirtations and toilet talk go down amo...",netflix
4,kota factory,[],"[mayur more, jitendra kumar, ranjan raj, al...",[india],"september 24, 2021",2021,"[international tv shows, romantic tv shows, ...",in a city of coaching centers known to train i...,netflix
...,...,...,...,...,...,...,...,...,...
18113,tomorrowland,[brad bird],"[george clooney, hugh laurie, britt robertso...","[united states, spain, france, canada, uni...","september 3, 2021",2015,"[action-adventure, science fiction]",a jaded genius and an optimistic teen unearth ...,disney_plus
18114,aquamarine,[elizabeth allen rosenbaum],"[jake mcdorman, arielle kebbel, claudia karv...",[united states],"august 13, 2021",2006,"[comedy, coming of age, fantasy]",two 13-year-old best friends embark on an adve...,disney_plus
18115,eddie the eagle,[dexter fletcher],"[tom costello, jo hartley, keith allen, dic...","[united kingdom, germany, united states]","december 18, 2020",2016,"[biographical, comedy, drama]","true story of eddie edwards, a british ski-jum...",disney_plus
18116,bend it like beckham,[gurinder chadha],"[parminder nagra, keira knightley, jonathan ...","[united kingdom, germany, united states]","september 18, 2020",2003,"[buddy, comedy, coming of age]",despite the wishes of their traditional famili...,disney_plus


In [8]:
# Distinct titles on each side, ready for a set-based comparison
history_titles_set = {watched for watched in history_df['Title']}
titles_set = {catalogued for catalogued in titles_df['title']}

In [9]:
# Titles that appear both in the viewing history and in the catalogue
overlaps = history_titles_set & titles_set

In [10]:
# Keep only the history rows whose title can be found in the catalogue
in_catalogue = history_df['Title'].isin(overlaps)
en_history_df = history_df[in_catalogue]

In [11]:
# Display the history rows that matched a catalogue title
en_history_df

Unnamed: 0,Title,Start Time,Total Duration
2,naruto,2018-06-28 06:09:00,280744.0
3,death note,2018-06-28 10:51:00,5999.0
4,stranger things,2018-06-29 08:39:00,25353.0
6,cowspiracy,2018-07-01 19:08:00,10585.0
8,house of cards,2018-07-20 15:19:00,126137.0
...,...,...,...
250,a christmas prince,2021-12-24 14:50:00,15556.0
252,the last kingdom,2022-01-03 08:03:00,136762.0
253,the game changers,2022-02-01 11:58:00,4887.0
255,seaspiracy,2022-02-22 15:22:00,3177.0


In [12]:
# Matched titles as a plain list, in the row order of en_history_df
matched_title_column = en_history_df['Title']
watch_history = matched_title_column.tolist()
watch_history

['naruto',
 'death note',
 'stranger things',
 'cowspiracy',
 'house of cards',
 'welcome to the family',
 'titanic',
 'riverdale',
 'gossip girl',
 'orange is the new black',
 'sierra burgess is a loser',
 'to all the boys i’ve loved before',
 'the kissing booth',
 'greenhouse academy',
 'insatiable',
 '#realityhigh',
 'dude',
 'you get me',
 'shooter',
 'american horror story',
 'chilling adventures of sabrina',
 'black butler',
 'baby',
 'sex education',
 'bodyguard',
 'suits',
 'how to get away with murder',
 'the order',
 'on my block',
 'the perfect date',
 'the protector',
 'bonding',
 'the last summer',
 'kidnapping stella',
 'sintonia',
 'the seven deadly sins',
 'the irregular at magic high school',
 'cam',
 'naruto shippuden',
 'castlevania',
 'how to sell drugs online (fast)',
 'the end of the f***ing world',
 'baki',
 'grimoire of zero',
 'the irishman',
 'revisions',
 'hot girls wanted',
 'the witcher',
 'mulan',
 'the blacklist',
 'flavors of youth',
 'attack on titan',


In [13]:
# Characters treated as punctuation when filtering tokens
_PUNCTUATION_CHARS = frozenset(string.punctuation)

# Lazily filled cache so the stop-word list and stemmer are built only once
_TEXT_RESOURCES = {}


def _text_resources():
    """Return the shared (stop-word set, stemmer) pair, building it on first use."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['stop_words'] = frozenset(nltk.corpus.stopwords.words('english'))
        _TEXT_RESOURCES['stemmer'] = nltk.stem.PorterStemmer()
    return _TEXT_RESOURCES['stop_words'], _TEXT_RESOURCES['stemmer']


def preprocess_text(text):
    """Normalise a piece of text into a space-separated string of stemmed tokens.

    The text is lower-cased and tokenised with ``nltk.tokenize.word_tokenize``;
    tokens made up solely of ``string.punctuation`` characters and English
    stop words are dropped, and the remaining tokens are Porter-stemmed.

    Parameters
    ----------
    text : str
        Raw text, e.g. a title description.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces.
    """
    stop_words, stemmer = _text_resources()

    stemmed_tokens = []
    for token in nltk.tokenize.word_tokenize(text.lower()):
        # Check character by character: a substring test against
        # string.punctuation would let tokens such as '...' or '``' through.
        if all(char in _PUNCTUATION_CHARS for char in token):
            continue
        if token in stop_words:
            continue
        stemmed_tokens.append(stemmer.stem(token))

    return ' '.join(stemmed_tokens)

In [14]:
# Replace every description with its tokenised, filtered and stemmed form
stemmed_descriptions = titles_df['description'].apply(preprocess_text)
titles_df['description'] = stemmed_descriptions

In [15]:
def preprocess_name(name_list):
    """Return a new list in which every name has all of its spaces removed.

    For example ``'kirsten johnson'`` becomes ``'kirstenjohnson'``. Only the
    space character is stripped; other whitespace is kept as is.
    """
    squeezed_names = []
    for full_name in name_list:
        squeezed_names.append(''.join(full_name.split(' ')))
    return squeezed_names

In [16]:
# Fuse the parts of each person's name so that every person is a single token
for people_column in ('director', 'cast'):
    titles_df[people_column] = titles_df[people_column].apply(preprocess_name)

In [17]:
# Collect every actor name from every cast list into one flat list
actor_names = []
for cast_members in titles_df['cast']:
    actor_names.extend(cast_members)

# Tally how often each actor name occurs across all cast lists
name_counts = Counter(actor_names)

In [18]:
def keep_top_three_actors(actor_list, counts=None, k=3):
    """Return the ``k`` most frequent actors of ``actor_list`` as a new list.

    Actors are ordered by descending frequency; actors with equal frequency
    keep their original relative order. The input list is left untouched.

    Parameters
    ----------
    actor_list : list of str
        Actor names of a single title.
    counts : mapping of str to int, optional
        Occurrence count per actor name; names missing from the mapping count
        as 0. Defaults to the notebook-level ``name_counts``.
    k : int, optional
        Number of actors to keep (default 3).

    Returns
    -------
    list of str
        At most ``k`` actor names; an empty list for empty input.
    """
    if counts is None:
        counts = name_counts
    # sorted() builds a new list, so the caller's list is not reordered in place
    ranked_actors = sorted(actor_list, key=lambda name: counts.get(name, 0), reverse=True)
    return ranked_actors[:k]

In [19]:
# Reduce every cast list to its three most frequent actors
top_cast = titles_df['cast'].apply(keep_top_three_actors)
titles_df['cast'] = top_cast

In [20]:
# Display the catalogue after text and name preprocessing
titles_df

Unnamed: 0,title,director,cast,country,date_added,release_year,listed_in,description,platform
0,dick johnson is dead,[kirstenjohnson],[],[united states],"september 25, 2021",2020,[documentaries],father near end life filmmak kirsten johnson s...,netflix
1,blood & water,[],"[thabangmolaba, cindymahlangu, patrickmofokeng]",[south africa],"september 24, 2021",2021,"[international tv shows, tv dramas, tv myste...",cross path parti cape town teen set prove whet...,netflix
2,ganglands,[julienleclercq],"[samibouajila, samueljouy, sofialesaffre]",[],"september 24, 2021",2021,"[crime tv shows, international tv shows, tv ...",protect famili power drug lord skill thief meh...,netflix
3,jailbirds new orleans,[],[],[],"september 24, 2021",2021,"[docuseries, reality tv]",feud flirtat toilet talk go among incarcer wom...,netflix
4,kota factory,[],"[jitendrakumar, mayurmore, ahsaaschanna]",[india],"september 24, 2021",2021,"[international tv shows, romantic tv shows, ...",citi coach center known train india ’ finest c...,netflix
...,...,...,...,...,...,...,...,...,...
18113,tomorrowland,[bradbird],"[kathrynhahn, georgeclooney, hughlaurie]","[united states, spain, france, canada, uni...","september 3, 2021",2015,"[action-adventure, science fiction]",jade geniu optimist teen unearth secret tomorr...,disney_plus
18114,aquamarine,[elizabethallenrosenbaum],"[ariellekebbel, brucespence, claudiakarvan]",[united states],"august 13, 2021",2006,"[comedy, coming of age, fantasy]",two 13-year-old best friend embark adventur di...,disney_plus
18115,eddie the eagle,[dexterfletcher],"[taronegerton, johartley, keithallen]","[united kingdom, germany, united states]","december 18, 2020",2016,"[biographical, comedy, drama]",true stori eddi edward british ski-jump whose ...,disney_plus
18116,bend it like beckham,[gurinderchadha],"[anupamkher, keiraknightley, parmindernagra]","[united kingdom, germany, united states]","september 18, 2020",2003,"[buddy, comedy, coming of age]",despit wish tradit famili two girl aim career ...,disney_plus


In [21]:
# Turn the preprocessed descriptions into TF-IDF vectors (one row per title)
tfidf_vectorizer = TfidfVectorizer()
titles_tfidf = tfidf_vectorizer.fit_transform(titles_df['description'])

# Pairwise cosine similarity of every title with every title; with a single
# argument the matrix is compared against itself.
similarity_scores = cosine_similarity(titles_tfidf)

In [22]:
# Function to check if two lists have overlapping elements
def have_overlap(list1, list2):
    """Return True when ``list1`` and ``list2`` share at least one element."""
    shared_items = set(list1).intersection(set(list2))
    return len(shared_items) > 0

def create_overlap_matrix(column_name, df=None):
    """Build a binary matrix marking which rows share a value in a list column.

    Entry ``[i, j]`` is 1 when the lists stored at row positions ``i`` and
    ``j`` of ``column_name`` have at least one element in common, else 0.
    A row with an empty list therefore has 0 everywhere, including on the
    diagonal.

    Parameters
    ----------
    column_name : str
        Name of a column whose cells are lists of hashable values.
    df : pandas.DataFrame, optional
        Frame to read the column from. Defaults to the notebook-level
        ``titles_df``.

    Returns
    -------
    numpy.ndarray
        Square integer matrix with one row and one column per row of the frame.
    """
    frame = titles_df if df is None else df
    column = frame[column_name].to_list()
    matrix_size = len(column)
    overlap_matrix = np.zeros((matrix_size, matrix_size), dtype=int)

    # Inverted index: value -> positions of the rows whose list contains it.
    # This avoids comparing every pair of rows in Python.
    rows_by_value = {}
    for position, values in enumerate(column):
        for value in set(values):
            rows_by_value.setdefault(value, []).append(position)

    # All rows sharing a value overlap pairwise, so mark the whole sub-block.
    for positions in rows_by_value.values():
        overlap_matrix[np.ix_(positions, positions)] = 1

    return overlap_matrix

In [23]:
# One binary overlap matrix per list-valued metadata column
overlap_director, overlap_cast, overlap_country, overlap_genre = (
    create_overlap_matrix(metadata_column)
    for metadata_column in ('director', 'cast', 'country', 'listed_in')
)

In [24]:
# Evaluation of the recommendation
def evaluate(x1, x2, x3, x4, x5, consider_history=False):
    """Replay the watch history and report how well each next title is ranked.

    For every consecutive pair in ``watch_history`` the earlier title acts as
    the query and the later title is the target. Titles are ranked by a
    weighted sum of the precomputed matrices and the 0-based rank of the
    target is recorded.

    Parameters
    ----------
    x1 : float
        Weight of ``similarity_scores`` (TF-IDF description similarity).
    x2 : float
        Weight of ``overlap_director``.
    x3 : float
        Weight of ``overlap_cast``.
    x4 : float
        Weight of ``overlap_country``.
    x5 : float
        Weight of ``overlap_genre``.
    consider_history : bool, optional
        If True, the score rows of all titles watched so far are summed;
        otherwise only the row of the most recently watched title is used.

    Returns
    -------
    None
        The average rank and the number of targets with rank <= 5 are printed.
    """
    def weighted_row(row_index):
        # Only the rows of watched titles are ever read, so blend them on
        # demand instead of materialising the full weighted n x n matrix.
        return (x1 * similarity_scores[row_index]
                + x2 * overlap_director[row_index]
                + x3 * overlap_cast[row_index]
                + x4 * overlap_country[row_index]
                + x5 * overlap_genre[row_index])

    # Resolve each title to the index label of its first occurrence once,
    # rather than scanning the title column twice per step.
    first_row_of = {}
    for row_label, title in zip(titles_df.index, titles_df['title']):
        first_row_of.setdefault(title, row_label)

    target_ranks = []
    scores = np.zeros(similarity_scores.shape[0])

    for step in range(1, len(watch_history)):
        target_row_index = first_row_of[watch_history[step]]
        prev_row_index = first_row_of[watch_history[step - 1]]

        # Get recommendation based on the similarity
        if consider_history:
            scores += weighted_row(prev_row_index)
        else:
            scores = weighted_row(prev_row_index)

        # Position of the target in a stable descending sort of the scores:
        # every strictly higher score precedes it, as does every equal score
        # that sits at a smaller index.
        target_score = scores[target_row_index]
        ranked_ahead = np.sum(scores > target_score)
        tied_before = np.sum(scores[:target_row_index] == target_score)
        target_ranks.append(int(ranked_ahead + tied_before))

    print('Average rank:', np.mean(target_ranks))
    # NOTE(review): ranks are 0-based and previously watched titles are not
    # removed from the ranking, so `<= 5` covers the first six positions —
    # confirm this is the intended cutoff.
    print('Successful recommendations:', np.sum(np.array(target_ranks) <= 5))

In [25]:
# Description similarity only; the previous title is the sole query
evaluate(1, 0, 0, 0, 0, consider_history=False)

Average rank: 4661.890756302521
Successful recommendations: 3


In [26]:
# Description similarity only; scores accumulate over the watched titles
evaluate(1, 0, 0, 0, 0, consider_history=True)

Average rank: 6336.285714285715
Successful recommendations: 0


In [None]:
# Metadata overlaps only (director, cast, country, genre); no text similarity
evaluate(0, 1, 2, 0.5, 2, consider_history=False)

Average rank: 2491.3303571428573
Successful recommendations: 2


In [None]:
# Metadata overlaps only; scores accumulate over the watched titles
evaluate(0, 1, 2, 0.5, 2, consider_history=True)

Average rank: 2440.5535714285716
Successful recommendations: 1


In [None]:
# Text similarity (weight 50) combined with the metadata overlaps
evaluate(50, 1, 2, 0.5, 2, consider_history=False)

Average rank: 2638.8928571428573
Successful recommendations: 3


In [None]:
# Text similarity combined with metadata overlaps; scores accumulate over the
# watched titles
evaluate(50, 1, 2, 0.5, 2, consider_history=True)

Average rank: 2307.4910714285716
Successful recommendations: 0


In [27]:
# Final blend of text similarity and metadata overlaps; the weights match the
# evaluate(50, 1, 2, 0.5, 2) configuration.
combined_scores = (
    50 * similarity_scores
    + 1 * overlap_director
    + 2 * overlap_cast
    + 0.5 * overlap_country
    + 2 * overlap_genre
)
# Store the matrix as 32-bit floats
combined_scores = np.array(combined_scores, dtype=np.float32)

# The context manager guarantees the file is flushed and closed, even if
# pickling fails part-way through.
with open('../data/similarity_tfidf.pkl', 'wb') as similarity_file:
    pickle.dump(combined_scores, similarity_file)