# Data preprocessing

In [132]:
import pandas as pd
import numpy as np 
import ast
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer

In [133]:
movies_df=pd.read_csv('./data/movies_metadata.csv')
def has_special_char(s):
    regex_pattern = r'[^a-zA-Z0-9\s]'
    return bool(pd.Series(s).str.contains(regex_pattern).any())

  movies_df=pd.read_csv('./data/movies_metadata.csv')


In [134]:
mask1 = movies_df['imdb_id']=='0'
mask2 = movies_df['id'] == '0'
mask3 = movies_df['id'].apply(has_special_char)
mask4 = movies_df['title'] == 'NaN'
mask = mask1 | mask2 | mask3 | mask4 
res_rows = movies_df[mask]

mask = ~mask
movies_df = movies_df[mask]
movies_df['id']=movies_df['id'].astype(int)

In [135]:
movies_df['description'] = movies_df['tagline'] + movies_df['overview']
movies_df['description'] = movies_df['description'].fillna('')

mask = movies_df['description'] == ''
mask = ~mask
movies_df = movies_df[mask]

In [137]:
keywords_df = pd.read_csv('./data/keywords.csv')
keywords_df = keywords_df.drop_duplicates(subset=['id'])

In [138]:
movies_df=movies_df.merge(keywords_df, on='id', how='inner')
movies_df = movies_df.drop_duplicates(subset=['id'])

In [139]:
specified_column = 'title'

duplicates = movies_df[movies_df.duplicated(subset=[specified_column], keep=False)]

selected_columns = ['id', 'title', 'description', 'genres', 'keywords', 'release_date']
result = duplicates[selected_columns].sort_values(by='title')
display(result)

In [141]:
movies_df['genres'] = movies_df['genres'].apply(ast.literal_eval)
movies_df['genres'] = movies_df['genres'].apply(lambda genres: [genre['name'] for genre in genres])

movies_df['keywords'] = movies_df['keywords'].apply(ast.literal_eval)
movies_df['keywords'] = movies_df['keywords'].apply(lambda keywords: [keyword['name'] for keyword in keywords])

In [142]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

def preprocess_description(description):
    if isinstance(description, str):
        tokens = word_tokenize(description.lower())
        stop_words = set(stopwords.words('english'))
        filtered_tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
        lemmatizer = WordNetLemmatizer()
        lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
        preprocessed_description = ' '.join(lemmatized_tokens)
        return preprocessed_description
    else:
        return ''
    
movies_df['preprocessed_description'] = movies_df['description'].apply(preprocess_description)


[nltk_data] Downloading package punkt to /home/lukrecija/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/lukrecija/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/lukrecija/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [143]:
mask = movies_df['preprocessed_description'] == ''
mask = ~mask
movies_df = movies_df[mask]

In [144]:
movies_df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,spoken_languages,status,tagline,title,video,vote_average,vote_count,description,keywords,preprocessed_description
0,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,Roll the dice and unleash the excitement!When ...,"[board game, disappearance, based on children'...",roll dice unleash excitement sibling judy pete...
1,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[Romance, Comedy]",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,Still Yelling. Still Fighting. Still Ready for...,"[fishing, best friend, duringcreditsstinger, o...",still yelling still fighting still ready famil...
2,False,,16000000,"[Comedy, Drama, Romance]",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,Friends are the people who let you be yourself...,"[based on novel, interracial relationship, sin...",friend people let never let forget mistreated ...
3,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,[Comedy],,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,Just When His World Is Back To Normal... He's ...,"[baby, midlife crisis, confidence, aging, daug...",world back normal surprise life george bank re...
4,False,,60000000,"[Action, Crime, Drama, Thriller]",,949,tt0113277,en,Heat,"Obsessive master thief, Neil McCauley leads a ...",...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,A Los Angeles Crime Saga,Heat,False,7.7,1886.0,A Los Angeles Crime SagaObsessive master thief...,"[robbery, detective, bank, obsession, chase, s...",los angeles crime sagaobsessive master thief n...


In [145]:
empty_keywords_rows = movies_df[movies_df['keywords'].apply(lambda x: len(x) == 0)]

empty_keywords_ids = empty_keywords_rows['id']
display(empty_keywords_rows)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,spoken_languages,status,tagline,title,video,vote_average,vote_count,description,keywords,preprocessed_description
6,False,,0,"[Action, Adventure, Drama, Family]",,45325,tt0112302,en,Tom and Huck,"A mischievous young boy, Tom Sawyer, witnesses...",...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,The Original Bad Boys.,Tom and Huck,False,5.4,45.0,"The Original Bad Boys.A mischievous young boy,...",[],original bad mischievous young boy tom sawyer ...
41,False,,0,"[Action, Thriller, Drama]",,117164,tt0109950,en,Guardian Angel,Detective - turned - bodyguard Cynthia McKay (...,...,[],Released,She's no angel of mercy.,Guardian Angel,False,6.3,3.0,She's no angel of mercy.Detective - turned - b...,[],angel turned bodyguard cynthia mckay cynthia r...
51,False,,0,"[Drama, Family]",,40628,tt0114753,en,Two Bits,"It's a hot summer day in 1933 in South Philly,...",...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,You're never too old to believe in a dream. Or...,Two Bits,False,5.4,11.0,You're never too old to believe in a dream. Or...,[],never old believe dream young make one come tr...
73,False,,0,"[Action, Adventure, Drama, Science Fiction, Th...",,45549,tt0111173,en,Shopping,"A dark, hip, urban story of a barren and anony...",...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,No one leaves without paying...,Shopping,False,5.6,13.0,"No one leaves without paying...A dark, hip, ur...",[],one leaf without paying dark hip urban story b...
81,False,,0,[Documentary],,89333,tt0112646,en,Catwalk,A documentary following Christy Turlington and...,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"The moment when fashion, art and entertainment...",Catwalk,False,7.0,2.0,"The moment when fashion, art and entertainment...",[],moment fashion art entertainment come together...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20389,False,,0,"[Comedy, Drama, Romance]",,174271,tt0116345,en,The Fortunes and Misfortunes of Moll Flanders,In her filthy cell in Newgate prison Moll Flan...,...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Notorius Scandalous Unforgettable,The Fortunes and Misfortunes of Moll Flanders,False,5.8,8.0,Notorius Scandalous UnforgettableIn her filthy...,[],notorius scandalous unforgettablein filthy cel...
20390,False,,0,"[Comedy, Horror]",,65416,tt0251582,en,An American Vampire Story,A group of friends go on a vacation with some ...,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,You're not going anywhere,An American Vampire Story,False,4.5,4.0,You're not going anywhereA group of friends go...,[],going anywherea group friend go vacation new f...
20393,False,,0,"[Action, Mystery, Thriller, Horror]",,45527,tt1331329,en,The Final Storm,A stranger named Silas flees from a devastatin...,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"Action, Horror",The Final Storm,False,3.7,11.0,"Action, HorrorA stranger named Silas flees fro...",[],action horrora stranger named silas flees deva...
20399,False,,0,"[Comedy, Drama]",,420346,tt4130180,en,The Morning After,The Morning After is a feature film that consi...,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,What happened last night?,The Morning After,False,4.0,2.0,What happened last night?The Morning After is ...,[],happened last night morning feature film consi...


In [146]:
def generate_keywords(description):
    if not description:
        return []
    
    vectorizer = TfidfVectorizer(stop_words=list(ENGLISH_STOP_WORDS))
    X = vectorizer.fit_transform([description])
    keywords = list(vectorizer.get_feature_names_out())
    keywords = [word for word in keywords if not any(char.isdigit() for char in word)]
    return keywords[:6]

mask = movies_df['keywords'].apply(lambda x: len(x) == 0)
movies_df.loc[mask, 'keywords'] = movies_df.loc[mask, 'description'].apply(generate_keywords)
movies_df[movies_df['id'].isin(empty_keywords_ids)].head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,spoken_languages,status,tagline,title,video,vote_average,vote_count,description,keywords,preprocessed_description
6,False,,0,"[Action, Adventure, Drama, Family]",,45325,tt0112302,en,Tom and Huck,"A mischievous young boy, Tom Sawyer, witnesses...",...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,The Original Bad Boys.,Tom and Huck,False,5.4,45.0,"The Original Bad Boys.A mischievous young boy,...","[accused, adventures, alcoholic, bad, boy, boys]",original bad mischievous young boy tom sawyer ...
41,False,,0,"[Action, Thriller, Drama]",,117164,tt0109950,en,Guardian Angel,Detective - turned - bodyguard Cynthia McKay (...,...,[],Released,She's no angel of mercy.,Guardian Angel,False,6.3,3.0,She's no angel of mercy.Detective - turned - b...,"[accompanies, action, angel, bodyguard, cynthi...",angel turned bodyguard cynthia mckay cynthia r...
51,False,,0,"[Drama, Family]",,40628,tt0114753,en,Two Bits,"It's a hot summer day in 1933 in South Philly,...",...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,You're never too old to believe in a dream. Or...,Two Bits,False,5.4,11.0,You're never too old to believe in a dream. Or...,"[act, ailing, believe, business, buy, come]",never old believe dream young make one come tr...
73,False,,0,"[Action, Adventure, Drama, Science Fiction, Th...",,45549,tt0111173,en,Shopping,"A dark, hip, urban story of a barren and anony...",...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,No one leaves without paying...,Shopping,False,5.6,13.0,"No one leaves without paying...A dark, hip, ur...","[aid, anonymous, arrive, barren, billy, business]",one leaf without paying dark hip urban story b...
81,False,,0,[Documentary],,89333,tt0112646,en,Catwalk,A documentary following Christy Turlington and...,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"The moment when fashion, art and entertainment...",Catwalk,False,7.0,2.0,"The moment when fashion, art and entertainment...","[art, beauty, christy, come, documentary, ente...",moment fashion art entertainment come together...


In [147]:
movies_df['genres'] = movies_df['genres'].apply(lambda x: str(x).replace('[', '{').replace(']', '}'))

movies_df['keywords'] = movies_df['keywords'].apply(lambda x: str(x).replace('[', '{').replace(']', '}'))

In [148]:
def remove_single_quotes(x):
    return re.sub(r"(?<![a-zA-Z])'|'(?![a-zA-Z])", "", x) if pd.notnull(x) else x

movies_df['keywords'] = movies_df['keywords'].apply(remove_single_quotes)

In [149]:
movies_df['keywords'] = movies_df['keywords'].apply(str).str.replace('"', '')

movies_df['genres'] = movies_df['genres'].apply(str).str.replace("'", '')

In [150]:
movies_df.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,spoken_languages,status,tagline,title,video,vote_average,vote_count,description,keywords,preprocessed_description
0,False,,65000000,"{Adventure, Fantasy, Family}",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,Roll the dice and unleash the excitement!When ...,"{board game, disappearance, based on children'...",roll dice unleash excitement sibling judy pete...
1,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"{Romance, Comedy}",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,Still Yelling. Still Fighting. Still Ready for...,"{fishing, best friend, duringcreditsstinger, o...",still yelling still fighting still ready famil...


In [152]:
min_vote_count = movies_df[movies_df['vote_count'] > 0]['vote_count'].min()
max_vote_count = movies_df['vote_count'].max()

print(f"Minimum vote_count: {min_vote_count}")
print(f"Maximum vote_count: {max_vote_count}")


Minimum vote_count: 1.0
Maximum vote_count: 14075.0


In [153]:
m = movies_df['vote_count'].quantile(0.7) 
C = movies_df['vote_average'].mean()

def calculate_weighted_rating(x, m, C):
    v = x['vote_count']
    R = x['vote_average']
    return (v / (v + m)) * R + (m / (m + v)) * C

movies_df['weighted_rating'] = movies_df.apply(calculate_weighted_rating, args=(m, C), axis=1)


In [125]:
columns_to_drop=['adult', 'belongs_to_collection', 'budget', 'homepage',
       'original_language', 'original_title', 'overview',
       'popularity',  'production_companies',
       'production_countries', 'revenue', 
       'spoken_languages', 'status', 'tagline', 'video',
       'vote_average', 'vote_count']

movies_df.drop(columns=columns_to_drop, inplace=True)

In [126]:
movies_df.head()

Unnamed: 0,genres,id,imdb_id,poster_path,release_date,runtime,title,description,keywords,preprocessed_description,weighted_rating
0,"{Adventure, Fantasy, Family}",8844,tt0113497,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,1995-12-15,104.0,Jumanji,Roll the dice and unleash the excitement!When ...,"{board game, disappearance, based on children'...",roll dice unleash excitement sibling judy pete...,6.86723
1,"{Romance, Comedy}",15602,tt0113228,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,1995-12-22,101.0,Grumpier Old Men,Still Yelling. Still Fighting. Still Ready for...,"{fishing, best friend, duringcreditsstinger, o...",still yelling still fighting still ready famil...,6.170573
2,"{Comedy, Drama, Romance}",31357,tt0114885,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,1995-12-22,127.0,Waiting to Exhale,Friends are the people who let you be yourself...,"{based on novel, interracial relationship, sin...",friend people let never let forget mistreated ...,5.856086
3,{Comedy},11862,tt0113041,/e64sOI48hQXyru7naBFyssKFxVd.jpg,1995-02-10,106.0,Father of the Bride Part II,Just When His World Is Back To Normal... He's ...,"{baby, midlife crisis, confidence, aging, daug...",world back normal surprise life george bank re...,5.710835
4,"{Action, Crime, Drama, Thriller}",949,tt0113277,/zMyfPUelumio3tiDKPffaUpsQTD.jpg,1995-12-15,170.0,Heat,A Los Angeles Crime SagaObsessive master thief...,"{robbery, detective, bank, obsession, chase, s...",los angeles crime sagaobsessive master thief n...,7.629771


In [127]:
movies_df.to_csv('./data/preprocessed_data_for_db.csv', index=False)

In [128]:
movies_df['genres'] = movies_df['genres'].apply(lambda x: str(x).replace('{', '').replace('}', ''))
movies_df['genres'] = movies_df['genres'].apply(lambda x: str(x).replace(',', ''))
movies_df['keywords'] = movies_df['keywords'].apply(lambda x: str(x).replace('{', '').replace('}', ''))

movies_df['features'] = movies_df['genres'] + ' ' + movies_df['preprocessed_description'] + ' ' + movies_df['keywords']
movies_df['features'] = movies_df['features'].fillna('')
movies_df['features'] = movies_df['features'].str.lower()

In [129]:
movies_df.to_csv('./data/preprocessed_data.csv', index=False)

In [155]:
ratings = pd.read_csv('./data/ratings_small.csv')

threshold = 3
ratings['binary_rating']=(ratings['rating']>=threshold).astype(int)
ratings.to_csv('./data/ratings.csv', index=False)