In [1]:
import pandas as pd

df = pd.read_csv('../data/all_data_for_10000_lines.csv')

## Ajout de la note pondérée

In [8]:
df.head()

# Paramètres pour le calcul du score pondéré
C = df['averagerating'].mean()  # Score moyen de tous les films
m = 1000  # Nombre minimum de votes requis pour être pris en compte

# Calcul du score pondéré
def weighted_rating(x, m=m, C=C):
    v = x['numvotes']
    R = x['averagerating']
    return (v / (v + m) * R) + (m / (v + m) * C)

df['weighted_score'] = df.apply(weighted_rating, axis=1)


In [9]:
df['tconst'][~df['tconst'].isna()].head().values

array(['tt12605172', 'tt12605176', 'tt12605178', 'tt1260518',
       'tt12605180'], dtype=object)

In [10]:
df.describe()

Unnamed: 0,startyear,endyear,runtimeminutes,averagerating,numvotes,seasonnumber,episodenumber,regionnumber,weighted_score
count,7388.0,107.0,2333.0,1027.0,1027.0,6692.0,6692.0,10000.0,1027.0
mean,2007.504602,2007.130841,40.952422,6.956767,606.494645,3.455021,531.684549,4.7208,6.926801
std,19.236246,16.892046,37.221751,1.385817,6620.788704,7.686266,1507.838716,3.46122,0.32637
min,1906.0,1951.0,1.0,1.1,5.0,1.0,0.0,1.0,4.405601
25%,2005.0,1997.5,17.0,6.3,11.0,1.0,7.0,1.0,6.94324
50%,2015.0,2015.0,30.0,7.2,19.0,1.0,50.0,3.0,6.959651
75%,2020.0,2020.0,54.0,7.9,64.5,2.0,335.0,8.0,6.974027
max,2025.0,2023.0,435.0,10.0,177953.0,82.0,13897.0,50.0,8.683838


## Identifier les collone numériques et Catégorielles

In [11]:
df.info()

# Les colonnes numériques identifiées sont : 
# startyear, endyear, runtimeminutes, averagerating, numvotes, seasonnumber, episodenumber, weighted_score, isadult.

# Les colonnes catégorielles identifiées sont : 
# tconst, titletype, primarytitle, genres, regionlist, actor, self, producer, actress, director.


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 20 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          10000 non-null  object 
 1   titletype       10000 non-null  object 
 2   primarytitle    10000 non-null  object 
 3   isadult         10000 non-null  bool   
 4   startyear       7388 non-null   float64
 5   endyear         107 non-null    float64
 6   runtimeminutes  2333 non-null   float64
 7   genres          9776 non-null   object 
 8   averagerating   1027 non-null   float64
 9   numvotes        1027 non-null   float64
 10  seasonnumber    6692 non-null   float64
 11  episodenumber   6692 non-null   float64
 12  regionnumber    10000 non-null  int64  
 13  regionlist      10000 non-null  object 
 14  actor           9195 non-null   object 
 15  self            9195 non-null   object 
 16  producer        9195 non-null   object 
 17  actress         9195 non-null   

## fonctions préprocessing

In [12]:
import nltk
from nltk import PorterStemmer
nltk.download('wordnet')
nltk.download('wordnet')

#fonction de stemming

def stemming(liste):
  stemming = []
  for element in liste:
    elementStemme = PorterStemmer().stem(element)
    stemming.append(elementStemme)
  return stemming

import re
import string

#mettre en minuscule et supprimer les caractères spéciaux et les espaces en début et fin de texte 

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s\[\]]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\melos\AppData\Roaming\nltk_data...
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\melos\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Préprocessing collone TitleType

In [13]:
df['titletype'].value_counts()

# La colonne 'titletype' ne présente pas de valeurs incohérentes.
# Elle n'a pas besoin d'être tokenisée, stemmée, ni traitée pour les stop words, 
# mais elle nécessite d'être encodée.


titletype
tvEpisode       7887
short            821
movie            402
video            289
tvSeries         271
tvMovie          159
tvSpecial         65
videoGame         53
tvMiniSeries      44
tvShort            9
Name: count, dtype: int64

In [21]:
df['titletype'].value_counts()
#Utiliser ordinal encoder pour encoder la colonne titletype
from sklearn.preprocessing import OrdinalEncoder
encoder = OrdinalEncoder()
df["titletype"] = encoder.fit_transform(df['titletype'].values.reshape(-1, 1)) #reshape(-1, 1) pour convertir une liste en tableau



## Préprocessing collone Primarytitle

In [25]:
df['primarytitle'].head().values

# On remarque que les titres sont variés et ne se ressemblent pas. Une tokenisation est donc nécessaire.

# Les stop words ne doivent pas être supprimés, car cela pourrait altérer le sens des titres.
# Certains titres sont définis par leurs stop words, par exemple : "The Walking Dead" ou "The Good Place".

# Un stemming est recommandé. Par exemple, si un utilisateur apprécie un film contenant le mot "run", 
# le stemming ou la lemmatisation permettra de faire correspondre ce mot avec d'autres variantes telles que "running" ou "runs". 
# Cela améliore la qualité des recommandations basées sur la similitude des titres.

# Pour un système simple basé sur la présence ou l'absence de mots, le TF-IDF peut être approprié.
# Il représente chaque titre en fonction des mots qu'il contient, avec un poids plus élevé pour les mots rares et significatifs.
# Cela peut aider à identifier les mots associés aux meilleures notes, analyser leur influence, ou améliorer la gestion des titres similaires.


array(['Episode #1.2', 'Shaheb Bibi Golam', 'Girl in the Mirror',
       'Loose Ends', 'Episode #1.3'], dtype=object)

In [26]:
#Collone primarytitle


#Retirer caractères spéciaux, espaces et retourner en minuscule
df['primarytitle_processed'] = df['primarytitle'].apply(clean_text)

#Stemming
df['primarytitle_processed'] = df['primarytitle_processed'].apply(stemming)

#Tokenization
df['primarytitle_processed'] = df['primarytitle_processed'].apply(lambda x : x.split())

df[['primarytitle_processed','primarytitle']].head()






from sklearn.feature_extraction.text import TfidfVectorizer

# 1. Rejointure des tokens en une chaîne de texte
df['primarytitle_processed'] = df['primarytitle_processed'].apply(lambda x: ' '.join(x))

# 1. Initialisation de TfidfVectorizer et vectorisation
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['primarytitle_processed'])

# 2. Conversion de la matrice TF-IDF en DataFrame avec préfixe
tfidf_df_primary_title = pd.DataFrame(tfidf_matrix.toarray(), columns=['tfidf_' + word for word in vectorizer.get_feature_names_out()])




In [27]:
df_encoded = pd.concat([df.reset_index(drop=True), tfidf_df_primary_title.reset_index(drop=True)], axis=1)
df_encoded

Unnamed: 0,tconst,titletype,primarytitle,isadult,startyear,endyear,runtimeminutes,genres,averagerating,numvotes,...,tfidf_zu,tfidf_zully,tfidf_zum,tfidf_zunami,tfidf_zur,tfidf_zurria,tfidf_zusammen,tfidf_zvui,tfidf_zwerge,tfidf_zwischen
0,tt12605172,2.0,Episode #1.2,False,2020.0,,,"Action,Mystery",,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,tt12605176,7.0,Shaheb Bibi Golam,False,2016.0,,,Drama,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,tt12605178,1.0,Girl in the Mirror,False,2020.0,,6.0,"Music,Short",,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,tt1260518,2.0,Loose Ends,False,2008.0,,,Drama,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,tt12605180,2.0,Episode #1.3,False,2020.0,,,"Action,Mystery",,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,tt12624184,2.0,Canción de cuna,False,2001.0,,,"Crime,Drama,Thriller",,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9996,tt12624186,9.0,Go Diego Go! Great Dinosaur Rescue,False,2008.0,,,Adventure,5.3,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9997,tt1262419,0.0,Sanpei the Fisher Boy,False,2009.0,,118.0,Drama,6.1,109.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9998,tt12624194,0.0,Debbie's Time,True,1971.0,,,Adult,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Preprocessing collone genres

In [28]:
df['genres']

0             Action,Mystery
1                      Drama
2                Music,Short
3                      Drama
4             Action,Mystery
                ...         
9995    Crime,Drama,Thriller
9996               Adventure
9997                   Drama
9998                   Adult
9999                  Sci-Fi
Name: genres, Length: 10000, dtype: object

In [29]:
df['genres'].value_counts()

# On observe 354 combinaisons de genres différentes.
# Il est nécessaire de les tokeniser, et de les encoder.


genres
Drama                    2203
Talk-Show                 722
Comedy                    598
Documentary               537
Reality-TV                425
                         ... 
Crime,Horror,Mystery        1
Action,Comedy,Fantasy       1
Reality-TV,Sport            1
Action,Drama,Mystery        1
Horror,Mystery              1
Name: count, Length: 354, dtype: int64

In [30]:
df['genres'].apply(lambda x : type(x)).value_counts() #On compte les données de type str

df[df['genres'].apply(lambda x: isinstance(x, float))] #On vérifie les valeurs nulles

Unnamed: 0,tconst,titletype,primarytitle,isadult,startyear,endyear,runtimeminutes,genres,averagerating,numvotes,...,episodenumber,regionnumber,regionlist,actor,self,producer,actress,director,weighted_score,primarytitle_processed
36,tt1260526,2.0,Hank Williams Jr.,False,1979.0,,,,,,...,,1,['\\N'],[],[],[],[],['Steven North'],,hank williams jr
41,tt1260527,2.0,Ricky Nelson,False,1979.0,,,,,,...,,1,['\\N'],[],['Ricky Nelson'],[],[],['Steven North'],,ricky nelson
127,tt1260544,2.0,Episode #1.16,False,2008.0,,90.0,,,,...,16.0,8,"['\\N', 'PT', 'IN', 'FR', 'IT', 'DE', 'JP', 'ES']",[],"['Juha Helppi', 'Robin Keston', 'Dave Mattey',...",[],[],[],,episode 116
236,tt12605676,5.0,CBC News: At Issue,False,2007.0,,30.0,,3.6,7.0,...,,2,"['\\N', 'CA']",[],"['Althia Raj', 'Althia Raj', 'Rosemary Barton'...",[],[],[],6.933433,cbc news at issue
291,tt12605782,2.0,Episode #1.11,False,2020.0,,21.0,,,,...,11.0,8,"['\\N', 'PT', 'IN', 'FR', 'IT', 'DE', 'JP', 'ES']",,,,,,,episode 111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9743,tt12623656,4.0,A tocar!,False,2020.0,,,,,,...,,2,"['\\N', 'ES']",[],"['Camille Decourtye', 'Frederic Amat', 'Blaï M...",[],[],"['Camille Decourtye', 'Blaï Mateu']",,a tocar
9797,tt1262377,2.0,Episode #1.7,False,2008.0,,,,,,...,7.0,8,"['\\N', 'PT', 'IN', 'FR', 'IT', 'DE', 'JP', 'ES']","['David Ramírez', 'Toni Mora']","['Jordi Borràs', 'Carles Rexach', 'Anna Llache...",[],[],"['Òscar Lorca', 'Mai Balaguer']",,episode 17
9801,tt1262378,2.0,Episode #5.40,False,2008.0,,,,,,...,40.0,8,"['\\N', 'PT', 'IN', 'FR', 'IT', 'DE', 'JP', 'ES']",[],"['Susanna Griso', 'Ferran Monegal']",[],[],['Ferran Monegal'],,episode 540
9864,tt12623922,2.0,Episode dated 1 August 2005,False,2005.0,,,,,,...,,8,"['\\N', 'PT', 'IN', 'FR', 'IT', 'DE', 'JP', 'ES']",[],"['José Miguel Viñuela', 'Jessica Cirio', 'Thia...",[],[],[],,episode dated 1 august 2005


In [31]:
# 1. Créer une copie de la colonne 'genres' pour le traitement
df['genres_processed'] = df['genres']

# 2. Supprimer les valeurs de type float en convertissant en chaîne de caractères
df['genres_processed'] = df['genres_processed'].astype(str)

# 3. Analyser les genres : compter les occurrences de chaque genre individuel
df['genres_processed'].apply(lambda x: x.split(',')).explode().value_counts()

# 4. Appliquer MultiLabelBinarizer sur la colonne 'genres_processed'
from sklearn.preprocessing import MultiLabelBinarizer

# Initialisation du binariseur
mlb = MultiLabelBinarizer()

# Conversion de la colonne en listes de genres
df['genres_processed'] = df['genres_processed'].apply(lambda x: x.split(','))

# Appliquer le MultiLabelBinarizer pour créer une matrice binaire
multilabel = mlb.fit_transform(df['genres_processed'])


multilabel

array([[1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(10000, 28))

In [32]:
genres_encoded_df = pd.DataFrame(multilabel, columns=mlb.classes_)
df_encoded = pd.concat([df.reset_index(drop=True), genres_encoded_df.reset_index(drop=True)], axis=1)
df_encoded

Unnamed: 0,tconst,titletype,primarytitle,isadult,startyear,endyear,runtimeminutes,genres,averagerating,numvotes,...,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western,nan
0,tt12605172,2.0,Episode #1.2,False,2020.0,,,"Action,Mystery",,,...,0,0,0,0,0,0,0,0,0,0
1,tt12605176,7.0,Shaheb Bibi Golam,False,2016.0,,,Drama,,,...,0,0,0,0,0,0,0,0,0,0
2,tt12605178,1.0,Girl in the Mirror,False,2020.0,,6.0,"Music,Short",,,...,0,0,0,1,0,0,0,0,0,0
3,tt1260518,2.0,Loose Ends,False,2008.0,,,Drama,,,...,0,0,0,0,0,0,0,0,0,0
4,tt12605180,2.0,Episode #1.3,False,2020.0,,,"Action,Mystery",,,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,tt12624184,2.0,Canción de cuna,False,2001.0,,,"Crime,Drama,Thriller",,,...,0,0,0,0,0,0,1,0,0,0
9996,tt12624186,9.0,Go Diego Go! Great Dinosaur Rescue,False,2008.0,,,Adventure,5.3,7.0,...,0,0,0,0,0,0,0,0,0,0
9997,tt1262419,0.0,Sanpei the Fisher Boy,False,2009.0,,118.0,Drama,6.1,109.0,...,0,0,0,0,0,0,0,0,0,0
9998,tt12624194,0.0,Debbie's Time,True,1971.0,,,Adult,,,...,0,0,0,0,0,0,0,0,0,0


## Transformation des actor

In [33]:
def clean_actor_list(actor_list_str):
    if actor_list_str == "[]":
        return []
    if type(actor_list_str) == float:
        return []
    actor_list_str = actor_list_str.strip("[]").replace("'", "").replace('"', "")
    actor_list = actor_list_str.split(", ")
    return [actor for actor in actor_list if actor]

actor_counts = df["actor"].apply(clean_actor_list).explode().value_counts()
top_actors = actor_counts.head(500).index.tolist()
df["filters_actor"] = df['actor'].apply(clean_actor_list).apply(lambda actor_list: [actor for actor in actor_list if actor in top_actors])
mlb_actors = MultiLabelBinarizer()
actors_vector_df = mlb_actors.fit_transform(df["filters_actor"])
actors_encoded_df = pd.DataFrame(actors_vector_df, columns=mlb_actors.classes_)
df_encoded = pd.concat([df_encoded.reset_index(drop=True), actors_encoded_df.reset_index(drop=True)], axis=1)
df_encoded

Unnamed: 0,tconst,titletype,primarytitle,isadult,startyear,endyear,runtimeminutes,genres,averagerating,numvotes,...,Yijun Liu,Yoshi Sakô,Yoshitsugu Matsuoka,Yossi Marshek,Yunlai Xin,Yôsuke Eguchi,Zhilong He,Zishan Rong,Zoran Radmilovic,Ângelo Rodrigues
0,tt12605172,2.0,Episode #1.2,False,2020.0,,,"Action,Mystery",,,...,0,1,0,0,0,0,0,0,0,0
1,tt12605176,7.0,Shaheb Bibi Golam,False,2016.0,,,Drama,,,...,0,0,0,0,0,0,0,0,0,0
2,tt12605178,1.0,Girl in the Mirror,False,2020.0,,6.0,"Music,Short",,,...,0,0,0,0,0,0,0,0,0,0
3,tt1260518,2.0,Loose Ends,False,2008.0,,,Drama,,,...,0,0,0,0,0,0,0,0,0,0
4,tt12605180,2.0,Episode #1.3,False,2020.0,,,"Action,Mystery",,,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,tt12624184,2.0,Canción de cuna,False,2001.0,,,"Crime,Drama,Thriller",,,...,0,0,0,0,0,0,0,0,0,0
9996,tt12624186,9.0,Go Diego Go! Great Dinosaur Rescue,False,2008.0,,,Adventure,5.3,7.0,...,0,0,0,0,0,0,0,0,0,0
9997,tt1262419,0.0,Sanpei the Fisher Boy,False,2009.0,,118.0,Drama,6.1,109.0,...,0,0,0,0,0,0,0,0,0,0
9998,tt12624194,0.0,Debbie's Time,True,1971.0,,,Adult,,,...,0,0,0,0,0,0,0,0,0,0


### Actress


In [34]:
def clean_actress_list(actress_list_str):
    if actress_list_str == "[]":
        return []
    if type(actress_list_str) == float:
        return []
    actress_list_str = actress_list_str.strip("[]").replace("'", "").replace('"', "")
    actress_list = actress_list_str.split(", ")
    return [actress for actress in actress_list if actress]

actress_counts = df["actress"].apply(clean_actor_list).explode().value_counts()
top_actress = actress_counts.head(500).index.tolist()
df["filters_actress"] = df['actress'].apply(clean_actress_list).apply(lambda actress_list: [actress for actress in actress_list if actress in top_actress])
mlb_actress = MultiLabelBinarizer()
actress_vector_df = mlb_actress.fit_transform(df["filters_actress"])
actress_encoded_df = pd.DataFrame(actress_vector_df, columns=mlb_actress.classes_)
df_encoded = pd.concat([df_encoded.reset_index(drop=True), actress_encoded_df.reset_index(drop=True)], axis=1)
df_encoded

Unnamed: 0,tconst,titletype,primarytitle,isadult,startyear,endyear,runtimeminutes,genres,averagerating,numvotes,...,Yan Liu,Yanyan Wang,Yui Horie,Yulia Demóss,Yuliya Menshova,Yuvarani Ravindra,Yôko Hikasa,Yû Kobayashi,Zena Walker,Zoe Petrou
0,tt12605172,2.0,Episode #1.2,False,2020.0,,,"Action,Mystery",,,...,0,0,0,0,0,0,0,0,0,0
1,tt12605176,7.0,Shaheb Bibi Golam,False,2016.0,,,Drama,,,...,0,0,0,0,0,0,0,0,0,0
2,tt12605178,1.0,Girl in the Mirror,False,2020.0,,6.0,"Music,Short",,,...,0,0,0,0,0,0,0,0,0,0
3,tt1260518,2.0,Loose Ends,False,2008.0,,,Drama,,,...,0,0,0,0,0,0,0,0,0,0
4,tt12605180,2.0,Episode #1.3,False,2020.0,,,"Action,Mystery",,,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,tt12624184,2.0,Canción de cuna,False,2001.0,,,"Crime,Drama,Thriller",,,...,0,0,0,0,0,0,0,0,0,0
9996,tt12624186,9.0,Go Diego Go! Great Dinosaur Rescue,False,2008.0,,,Adventure,5.3,7.0,...,0,0,0,0,0,0,0,0,0,0
9997,tt1262419,0.0,Sanpei the Fisher Boy,False,2009.0,,118.0,Drama,6.1,109.0,...,0,0,0,0,0,0,0,0,0,0
9998,tt12624194,0.0,Debbie's Time,True,1971.0,,,Adult,,,...,0,0,0,0,0,0,0,0,0,0


### Self


In [35]:
def clean_self_list(self_list_str):
    if self_list_str == "[]":
        return []
    if type(self_list_str) == float:
        return []
    self_list_str = self_list_str.strip("[]").replace("'", "").replace('"', "")
    self_list = self_list_str.split(", ")
    return [self for self in self_list if self]

self_counts = df["self"].apply(clean_self_list).explode().value_counts()
top_self = self_counts.head(500).index.tolist()
df["filters_self"] = df['self'].apply(clean_self_list).apply(lambda self_list: [self for self in self_list if self in top_self])
mlb_self = MultiLabelBinarizer()
self_vector_df = mlb_self.fit_transform(df["filters_self"])
self_encoded_df = pd.DataFrame(self_vector_df, columns=mlb_self.classes_)
df_encoded = pd.concat([df_encoded.reset_index(drop=True), self_encoded_df.reset_index(drop=True)], axis=1)
df_encoded

Unnamed: 0,tconst,titletype,primarytitle,isadult,startyear,endyear,runtimeminutes,genres,averagerating,numvotes,...,Wayne Smith,Wendy Diamond,Whoopi Goldberg,Willie Ackerman,Xavier Lambrechts,Yann Barthès,Yolanda Dreher,Zack L. Clark,Zak Catchem,Zuri Hall
0,tt12605172,2.0,Episode #1.2,False,2020.0,,,"Action,Mystery",,,...,0,0,0,0,0,0,0,0,0,0
1,tt12605176,7.0,Shaheb Bibi Golam,False,2016.0,,,Drama,,,...,0,0,0,0,0,0,0,0,0,0
2,tt12605178,1.0,Girl in the Mirror,False,2020.0,,6.0,"Music,Short",,,...,0,0,0,0,0,0,0,0,0,0
3,tt1260518,2.0,Loose Ends,False,2008.0,,,Drama,,,...,0,0,0,0,0,0,0,0,0,0
4,tt12605180,2.0,Episode #1.3,False,2020.0,,,"Action,Mystery",,,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,tt12624184,2.0,Canción de cuna,False,2001.0,,,"Crime,Drama,Thriller",,,...,0,0,0,0,0,0,0,0,0,0
9996,tt12624186,9.0,Go Diego Go! Great Dinosaur Rescue,False,2008.0,,,Adventure,5.3,7.0,...,0,0,0,0,0,0,0,0,0,0
9997,tt1262419,0.0,Sanpei the Fisher Boy,False,2009.0,,118.0,Drama,6.1,109.0,...,0,0,0,0,0,0,0,0,0,0
9998,tt12624194,0.0,Debbie's Time,True,1971.0,,,Adult,,,...,0,0,0,0,0,0,0,0,0,0


### Producer


In [36]:
def clean_producer_list(producer_list_str):
    if producer_list_str == "[]":
        return []
    if type(producer_list_str) == float:
        return []
    producer_list_str = producer_list_str.strip("[]").replace("'", "").replace('"', "")
    producer_list = producer_list_str.split(", ")
    return [producer for producer in producer_list if producer]

producer_counts = df["producer"].apply(clean_producer_list).explode().value_counts()
top_producer = producer_counts.head(500).index.tolist()
df["filters_producer"] = df['producer'].apply(clean_producer_list).apply(lambda producer_list: [producer for producer in producer_list if producer in top_producer])
mlb_producer = MultiLabelBinarizer()
producer_vector_df = mlb_producer.fit_transform(df["filters_producer"])
producer_encoded_df = pd.DataFrame(producer_vector_df, columns=mlb_producer.classes_)
df_encoded = pd.concat([df_encoded.reset_index(drop=True), producer_encoded_df.reset_index(drop=True)], axis=1)
df_encoded

Unnamed: 0,tconst,titletype,primarytitle,isadult,startyear,endyear,runtimeminutes,genres,averagerating,numvotes,...,Wojciech Jedrkiewicz,Yasuhiro Minamimoto,Yetkin Yagmur,Ylenia Castillo,Yolanda del Val,Yuna Kamiura,Yuri Sudo,Yûji Nunokawa,Zeberiah Newman,Zeynep Üstünipek
0,tt12605172,2.0,Episode #1.2,False,2020.0,,,"Action,Mystery",,,...,0,0,0,0,0,0,0,0,0,0
1,tt12605176,7.0,Shaheb Bibi Golam,False,2016.0,,,Drama,,,...,0,0,0,0,0,0,0,0,0,0
2,tt12605178,1.0,Girl in the Mirror,False,2020.0,,6.0,"Music,Short",,,...,0,0,0,0,0,0,0,0,0,0
3,tt1260518,2.0,Loose Ends,False,2008.0,,,Drama,,,...,0,0,0,0,0,0,0,0,0,0
4,tt12605180,2.0,Episode #1.3,False,2020.0,,,"Action,Mystery",,,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,tt12624184,2.0,Canción de cuna,False,2001.0,,,"Crime,Drama,Thriller",,,...,0,0,0,0,0,0,0,0,0,0
9996,tt12624186,9.0,Go Diego Go! Great Dinosaur Rescue,False,2008.0,,,Adventure,5.3,7.0,...,0,0,0,0,0,0,0,0,0,0
9997,tt1262419,0.0,Sanpei the Fisher Boy,False,2009.0,,118.0,Drama,6.1,109.0,...,0,0,0,0,0,0,0,0,0,0
9998,tt12624194,0.0,Debbie's Time,True,1971.0,,,Adult,,,...,0,0,0,0,0,0,0,0,0,0


### Director


In [37]:
def clean_director_list(director_list_str):
    if director_list_str == "[]":
        return []
    if type(director_list_str) == float:
        return []
    director_list_str = director_list_str.strip("[]").replace("'", "").replace('"', "")
    director_list = director_list_str.split(", ")
    return [director for director in director_list if director]

director_counts = df["director"].apply(clean_director_list).explode().value_counts()
top_director = director_counts.head(500).index.tolist()
df["filters_director"] = df['director'].apply(clean_director_list).apply(lambda director_list: [director for director in director_list if director in top_director])
mlb_director = MultiLabelBinarizer()
director_vector_df = mlb_director.fit_transform(df["filters_director"])
director_encoded_df = pd.DataFrame(director_vector_df, columns=mlb_director.classes_)
df_encoded = pd.concat([df_encoded.reset_index(drop=True), director_encoded_df.reset_index(drop=True)], axis=1)
df_encoded

Unnamed: 0,tconst,titletype,primarytitle,isadult,startyear,endyear,runtimeminutes,genres,averagerating,numvotes,...,Yann Michel,Yasunori Ide,Yosra Sanhaji,Yvan Lagrange,Zach Kozek,Zion Rubin,Álvaro Curiel,Álvaro Fugulin,Ángel Baviano,Özge Toprak
0,tt12605172,2.0,Episode #1.2,False,2020.0,,,"Action,Mystery",,,...,0,0,0,0,0,0,0,0,0,0
1,tt12605176,7.0,Shaheb Bibi Golam,False,2016.0,,,Drama,,,...,0,0,0,0,0,0,0,0,0,0
2,tt12605178,1.0,Girl in the Mirror,False,2020.0,,6.0,"Music,Short",,,...,0,0,0,0,0,0,0,0,0,0
3,tt1260518,2.0,Loose Ends,False,2008.0,,,Drama,,,...,0,0,0,0,0,0,0,0,0,0
4,tt12605180,2.0,Episode #1.3,False,2020.0,,,"Action,Mystery",,,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,tt12624184,2.0,Canción de cuna,False,2001.0,,,"Crime,Drama,Thriller",,,...,0,0,0,0,0,0,0,0,0,0
9996,tt12624186,9.0,Go Diego Go! Great Dinosaur Rescue,False,2008.0,,,Adventure,5.3,7.0,...,0,0,0,0,0,0,0,0,0,0
9997,tt1262419,0.0,Sanpei the Fisher Boy,False,2009.0,,118.0,Drama,6.1,109.0,...,0,0,0,0,0,0,0,0,0,0
9998,tt12624194,0.0,Debbie's Time,True,1971.0,,,Adult,,,...,0,0,0,0,0,0,0,0,0,0


## Transformation de regionlist

In [38]:
def clean_list(list_str):
    if list_str == "[]":
        return []
    if type(list_str) == float:
        return []
    list_str = list_str.strip("[]").replace("'", "").replace('"', "").replace("\\\\N", "")
    list_str = list_str.split(", ")
    return [region for region in list_str if region]

mlb_region = MultiLabelBinarizer()
region_vector_df = mlb_region.fit_transform(df["regionlist"].apply(clean_list))
region_encoded_df = pd.DataFrame(region_vector_df, columns=mlb_region.classes_)
df_encoded = pd.concat([df_encoded.reset_index(drop=True), region_encoded_df.reset_index(drop=True)], axis=1)
df_encoded

Unnamed: 0,tconst,titletype,primarytitle,isadult,startyear,endyear,runtimeminutes,genres,averagerating,numvotes,...,UZ,VE,VN,XAS,XEU,XSA,XWG,XWW,XYU,ZA
0,tt12605172,2.0,Episode #1.2,False,2020.0,,,"Action,Mystery",,,...,0,0,0,0,0,0,0,0,0,0
1,tt12605176,7.0,Shaheb Bibi Golam,False,2016.0,,,Drama,,,...,0,0,0,0,0,0,0,0,0,0
2,tt12605178,1.0,Girl in the Mirror,False,2020.0,,6.0,"Music,Short",,,...,0,0,0,0,0,0,0,0,0,0
3,tt1260518,2.0,Loose Ends,False,2008.0,,,Drama,,,...,0,0,0,0,0,0,0,0,0,0
4,tt12605180,2.0,Episode #1.3,False,2020.0,,,"Action,Mystery",,,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,tt12624184,2.0,Canción de cuna,False,2001.0,,,"Crime,Drama,Thriller",,,...,0,0,0,0,0,0,0,0,0,0
9996,tt12624186,9.0,Go Diego Go! Great Dinosaur Rescue,False,2008.0,,,Adventure,5.3,7.0,...,0,0,0,0,0,0,0,0,0,0
9997,tt1262419,0.0,Sanpei the Fisher Boy,False,2009.0,,118.0,Drama,6.1,109.0,...,0,0,0,0,0,0,0,1,0,0
9998,tt12624194,0.0,Debbie's Time,True,1971.0,,,Adult,,,...,0,0,0,0,0,0,0,0,0,0


## Final preprocessing

In [39]:
df_final = df_encoded.drop(columns=["actor", "actress", "self", "producer", "director", "regionlist", "genres"])
df_final

Unnamed: 0,tconst,titletype,primarytitle,isadult,startyear,endyear,runtimeminutes,averagerating,numvotes,seasonnumber,...,UZ,VE,VN,XAS,XEU,XSA,XWG,XWW,XYU,ZA
0,tt12605172,2.0,Episode #1.2,False,2020.0,,,,,1.0,...,0,0,0,0,0,0,0,0,0,0
1,tt12605176,7.0,Shaheb Bibi Golam,False,2016.0,,,,,,...,0,0,0,0,0,0,0,0,0,0
2,tt12605178,1.0,Girl in the Mirror,False,2020.0,,6.0,,,,...,0,0,0,0,0,0,0,0,0,0
3,tt1260518,2.0,Loose Ends,False,2008.0,,,,,18.0,...,0,0,0,0,0,0,0,0,0,0
4,tt12605180,2.0,Episode #1.3,False,2020.0,,,,,1.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,tt12624184,2.0,Canción de cuna,False,2001.0,,,,,,...,0,0,0,0,0,0,0,0,0,0
9996,tt12624186,9.0,Go Diego Go! Great Dinosaur Rescue,False,2008.0,,,5.3,7.0,,...,0,0,0,0,0,0,0,0,0,0
9997,tt1262419,0.0,Sanpei the Fisher Boy,False,2009.0,,118.0,6.1,109.0,,...,0,0,0,0,0,0,0,1,0,0
9998,tt12624194,0.0,Debbie's Time,True,1971.0,,,,,,...,0,0,0,0,0,0,0,0,0,0


In [1]:
import sys
sys.path.append('..')
from scripts.Cleaning import Featurescleaning

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\melos\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\melos\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
df

Unnamed: 0,tconst,titletype,primarytitle,isadult,startyear,endyear,runtimeminutes,genres,averagerating,numvotes,seasonnumber,episodenumber,regionnumber,regionlist,actor,self,producer,actress,director,weighted_score
0,tt12605172,tvEpisode,Episode #1.2,False,2020.0,,,"Action,Mystery",,,1.0,2.0,8,"['PT', 'IN', 'FR', 'IT', 'DE', 'JP', '\\N', 'ES']","['Ryoma Takeuchi', 'Kôtarô Yoshida', 'Hayato I...",[],[],['Mikako Tabe'],['Eiichirô Hasumi'],
1,tt12605176,tvSpecial,Shaheb Bibi Golam,False,2016.0,,,Drama,,,,,2,"['\\N', 'BD']","['Omer Ayaz Ony', 'Fs Nayeem']",[],['Hasan Al Mamun Ponir'],['Aparna Ghosh'],['Mir Asaduzzaman Ariyan'],
2,tt12605178,short,Girl in the Mirror,False,2020.0,,6.0,"Music,Short",,,,,2,"['\\N', 'US']",[],[],['Greg L. Hines'],['Vernae Taylor'],['Greg L. Hines'],
3,tt1260518,tvEpisode,Loose Ends,False,2008.0,,,Drama,,,18.0,5.0,1,['\\N'],"['Paul Akl', 'Michael Apgar', 'Alex De Leon', ...",[],[],"['Amanda Garant', 'Angelica Allen', 'Leah Garv...","['Tessa Olson', 'Jenn Carroll']",
4,tt12605180,tvEpisode,Episode #1.3,False,2020.0,,,"Action,Mystery",,,1.0,3.0,8,"['\\N', 'PT', 'IN', 'FR', 'IT', 'DE', 'JP', 'ES']","['Ryoma Takeuchi', 'Kôtarô Yoshida', 'Tatsuya ...",[],[],['Mikako Tabe'],['Eiichirô Hasumi'],
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,tt12624184,tvEpisode,Canción de cuna,False,2001.0,,,"Crime,Drama,Thriller",,,,,2,"['\\N', 'MX']","['Roberto Sen', 'Sebastián Rulli', 'Hugo Denis']",[],[],['Tiaré Scanda'],"['Felipe Nájera', 'Francisco Franco Alba']",
9996,tt12624186,videoGame,Go Diego Go! Great Dinosaur Rescue,False,2008.0,,,Adventure,5.3,7.0,,,2,"['\\N', 'US']","['Keeler Sandhaus', 'Keeler Sandhaus', 'Keeler...",[],[],"['Katie McWane', 'Katie McWane', 'Michelle Con...",[],6.945251
9997,tt1262419,movie,Sanpei the Fisher Boy,False,2009.0,,118.0,Drama,6.1,109.0,,,5,"['\\N', 'XWW', 'BR', 'JP', 'CN']","['Kenta Suga', 'Masato Hagiwara', 'Ryûji Katag...",[],['Masatake Kondô'],"['Yû Kashii', 'Cynthia Cheston']",['Yôjirô Takita'],6.872558
9998,tt12624194,movie,Debbie's Time,True,1971.0,,,Adult,,,,,2,"['\\N', 'US']",[],[],['Sven Peters'],[],['Sven Peters'],


In [3]:
import pandas as pd
df = pd.read_csv('../data/all_data_for_10000_lines.csv')

In [4]:
new_df = Featurescleaning(df)
new_df

Unnamed: 0,tconst,titletype,primarytitle,isadult,startyear,genres,averagerating,numvotes,seasonnumber,episodenumber,regionnumber,regionlist,actor,self,producer,actress,director,weighted_score
0,tt12605172,tvEpisode,episode 12,False,2020.0,"Action,Mystery",,,1.0,2.0,8,"[PT, IN, FR, IT, DE, JP, ES]","[Tatsuya Fujiwara, Renji Ishibashi, Masanobu A...",[],[],[Mikako Tabe],[Eiichirô Hasumi],6.949817
1,tt12605176,tvSpecial,shaheb bibi golam,False,2016.0,Drama,,,,,2,[BD],[],[],[],[],[],6.949817
2,tt12605178,short,girl in the mirror,False,2020.0,"Music,Short",,,,,2,[US],[],[],[],[],[],6.949817
3,tt1260518,tvEpisode,loose ends,False,2008.0,Drama,,,18.0,5.0,1,[],[],[],[],[],[],6.949817
4,tt12605180,tvEpisode,episode 13,False,2020.0,"Action,Mystery",,,1.0,3.0,8,"[PT, IN, FR, IT, DE, JP, ES]","[Tatsuya Fujiwara, Renji Ishibashi, Masanobu A...",[],[],[Mikako Tabe],[Eiichirô Hasumi],6.949817
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,tt12624184,tvEpisode,cancin de cuna,False,2001.0,"Crime,Drama,Thriller",,,,,2,[MX],[],[],[],[],"[Felipe Nájera, Francisco Franco Alba]",6.949817
9996,tt12624186,videoGame,go diego go great dinosaur rescue,False,2008.0,Adventure,5.3,7.0,,,2,[US],[],[],[],[],[],6.945251
9997,tt1262419,movie,sanpei the fisher boy,False,2009.0,Drama,6.1,109.0,,,5,"[XWW, BR, JP, CN]",[],[],[],[],[],6.872558
9998,tt12624194,movie,debbies time,True,1971.0,Adult,,,,,2,[US],[],[],[Sven Peters],[],[Sven Peters],6.949817
