In [1]:
import pandas as pd
from funpymodeling import status, freq_tbl

In [2]:
import ast

In [3]:
data = pd.read_csv("../datasets/adapted_data/adaptedRatingsData.csv", sep = ",")
data

Unnamed: 0,userId,rating,title,imdbId
0,1,4.0,Toy Story,114709
1,5,4.0,Toy Story,114709
2,7,4.5,Toy Story,114709
3,15,2.5,Toy Story,114709
4,17,4.5,Toy Story,114709
...,...,...,...,...
100831,610,2.5,Bloodmoon,118745
100832,610,4.5,Sympathy for the Underdog,66806
100833,610,3.0,Hazard,798722
100834,610,3.5,Blair Witch,1540011


In [4]:
status(data)

Unnamed: 0,variable,q_nan,p_nan,q_zeros,p_zeros,unique,type
0,userId,0,0.0,0,0.0,610,int64
1,rating,0,0.0,0,0.0,10,float64
2,title,0,0.0,0,0.0,9433,object
3,imdbId,0,0.0,0,0.0,9724,int64


In [5]:
genres_df = pd.read_csv("../datasets/adapted_data/adaptedFeaturesData.csv", sep = ",")
genres_df['genres'] = genres_df['genres'].apply(ast.literal_eval)
genres_df

Unnamed: 0,title,genres,year,imdbId
0,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995.0,114709
1,Jumanji,"[Adventure, Children, Fantasy]",1995.0,113497
2,Grumpier Old Men,"[Comedy, Romance]",1995.0,113228
3,Waiting to Exhale,"[Comedy, Drama, Romance]",1995.0,114885
4,Father of the Bride Part II,[Comedy],1995.0,113041
...,...,...,...,...
9737,Black Butler: Book of the Atlantic,"[Action, Animation, Comedy, Fantasy]",2017.0,5476944
9738,No Game No Life: Zero,"[Animation, Comedy, Fantasy]",2017.0,5914996
9739,Flint,[Drama],2017.0,6397426
9740,Bungo Stray Dogs: Dead Apple,"[Action, Animation]",2018.0,8391976


Usually we would discretize values with qcut, but because we are working with a movie recommendation model, and the decade can impact wether a movie is watched or not. We are going to transform the `year`´s values into decades. 

First we check if there are any **nan**s

In [6]:
status(genres_df.drop("genres",axis=1))

Unnamed: 0,variable,q_nan,p_nan,q_zeros,p_zeros,unique,type
0,title,0,0.0,0,0.0,9448,object
1,year,13,0.001334,0,0.0,106,float64
2,imdbId,0,0.0,0,0.0,9742,int64


As we can see, there are unknown years, so we should try to replace those with the `"unknown"` string value. 

In [7]:
decades=[]
for x in genres_df["year"]:
    if str(x) == "nan":
        decades.append("unknown")
    else:
        decades.append(str(x)[0:3]+"0s")

freq_tbl(decades)

Unnamed: 0,0,frequency,percentage,cumulative_perc
0,2000s,2849,0.292445,0.292445
1,1990s,2212,0.227058,0.519503
2,2010s,1931,0.198214,0.717717
3,1980s,1177,0.120817,0.838534
4,1970s,500,0.051324,0.889858
5,1960s,401,0.041162,0.93102
6,1950s,279,0.028639,0.959659
7,1940s,197,0.020222,0.979881
8,1930s,136,0.01396,0.993841
9,1920s,37,0.003798,0.997639


In [8]:
genres_df["decade"] = decades
genres_df = genres_df.drop("year",axis=1)
genres_df

Unnamed: 0,title,genres,imdbId,decade
0,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",114709,1990s
1,Jumanji,"[Adventure, Children, Fantasy]",113497,1990s
2,Grumpier Old Men,"[Comedy, Romance]",113228,1990s
3,Waiting to Exhale,"[Comedy, Drama, Romance]",114885,1990s
4,Father of the Bride Part II,[Comedy],113041,1990s
...,...,...,...,...
9737,Black Butler: Book of the Atlantic,"[Action, Animation, Comedy, Fantasy]",5476944,2010s
9738,No Game No Life: Zero,"[Animation, Comedy, Fantasy]",5914996,2010s
9739,Flint,[Drama],6397426,2010s
9740,Bungo Stray Dogs: Dead Apple,"[Action, Animation]",8391976,2010s


In [9]:
data

Unnamed: 0,userId,rating,title,imdbId
0,1,4.0,Toy Story,114709
1,5,4.0,Toy Story,114709
2,7,4.5,Toy Story,114709
3,15,2.5,Toy Story,114709
4,17,4.5,Toy Story,114709
...,...,...,...,...
100831,610,2.5,Bloodmoon,118745
100832,610,4.5,Sympathy for the Underdog,66806
100833,610,3.0,Hazard,798722
100834,610,3.5,Blair Witch,1540011


Then, we do One Hot Encoding on the `genres` and `decade` columns.

In [10]:
def agregate_genres(genres):
    union=set()
    for genre_list in genres:
        union.update(genre_list)

    return list(union)

In [11]:
genres_df["imdbId"] = genres_df["imdbId"].astype("str")

In [12]:
df = genres_df[["imdbId","title","decade"]].drop_duplicates(subset=["title"])
df

Unnamed: 0,imdbId,title,decade
0,114709,Toy Story,1990s
1,113497,Jumanji,1990s
2,113228,Grumpier Old Men,1990s
3,114885,Waiting to Exhale,1990s
4,113041,Father of the Bride Part II,1990s
...,...,...,...
9737,5476944,Black Butler: Book of the Atlantic,2010s
9738,5914996,No Game No Life: Zero,2010s
9739,6397426,Flint,2010s
9740,8391976,Bungo Stray Dogs: Dead Apple,2010s


In [13]:
genres_df = genres_df.groupby("title",as_index=False).agg({"genres":agregate_genres})
genres_df

Unnamed: 0,title,genres
0,'71,"[Thriller, Drama, War, Action]"
1,'Hellboy': The Seeds of Creation,"[Adventure, Comedy, Documentary, Fantasy, Action]"
2,'Round Midnight,"[Drama, Musical]"
3,'Salem's Lot,"[Mystery, Thriller, Drama, Horror]"
4,'Til There Was You,"[Romance, Drama]"
...,...,...
9443,eXistenZ,"[Thriller, Sci-Fi, Action]"
9444,xXx,"[Thriller, Crime, Action]"
9445,xXx: State of the Union,"[Thriller, Crime, Action]"
9446,¡Three Amigos!,"[Comedy, Western]"


In [14]:
genres_df=pd.merge(genres_df,df, on="title",how="left")
genres_df

Unnamed: 0,title,genres,imdbId,decade
0,'71,"[Thriller, Drama, War, Action]",2614684,2010s
1,'Hellboy': The Seeds of Creation,"[Adventure, Comedy, Documentary, Fantasy, Action]",424755,2000s
2,'Round Midnight,"[Drama, Musical]",90557,1980s
3,'Salem's Lot,"[Mystery, Thriller, Drama, Horror]",355987,2000s
4,'Til There Was You,"[Romance, Drama]",118523,1990s
...,...,...,...,...
9443,eXistenZ,"[Thriller, Sci-Fi, Action]",120907,1990s
9444,xXx,"[Thriller, Crime, Action]",295701,2000s
9445,xXx: State of the Union,"[Thriller, Crime, Action]",329774,2000s
9446,¡Three Amigos!,"[Comedy, Western]",92086,1980s


In [15]:
genres_df = genres_df.drop_duplicates(subset=["title"])
genres_df

Unnamed: 0,title,genres,imdbId,decade
0,'71,"[Thriller, Drama, War, Action]",2614684,2010s
1,'Hellboy': The Seeds of Creation,"[Adventure, Comedy, Documentary, Fantasy, Action]",424755,2000s
2,'Round Midnight,"[Drama, Musical]",90557,1980s
3,'Salem's Lot,"[Mystery, Thriller, Drama, Horror]",355987,2000s
4,'Til There Was You,"[Romance, Drama]",118523,1990s
...,...,...,...,...
9443,eXistenZ,"[Thriller, Sci-Fi, Action]",120907,1990s
9444,xXx,"[Thriller, Crime, Action]",295701,2000s
9445,xXx: State of the Union,"[Thriller, Crime, Action]",329774,2000s
9446,¡Three Amigos!,"[Comedy, Western]",92086,1980s


In [16]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()

matrix = mlb.fit_transform(genres_df["genres"])

df_genres = pd.DataFrame(matrix, columns=mlb.classes_)

df_genres

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0
1,0,1,1,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0
4,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9443,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0
9444,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
9445,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
9446,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [17]:
genres_df = pd.concat([genres_df[["imdbId","title","decade"]],df_genres],axis=1)
genres_df

Unnamed: 0,imdbId,title,decade,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,2614684,'71,2010s,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
1,424755,'Hellboy': The Seeds of Creation,2000s,0,1,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,90557,'Round Midnight,1980s,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,355987,'Salem's Lot,2000s,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,1,0,0
4,118523,'Til There Was You,1990s,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9443,120907,eXistenZ,1990s,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
9444,295701,xXx,2000s,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
9445,329774,xXx: State of the Union,2000s,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
9446,92086,¡Three Amigos!,1980s,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1


In [18]:
freq_tbl(df["title"])

Unnamed: 0,title,frequency,percentage,cumulative_perc
0,Toy Story,1,0.000106,0.000106
1,Inglorious Bastards (Quel maledetto treno blin...,1,0.000106,0.000212
2,I Think I Love My Wife,1,0.000106,0.000318
3,Premonition,1,0.000106,0.000423
4,Dead Silence,1,0.000106,0.000529
...,...,...,...,...
9443,Norma Rae,1,0.000106,0.999577
9444,Summer Rental,1,0.000106,0.999682
9445,"The 5,000 Fingers of Dr. T",1,0.000106,0.999788
9446,Love Story,1,0.000106,0.999894


Now we do One Hot Encoding in the `decade` column.

In [19]:
decades_category = pd.get_dummies(genres_df["decade"])
decades_category 

Unnamed: 0,1900s,1910s,1920s,1930s,1940s,1950s,1960s,1970s,1980s,1990s,2000s,2010s,unknown
0,False,False,False,False,False,False,False,False,False,False,False,True,False
1,False,False,False,False,False,False,False,False,False,False,True,False,False
2,False,False,False,False,False,False,False,False,True,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,True,False,False
4,False,False,False,False,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9443,False,False,False,False,False,False,False,False,False,True,False,False,False
9444,False,False,False,False,False,False,False,False,False,False,True,False,False
9445,False,False,False,False,False,False,False,False,False,False,True,False,False
9446,False,False,False,False,False,False,False,False,True,False,False,False,False


Once we finished One Hot Encoding, we drop `genres` and `decade` columns and we concat `features_df` with the decades dataframe.

In [20]:
features_df = genres_df.drop(columns = "decade", axis = 1)
features_df

Unnamed: 0,imdbId,title,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,2614684,'71,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
1,424755,'Hellboy': The Seeds of Creation,0,1,1,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
2,90557,'Round Midnight,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,355987,'Salem's Lot,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,1,0,0
4,118523,'Til There Was You,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9443,120907,eXistenZ,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
9444,295701,xXx,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
9445,329774,xXx: State of the Union,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
9446,92086,¡Three Amigos!,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1


In [21]:
features_df = pd.concat([features_df,decades_category],axis=1)
features_df

Unnamed: 0,imdbId,title,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,1930s,1940s,1950s,1960s,1970s,1980s,1990s,2000s,2010s,unknown
0,2614684,'71,0,1,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,True,False
1,424755,'Hellboy': The Seeds of Creation,0,1,1,0,0,1,0,1,...,False,False,False,False,False,False,False,True,False,False
2,90557,'Round Midnight,0,0,0,0,0,0,0,0,...,False,False,False,False,False,True,False,False,False,False
3,355987,'Salem's Lot,0,0,0,0,0,0,0,0,...,False,False,False,False,False,False,False,True,False,False
4,118523,'Til There Was You,0,0,0,0,0,0,0,0,...,False,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9443,120907,eXistenZ,0,1,0,0,0,0,0,0,...,False,False,False,False,False,False,True,False,False,False
9444,295701,xXx,0,1,0,0,0,0,1,0,...,False,False,False,False,False,False,False,True,False,False
9445,329774,xXx: State of the Union,0,1,0,0,0,0,1,0,...,False,False,False,False,False,False,False,True,False,False
9446,92086,¡Three Amigos!,0,0,0,0,0,1,0,0,...,False,False,False,False,False,True,False,False,False,False


Finally we save the datasets

In [22]:
features_df.to_csv("../datasets/processed_data/featuresDataset.csv", index = False)
data.to_csv("../datasets/processed_data/processedData.csv", index = False)