## Initial Commit: Load and Merge the Ratings, Movies, Tags, and Links Tables

In [1]:
import surprise
from surprise.prediction_algorithms import *
import pandas as pd
import numpy as np
import datetime as dt

In [41]:
# Directory that holds the CSV inputs, relative to the notebook's working
# directory. Defined once so the location can be changed in a single place.
DATA_DIR = 'data'

# Read each CSV file into its own DataFrame.
link_df = pd.read_csv(f'{DATA_DIR}/links.csv')
movies_df = pd.read_csv(f'{DATA_DIR}/movies.csv')
ratings_df = pd.read_csv(f'{DATA_DIR}/ratings.csv')
tags_df = pd.read_csv(f'{DATA_DIR}/tags.csv')

In [42]:
# Display the frame read from links.csv (columns: movieId, imdbId, tmdbId).
link_df


Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0
...,...,...,...
9737,193581,5476944,432131.0
9738,193583,5914996,445030.0
9739,193585,6397426,479308.0
9740,193587,8391976,483455.0


In [43]:
# Display the frame read from movies.csv (columns: movieId, title, genres).
movies_df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [44]:
# Display the frame read from ratings.csv (columns: userId, movieId, rating, timestamp).
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [45]:
# Print the tags frame's column dtypes and non-null counts.
tags_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3683 entries, 0 to 3682
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   userId     3683 non-null   int64 
 1   movieId    3683 non-null   int64 
 2   tag        3683 non-null   object
 3   timestamp  3683 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 115.2+ KB


In [46]:
# Cast both id columns to strings so they can be concatenated into a
# combined key; then re-check the dtypes.
tags_df = tags_df.astype({'userId': str, 'movieId': str})
tags_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3683 entries, 0 to 3682
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   userId     3683 non-null   object
 1   movieId    3683 non-null   object
 2   tag        3683 non-null   object
 3   timestamp  3683 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 115.2+ KB


In [49]:
# Add a combined key column made of the userId string immediately followed
# by the movieId string. No separator is inserted, so different pairs can
# yield the same key (e.g. "1" + "23" and "12" + "3" are both "123").
tags_df = tags_df.assign(
    userId_movieId=lambda frame: frame['userId'] + frame['movieId']
)
tags_df

Unnamed: 0,userId,movieId,tag,timestamp,userId_movieId
0,2,60756,funny,1445714994,260756
1,2,60756,Highly quotable,1445714996,260756
2,2,60756,will ferrell,1445714992,260756
3,2,89774,Boxing story,1445715207,289774
4,2,89774,MMA,1445715200,289774
...,...,...,...,...,...
3678,606,7382,for katie,1171234019,6067382
3679,606,7936,austere,1173392334,6067936
3680,610,3265,gun fu,1493843984,6103265
3681,610,3265,heroic bloodshed,1493843978,6103265


In [53]:
# Restore the id columns to integers now that the string key has been built;
# userId_movieId itself stays a string column.
tags_df = tags_df.astype({'userId': 'int64', 'movieId': 'int64'})
tags_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3683 entries, 0 to 3682
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   userId          3683 non-null   int64 
 1   movieId         3683 non-null   int64 
 2   tag             3683 non-null   object
 3   timestamp       3683 non-null   int64 
 4   userId_movieId  3683 non-null   object
dtypes: int64(3), object(2)
memory usage: 144.0+ KB


In [55]:
# Select every combined key that occurs more than once; keep=False marks all
# occurrences of a repeated key, not just the later ones.
is_repeated_key = tags_df['userId_movieId'].duplicated(keep=False)
tags_df.loc[is_repeated_key, 'userId_movieId']

0        260756
1        260756
2        260756
3        289774
4        289774
         ...   
3669    5992959
3670    5992959
3671    5992959
3680    6103265
3681    6103265
Name: userId_movieId, Length: 2510, dtype: object

In [64]:
#https://stackoverflow.com/questions/44397210/drop-duplicates-and-add-values-pandas

# Collapse the tags table to one row per (userId, movieId) pair, joining all
# of that pair's tags into a single space-separated string.
#
# Grouping is done on the real key columns rather than on the concatenated
# `userId_movieId` string: that string has no separator, so distinct pairs
# can collide (user 1 / movie 23 and user 12 / movie 3 both give "123") and
# would otherwise be merged into one row with the wrong ids.
tag_aggregations = {
    'userId_movieId': 'first',  # identical for every row of a (userId, movieId) group
    'timestamp': 'first',       # keep the timestamp of the pair's first tag row
    'tag': ' '.join,
}

tags_df = (
    tags_df
    # sort=False keeps groups in order of first appearance.
    .groupby(['userId', 'movieId'], sort=False, as_index=False)
    .agg(tag_aggregations)
    # The combined key is kept as a numeric column because a later cell drops
    # it by name. It is NOT guaranteed to be unique.
    .assign(
        userId_movieId=lambda frame: pd.to_numeric(frame['userId_movieId'], errors='coerce')
    )
    # Same column order this cell produced before.
    .loc[:, ['userId_movieId', 'userId', 'movieId', 'timestamp', 'tag']]
)

# Each (userId, movieId) pair must now appear exactly once, otherwise the
# later left join onto the ratings would duplicate rating rows.
assert not tags_df.duplicated(subset=['userId', 'movieId']).any()

In [65]:
# Display the tags frame after its rows were grouped and the tags joined.
tags_df

Unnamed: 0,userId_movieId,userId,movieId,timestamp,tag
0,260756,2,60756,1445714994,funny Highly quotable will ferrell
1,289774,2,89774,1445715207,Boxing story MMA Tom Hardy
2,2106782,2,106782,1445715054,drugs Leonardo DiCaprio Martin Scorsese
3,748516,7,48516,1169687325,way too long
4,18431,18,431,1462138765,Al Pacino gangster mafia
...,...,...,...,...,...
1770,6066107,606,6107,1178473747,World War II
1771,6067382,606,7382,1171234019,for katie
1772,6067936,606,7936,1173392334,austere
1773,6103265,610,3265,1493843984,gun fu heroic bloodshed


In [66]:
# Attach each movie's title and genres to its rating rows. pd.merge defaults
# to an inner join, so only movieIds present in both frames are kept.
ratings_movie_df = pd.merge(ratings_df, movies_df, on='movieId')
ratings_movie_df

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
...,...,...,...,...,...,...
100831,610,160341,2.5,1479545749,Bloodmoon (1997),Action|Thriller
100832,610,160527,4.5,1479544998,Sympathy for the Underdog (1971),Action|Crime|Drama
100833,610,160836,3.0,1493844794,Hazard (2005),Action|Drama|Thriller
100834,610,163937,3.5,1493848789,Blair Witch (2016),Horror|Thriller


In [67]:
# Left-join the tags onto the ratings by (movieId, userId): every rating row
# is kept, and rows with no matching tag row get NaN in the tag columns.
# Both frames carry a `timestamp` column, so pandas suffixes them as
# timestamp_x (ratings) and timestamp_y (tags).
ratings_movie_tags_df = pd.merge(
    ratings_movie_df,
    tags_df,
    how='left',
    on=['movieId', 'userId'],
)

In [68]:
# Display the ratings/movies frame after the left join with the tags frame.
ratings_movie_tags_df

Unnamed: 0,userId,movieId,rating,timestamp_x,title,genres,userId_movieId,timestamp_y,tag
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,,,
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,,,
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,,,
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,,,
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,,,
...,...,...,...,...,...,...,...,...,...
100831,610,160341,2.5,1479545749,Bloodmoon (1997),Action|Thriller,,,
100832,610,160527,4.5,1479544998,Sympathy for the Underdog (1971),Action|Crime|Drama,,,
100833,610,160836,3.0,1493844794,Hazard (2005),Action|Drama|Thriller,,,
100834,610,163937,3.5,1493848789,Blair Witch (2016),Horror|Thriller,,,


In [70]:
# Left-join the link table on movieId to add the imdbId / tmdbId columns
# while keeping every row of the ratings/movies/tags frame.
df_complete = pd.merge(
    ratings_movie_tags_df,
    link_df,
    on='movieId',
    how='left',
)

In [71]:
# Display the merged frame, which now includes the imdbId and tmdbId columns.
df_complete

Unnamed: 0,userId,movieId,rating,timestamp_x,title,genres,userId_movieId,timestamp_y,tag,imdbId,tmdbId
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,,,,114709,862.0
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,,,,114709,862.0
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,,,,114709,862.0
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,,,,114709,862.0
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,,,,114709,862.0
...,...,...,...,...,...,...,...,...,...,...,...
100831,610,160341,2.5,1479545749,Bloodmoon (1997),Action|Thriller,,,,118745,30948.0
100832,610,160527,4.5,1479544998,Sympathy for the Underdog (1971),Action|Crime|Drama,,,,66806,90351.0
100833,610,160836,3.0,1493844794,Hazard (2005),Action|Drama|Thriller,,,,798722,70193.0
100834,610,163937,3.5,1493848789,Blair Witch (2016),Horror|Thriller,,,,1540011,351211.0


In [75]:
# Remove the helper key column that came in with the tags frame.
# errors='ignore' keeps this cell safe to re-run: df_complete is reassigned
# here, so a second execution would otherwise raise KeyError because the
# column is already gone.
df_complete = df_complete.drop(columns='userId_movieId', errors='ignore')

In [76]:
# Display the merged frame after the userId_movieId helper column was dropped.
df_complete

Unnamed: 0,userId,movieId,rating,timestamp_x,title,genres,timestamp_y,tag,imdbId,tmdbId
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,,,114709,862.0
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,,,114709,862.0
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,,,114709,862.0
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,,,114709,862.0
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,,,114709,862.0
...,...,...,...,...,...,...,...,...,...,...
100831,610,160341,2.5,1479545749,Bloodmoon (1997),Action|Thriller,,,118745,30948.0
100832,610,160527,4.5,1479544998,Sympathy for the Underdog (1971),Action|Crime|Drama,,,66806,90351.0
100833,610,160836,3.0,1493844794,Hazard (2005),Action|Drama|Thriller,,,798722,70193.0
100834,610,163937,3.5,1493848789,Blair Witch (2016),Horror|Thriller,,,1540011,351211.0
