In [2]:
import pandas as pd
from surprise.model_selection import train_test_split
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate
from surprise import NormalPredictor, KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline, SVD, BaselineOnly, SVDpp, NMF, SlopeOne, CoClustering
from surprise.accuracy import rmse

In [None]:
# Load the ratings table from disk; it still carries a 'timestamp' column at this point.
df = pd.read_csv('../data/movies/ratings.csv')

In [None]:
# Drop 'timestamp' before the frame is handed to Surprise.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell re-runnable: a second execution no longer raises KeyError.
df = df.drop(columns='timestamp', errors='ignore')

In [None]:
# Tell Surprise which rating range to assume (0 to 5), then wrap the frame as a Surprise dataset.
reader = Reader(rating_scale=(0, 5))
# NOTE(review): load_from_df is documented to expect the columns in user, item, rating order — confirm df matches.
data = Dataset.load_from_df(df,reader)

In [None]:
# Hold out 20% of the ratings for evaluation. A fixed random_state makes the
# split (and every score computed from it) reproducible across kernel restarts.
train, test = train_test_split(data, test_size=0.2, random_state=42)

## Test Surprise Algorithms

In [None]:
benchmark = []
# Compare every Surprise algorithm, each built with its default hyper-parameters.
for algorithm in [
    SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), KNNBaseline(),
    KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering(),
]:
    # 3-fold cross-validation scored by RMSE.
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)

    # Average every reported metric over the folds.
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    # Tag the row with the algorithm's class name. Series.append was removed in
    # pandas 2.0, so the label is attached with pd.concat instead.
    tmp = pd.concat([tmp, pd.Series([type(algorithm).__name__], index=['Algorithm'])])
    benchmark.append(tmp)

In [None]:
# One row per algorithm, ordered by ascending cross-validated RMSE.
surprise_results = (
    pd.DataFrame(benchmark)
    .set_index('Algorithm')
    .sort_values('test_rmse')
)

In [None]:
# Show the benchmark table built above.
surprise_results

## Using Surprise SVD++

In [None]:
svdpp_options = {} # adjust hyper-params here (e.g. n_factors, n_epochs, lr_all, reg_all)
# SVDpp has no `SVDpp_options` keyword, so passing the dict under that name
# raises TypeError. Unpack it so each entry becomes a constructor argument.
algo = SVDpp(**svdpp_options)

In [None]:
# Train on the training split, then predict every held-out rating.
algo.fit(train)
predictions = algo.test(test)

In [None]:
# Only `rmse` is imported (from surprise.accuracy); the `accuracy` module name
# itself is never bound, so call the function directly.
rmse(predictions)

## Merging Nick & Kayla's Movie Rating Data

In [3]:
# Load the wide ratings sheet: one row per person ('Name'), one column per movie title.
teach = pd.read_csv('../data/movies/g90_movie_ratings.csv')

In [4]:
# Preview the first rows of the wide ratings sheet.
teach.head()

Unnamed: 0,Name,21 Jump Street,28 Days Later,A Beautiful Mind,About Time,Air Force One,Alien,Aliens,American Gangster,Annihilation,...,The Godfather,The Intouchables,The Life Aquatic w. Steve Zissou,The Martian,The Pianist,Thor: Ragnarok,Three Billboards Outside Ebbing Missouri,Tron: Legacy,War Games,Wayne's World
0,Kayla Thomas,,,4.0,10.0,3.0,,,4.0,,...,,,3.0,,6.0,,7.0,,,2.0
1,Nick Jacobsohn,7.0,,,,,10.0,8.0,,8.0,...,,,,,,,8.0,,,7.0
2,Andrew,6.0,,,,5.0,6.0,6.0,7.0,,...,,,,,,8.0,7.0,,,
3,John Herr,,,8.0,,,,,,,...,8.0,,,,,,,,,
4,Jarred Bultema,6.0,6.0,8.0,,5.0,5.0,,7.0,3.0,...,,,6.0,8.0,6.0,10.0,,5.0,5.0,7.0


In [8]:
# Keep only the first two rows (Kayla Thomas and Nick Jacobsohn in the recorded output).
# .copy() gives an independent frame, so the in-place edits in later cells do not
# operate on a slice of the original and do not trigger SettingWithCopyWarning.
teach = teach.iloc[:2].copy()

In [9]:
# Show the two retained rows.
teach

Unnamed: 0,Name,21 Jump Street,28 Days Later,A Beautiful Mind,About Time,Air Force One,Alien,Aliens,American Gangster,Annihilation,...,The Godfather,The Intouchables,The Life Aquatic w. Steve Zissou,The Martian,The Pianist,Thor: Ragnarok,Three Billboards Outside Ebbing Missouri,Tron: Legacy,War Games,Wayne's World
0,Kayla Thomas,,,4.0,10.0,3.0,,,4.0,,...,,,3.0,,6.0,,7.0,,,2.0
1,Nick Jacobsohn,7.0,,,,,10.0,8.0,,8.0,...,,,,,,,8.0,,,7.0


In [20]:
# Load a second wide ratings sheet (same Name + movie-title layout in the recorded preview).
# NOTE(review): class_rat is not referenced again in the visible cells — confirm it is still needed.
class_rat = pd.read_csv('../data/movies/g99_movie_rating.csv')

In [21]:
# Preview the first rows of the second ratings sheet.
class_rat.head()

Unnamed: 0,Name,21 Jump Street,28 Days Later,A Beautiful Mind,About Time,Air Force One,Alien,Aliens,American Gangster,Annihilation,...,The Godfather,The Intouchables,The Life Aquatic w. Steve Zissou,The Martian,The Pianist,Thor: Ragnarok,Three Billboards Outside Ebbing Missouri,Tron: Legacy,War Games,Wayne's World
0,Kayla Thomas,5.0,,4.0,10.0,3.0,,,4.0,,...,,,3.0,,6.0,,7.0,,,2.0
1,Alex Cross,5.0,4.0,8.0,,,7.0,,,,...,,,5.0,,,9.0,,,,
2,Alex Rook,,6.0,,,,8.0,6.0,,,...,,,,8.0,,8.0,,,7.0,6.0
3,Dan Reiff,6.0,,,,,,,,,...,10.0,,,5.0,,4.0,4.0,,,
4,Dan Riggi,7.0,3.0,7.0,,7.0,8.0,6.0,9.0,,...,10.0,,5.0,7.0,,7.0,,3.0,,4.0


In [25]:
# Snapshot the column labels as a plain list. Taken while 'Name' is still a
# column, so cols[0] is 'Name' and the movie titles start at cols[1].
cols = teach.columns.tolist()

In [29]:
# Index the rows by person name. A Series is passed rather than a column label,
# so the 'Name' column itself stays in the frame after this call.
teach.set_index(teach['Name'],inplace=True)

In [34]:
# Remove the 'Name' column now that the names live in the index.
# Not re-runnable: a second execution raises KeyError because the column is gone.
teach.drop(columns='Name',inplace=True)

In [40]:
# Represent "not rated" as 0 so every cell is numeric.
teach.fillna(0,inplace=True)

In [42]:
# Halve every rating.
# NOTE(review): the recorded preview shows values up to 10.0, so this presumably
# rescales a 10-point sheet to a 5-point scale — confirm.
# Not idempotent: re-running this cell halves the values again.
teach = teach/2

In [43]:
# Show the name-indexed, zero-filled, halved ratings.
teach

Unnamed: 0_level_0,21 Jump Street,28 Days Later,A Beautiful Mind,About Time,Air Force One,Alien,Aliens,American Gangster,Annihilation,Apollo 13,...,The Godfather,The Intouchables,The Life Aquatic w. Steve Zissou,The Martian,The Pianist,Thor: Ragnarok,Three Billboards Outside Ebbing Missouri,Tron: Legacy,War Games,Wayne's World
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Kayla Thomas,0.0,0.0,2.0,5.0,1.5,0.0,0.0,2.0,0.0,2.0,...,0.0,0.0,1.5,0.0,3.0,0.0,3.5,0.0,0.0,1.0
Nick Jacobsohn,3.5,0.0,0.0,0.0,0.0,5.0,4.0,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,3.5


In [49]:
# Spot-check that the index now holds the person names.
teach.index[0]

'Kayla Thomas'

In [75]:
# Look for an exact title match for cols[3] in the movies table; the recorded output is empty.
movies[movies['title'] == cols[3]]

Unnamed: 0,movieId,title,genres


In [98]:
names_lst = []
movies_lst = []
ratings_lst = []

# Turn the wide table into one (user, movie, rating) triple per rated cell.
# Titles are read straight from teach.columns, so the result no longer depends
# on a separately captured column list and a +1 offset staying in sync.
for name, row in zip(teach.index, teach.values):
    for title, val in zip(teach.columns, row):
        # Cells holding 0 are the filled-in "not rated" placeholders; skip them.
        if val > 0:
            names_lst.append(name)
            movies_lst.append(title)
            ratings_lst.append(val)
            

In [179]:
# Assemble the three parallel lists into a long-format frame, one rating per row.
new_df = pd.DataFrame(
    {
        'user': names_lst,
        'movie': movies_lst,
        'rating': ratings_lst,
    }
)

In [180]:
def f(row):
    """Return the synthetic numeric userId for the person named in ``row['user']``.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row or dict) with a ``'user'`` key.

    Returns
    -------
    int
        99999998 for 'Kayla Thomas', 99999999 for 'Nick Jacobsohn'.

    Raises
    ------
    ValueError
        If the user has no assigned id. Previously this case fell through both
        branches and failed with an opaque UnboundLocalError.
    """
    # NOTE(review): ids are presumably chosen large to avoid colliding with
    # existing userIds in the ratings data — confirm.
    user_ids = {
        'Kayla Thomas': 99999998,
        'Nick Jacobsohn': 99999999,
    }
    user = row['user']
    if user not in user_ids:
        raise ValueError(f"no userId assigned for user {user!r}")
    return user_ids[user]

In [181]:
# Derive a numeric userId for every row by applying f row-wise (axis=1).
new_df['userId'] = new_df.apply(f, axis=1)

In [182]:
# Show the long-format ratings with their numeric userId.
new_df

Unnamed: 0,user,movie,rating,userId
0,Kayla Thomas,A Beautiful Mind,2.0,99999998
1,Kayla Thomas,About Time,5.0,99999998
2,Kayla Thomas,Air Force One,1.5,99999998
3,Kayla Thomas,American Gangster,2.0,99999998
4,Kayla Thomas,Apollo 13,2.0,99999998
5,Kayla Thomas,Back to the Future,2.5,99999998
6,Kayla Thomas,Count of Monte Cristo,4.0,99999998
7,Kayla Thomas,Eternal Sunshine of the Spotless Mind,1.0,99999998
8,Kayla Thomas,Girl with a Pearl Earring,1.5,99999998
9,Kayla Thomas,Gladiator,2.5,99999998


In [183]:
# Attach movieId by exact title match. how='left' keeps every rating row; rows whose
# title has no match in `movies` get NaN in the columns coming from `movies`.
# `movies` is loaded in a cell that appears further down in this notebook.
# NOTE(review): if a title occurs more than once in `movies`, the merge yields one
# row per match — confirm titles are unique enough for this join.
teach_ratings = new_df.merge(movies,how='left',left_on='movie',right_on='title')

In [184]:
# Discard the text columns; only userId, movieId and rating are kept.
# Not re-runnable: a second execution raises KeyError because the columns are gone.
teach_ratings.drop(['genres','movie','title','user'],axis=1,inplace=True)

In [185]:
# Remove rows containing NaN, i.e. ratings whose title found no movieId in the merge.
teach_ratings.dropna(axis=0,inplace=True)

In [186]:
# Cast movieId to integers in one vectorised step (presumably float after the
# left merge introduced NaNs, which have been dropped by now).
teach_ratings['movieId'] = teach_ratings['movieId'].astype('int64')

In [197]:
# Put the columns in userId, movieId, rating order, matching the ratings table they are added to.
teach_ratings = teach_ratings[['userId','movieId','rating']]

In [198]:
# Show the cleaned rating rows; gaps in the index are rows removed by dropna.
teach_ratings

Unnamed: 0,userId,movieId,rating
1,99999998,104374,5.0
2,99999998,1608,1.5
3,99999998,55765,2.0
4,99999998,150,2.0
5,99999998,1270,2.5
7,99999998,7361,1.0
8,99999998,7151,1.5
9,99999998,3578,2.5
10,99999998,8132,2.5
11,99999998,1704,3.0


In [10]:
# Load the ratings table again under its own name, for merging in the extra rows.
ratings = pd.read_csv('../data/movies/ratings.csv')

In [195]:
# Drop 'timestamp' so the columns line up with teach_ratings (userId, movieId, rating).
# Rebinding with errors='ignore' keeps this cell re-runnable instead of raising
# KeyError on a second execution.
ratings = ratings.drop(columns='timestamp', errors='ignore')

In [211]:
# Add the extra rating rows to the main ratings table.
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement and,
# like append, keeps each frame's original index labels.
# NOTE: not idempotent — re-running this cell adds the same rows again.
ratings = pd.concat([ratings, teach_ratings])

In [212]:
# Show the combined ratings table.
ratings

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0
5,1,1263,2.0
6,1,1287,2.0
7,1,1293,2.0
8,1,1339,3.5
9,1,1343,2.0


In [213]:
# Persist the combined ratings without the index column. Written into the data
# directory so the read in the following cell ('../data/movies/ratings_plus.csv')
# picks up this file rather than a stale or missing one.
ratings.to_csv('../data/movies/ratings_plus.csv',index=False)

In [214]:
# Reload the combined ratings file from the data directory.
ratings_kaylanick = pd.read_csv('../data/movies/ratings_plus.csv')

In [216]:
# Check the last rows; the recorded output shows the added userId 99999999 ratings at the end.
ratings_kaylanick.tail()

Unnamed: 0,userId,movieId,rating
100036,99999999,3527,1.5
100037,99999999,296,4.5
100038,99999999,2028,1.5
100039,99999999,8874,4.0
100040,99999999,3253,3.5


In [122]:
# Load the movie catalogue (movieId, title, genres in the recorded previews).
movies = pd.read_csv('../data/movies/movies.csv')

In [123]:
# Strip the trailing " (YYYY)" release year so titles can be matched by plain name.
# An anchored regex is used instead of a fixed [:-7] slice: titles without a year
# suffix are left intact, and re-running the cell does not eat further characters.
movies['title'] = movies['title'].str.replace(r'\s*\(\d{4}\)\s*$', '', regex=True)

In [124]:
# Preview the first ten rows to confirm the year suffix is gone from the titles.
movies[:10]

Unnamed: 0,movieId,title,genres
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji,Adventure|Children|Fantasy
2,3,Grumpier Old Men,Comedy|Romance
3,4,Waiting to Exhale,Comedy|Drama|Romance
4,5,Father of the Bride Part II,Comedy
5,6,Heat,Action|Crime|Thriller
6,7,Sabrina,Comedy|Romance
7,8,Tom and Huck,Adventure|Children
8,9,Sudden Death,Action
9,10,GoldenEye,Action|Adventure|Thriller


In [97]:
# Show the movies table.
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji,Adventure|Children|Fantasy
2,3,Grumpier Old Men,Comedy|Romance
3,4,Waiting to Exhale,Comedy|Drama|Romance
4,5,Father of the Bride Part II,Comedy
5,6,Heat,Action|Crime|Thriller
6,7,Sabrina,Comedy|Romance
7,8,Tom and Huck,Adventure|Children
8,9,Sudden Death,Action
9,10,GoldenEye,Action|Adventure|Thriller


In [96]:
# Exact-title lookup for 'About Time'; the recorded output is empty.
# NOTE(review): presumably executed while titles still carried the "(year)" suffix — confirm.
movies[movies['title']=='About Time']

Unnamed: 0,movieId,title,genres


In [12]:
# Preview the movies table; in the recorded output the titles still include the "(year)" suffix.
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [15]:
# Load the table that maps movieId to imdbId / tmdbId (columns as shown in the preview below).
links = pd.read_csv('../data/movies/links.csv')

In [16]:
# Preview the first rows of the links table.
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [18]:
# Load the free-text tags users attached to movies (userId, movieId, tag, timestamp in the preview below).
tags = pd.read_csv('../data/movies/tags.csv')

In [19]:
# Preview the first rows of the tags table.
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,15,339,sandra 'boring' bullock,1138537770
1,15,1955,dentist,1193435061
2,15,7478,Cambodia,1170560997
3,15,32892,Russian,1170626366
4,15,34162,forgettable,1141391765
