In [822]:
import surprise

In [823]:
import tensorflow as tf

In [824]:
from surprise import KNNWithMeans
from surprise import Dataset
from surprise.model_selection import GridSearchCV
from surprise.model_selection import cross_validate, train_test_split
import zipfile
from surprise import Reader, Dataset, SVD
from surprise import accuracy
import random
from random import randint
import re
from itertools import groupby
import pandas as pd
import copy
import operator

<h4> Read in data

In [825]:
# Read data into an array of strings
def read_data(extended = False):   
    if(extended):
        with open('./new_data.data') as f:
            all_movies = f.readlines()

        # Prepare the data to be used in Surprise
        reader = Reader(line_format='user item rating timestamp', sep='\t')
        data = Dataset.load_from_file('./new_data.data', reader=reader)
        
    else:
        with open('./ml-100k/u.data') as f:
            all_movies = f.readlines()

        # Prepare the data to be used in Surprise
        reader = Reader(line_format='user item rating timestamp', sep='\t')
        data = Dataset.load_from_file('./ml-100k/u.data', reader=reader)
    
    return all_movies, data

In [826]:
# all_lines, data = read_data()

In [827]:
def create_dataframe(data):
    data = [ x.replace('\t', ', ').replace('\n', '') for x in data ]

    df = pd.DataFrame([sub.split(",") for sub in data])
    df.rename(columns={0:'userID', 1:'movieID', 2:'rating', 3: 'timestamp'}, 
                         inplace=True)
    df = df.drop(columns=['timestamp'])
    return df

In [828]:
def strip_content(data):
    r_unwanted = re.compile("[\n\t\r]")
    return r_unwanted.sub(",", data)

In [829]:
# df_final = create_dataframe(all_lines)

<h4> Grid search for best params </h4>

In [193]:
param_grid = {
    "name": ["msd", "cosine"],
    "min_support": [3, 4, 5],
    "user_based": [False, True],
}


gs = GridSearchCV(KNNWithMeans, param_grid, measures=["rmse", "mae"], cv=3)
gs.fit(data)

# print(gs.best_score["rmse"])
print(gs.best_params["rmse"])

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

In [161]:
param_grid = {
    "n_epochs": [5, 10],
    "lr_all": [0.002, 0.005],
    "reg_all": [0.01, 0.1, 0.4]
}
gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=3)

gs.fit(data)

print(gs.best_score["rmse"])
print(gs.best_params["rmse"])

0.9518469960105912
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.1}


<h4> SVD algo chosen, with best params obtained from GS

In [831]:
algo = SVD(n_epochs=10, lr_all=0.005, reg_all=0.1)

In [832]:
# algo = SVD(n_epochs=gs.best_params["rmse"]['n_epochs'], lr_all=gs.best_params["rmse"]['lr_all'], reg_all=gs.best_params["rmse"]['reg_all'])

<h4>Cross validation, 5 folds

In [833]:
# cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

<h4>Create train and test set, apply predictions on test set

In [834]:
def RMSE_predict_train_test(data):
    # sample random trainset and testset
    # test set is made of 25% of the ratings.
    trainset, testset = train_test_split(data, test_size=.25)

    # We'll use the famous SVD algorithm.
    algo = SVD(n_epochs=10, lr_all=0.005, reg_all=0.4)

    # Train the algorithm on the trainset, and predict ratings for the testset
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Then compute RMSE
    return accuracy.rmse(predictions)

<h4> Training Time
   

In [835]:
import timeit

def training_time(data):
    start = timeit.default_timer()

    trainset = data.build_full_trainset()

    algo.fit(trainset)


    stop = timeit.default_timer()

    print('Time: ', stop - start)  

<h4> Create Dataframe

In [836]:
def predict_scores_dict(data):
    # Build Training set. Needed to fit to create model.
    print("build trainingset")
    trainset = data.build_full_trainset()
    
    print("training algo")
    algo.fit(trainset)
    
    # Get all the user and item IDs
    user_ids = trainset.all_users()
    item_ids = trainset.all_items()
    
    # Create empty list to store predictions
    ratings = {}
    ratings_list = []
    
    print("start prediction")
    # For loop, estimate rating of each user for every movie.
    for user_id in user_ids:
        for item_id in item_ids:
            
            prediction = algo.predict(str(user_id), str(item_id)).est
            ratings['userID'] = int(user_id)
            ratings['movieID'] = int(item_id)
            ratings['rating'] = prediction
            
            ratings_list.append(ratings)
            
            ratings = {}
            
    return ratings_list

In [837]:
import timeit

def prediction_time(data):
    
    start = timeit.default_timer()

    predicted_scores_dict =  predict_scores_dict(data)

    stop = timeit.default_timer()

    print('Time: ', stop - start)  


In [838]:
# prediction_time(data)

<h4> Create extended dataset

In [839]:
def create_data(data):
    trainset = data.build_full_trainset()

    user_ids = trainset.all_users()
    item_ids = trainset.all_items()

    data_list = []
    # Create new movies (168200 in total)
    for movie in range(item_ids[-1]+1, item_ids[-1]*100):
        # For every movie, there will be 100 users rating the movie
        user_generated = [randint(0, user_ids[-1]) for p in range(0, 100)]
        for user in user_generated:
            # Create a random generated score for the movies
            new_data = str(user)+'\t'+str(movie)+'\t'+str(random.randint(1,5))+'\t'+'NaN\n'

            data_list.append(new_data)
    return data_list

In [840]:
# data_newest = create_data(data)

Create the new data (as a list)

In [841]:
# new_data = []

# new_data.extend(all_lines)
# new_data.extend(data_newest)

Check difference in sizes


In [842]:
# len(data_newest)/len(all_lines)

<h4> Search Function </h4>

In [843]:
df_predicted_scores = pd.DataFrame(predicted_scores_dict)

In [844]:
def drop_dups(userID, df_final, predicted_scores_df):

    df_user0_predicted = predicted_scores_df[predicted_scores_df['userID'] == userID]
    # df_user0_predicted.sort_values('rating', ascending=False)
    df_user0_predicted.reset_index(drop=True, inplace=True)
    df_final = df_final.astype('int')
    df_user0_original = df_final[df_final['userID'] == userID]
    df_user0_original.reset_index(drop=True, inplace=True)

    dfs_dictionary = {'DF1':df_user0_predicted,'DF2':df_user0_original}
    df3=pd.concat(dfs_dictionary)
    df3=df3.drop_duplicates(subset=['userID', 'movieID'],keep=False)
    
    return df3

In [845]:
# drop_dups(94, df_final, df_predicted_scores).sort_values('rating', ascending=False)

In [846]:
def best_movies_for_user(userID, amount_of_movies, df_final, predicted_scores_df):
    ascending_ratings = drop_dups(userID, df_final, predicted_scores_df).sort_values('rating', ascending=False)
    return ascending_ratings[0:amount_of_movies]

In [847]:
def create_dict_movie_genres():
        # def determine_genre():
    with open('./ml-100k/u.genre') as f:
        all_genres = f.readlines()
    movie_genres = {}

    for i in range(len(all_genres)-1):
        split_genres = all_genres[i].split('|')
        movie_genres[int(split_genres[1].split("\n")[0])] = split_genres[0]
        
    return movie_genres

In [848]:
def genre_per_movie():
    with open('./ml-100k/u.item') as f:
        movie_details = f.readlines()

    movie_details_dict = {}

    for i in range(len(movie_details)):
        genre_list = ([pos for pos, char in enumerate(movie_details[i][-39:-2]) if char == '1'])
        genre_list = np.array(genre_list)
        genre_list = genre_list//2
        genre_list = genre_list.tolist()
        
        movie_details_dict[movie_details[i].split('|')[1]] = genre_list

    return movie_details_dict

In [849]:
def number_to_movie():
    with open('./ml-100k/u.item') as f:
        movie_details = f.readlines()
        
    movie_details_dict = {}
    
    for i in range(len(movie_details)):
        movie_details_dict[int(movie_details[i].split('|')[0])] = movie_details[i].split('|')[1]
    
    movie_details_dict[0] = 'unknown'
    return movie_details_dict

In [850]:
def insert_into_dataframe(predicted_scores_dict):
    complete_prediction = []
    complete_prediction = pd.DataFrame(complete_prediction)
    for i in range(0, len(predicted_scores_dict), 10000000):
        print("appending",i, i+10000000)
        complete_prediction = complete_prediction.append(predicted_scores_dict[i:i+10000000])
        
    return complete_prediction

In [851]:
def prepare_data():
    all_lines, data = read_data()
    
    print("create df_final")
    df_final = create_dataframe(all_lines).astype('int')
    
    print("predict scores")
    predicted_scores_dict =  predict_scores_dict(data)
    
    df_predicted_scores = insert_into_dataframe(predicted_scores_dict)
    
    return df_final, df_predicted_scores

In [852]:
def convert_to_titles(best_movies, number_to_movie):
    i=0
    for movieID in best_movies['movieID']:
        best_movies['movieID'][i] = number_to_movie[int(movieID)]
        i+=1
        
    return best_movies

In [933]:
def best_genre_for_user(userID, amount_of_movies, genre_per_movie, number_to_movie):
    best_movies = best_movies_for_user(userID,amount_of_movies,df_final, df_predicted_scores)    

    list_of_genres = []
    for movieID in best_movies['movieID']:
        list_of_genres.extend(genre_per_movie[number_to_movie[int(movieID)]])

    sorted_genres = {value: len(list(freq)) for value, freq in groupby(sorted(list_of_genres))}
    
    best_genre = max(sorted_genres, key=sorted_genres.get)

    return int_to_genre[best_genre]

In [1111]:
def create_genre_cols(best_movies, movie_genres):
    for i in range(len(movie_genres)):
        best_movies[movie_genres[i]] = int(0)
    return best_movies

In [1249]:
def to_onehot(data):
    for int_genre in movie_to_genre[number_to_mov[data['movieID']]]:
        data[int_to_genre[int_genre]] = int(1)
    return data

In [1251]:
def display_specific_genre(data, genre, amount):
    best_movies_genre = best_movies_for_genre(data,genre, amount)
    best_movies_genre = convert_to_titles(best_movies_genre, number_to_mov) 
    return best_movies_genre[['userID', 'movieID', 'rating', genre]]

In [None]:
def best_movies_for_genre(data, genre, amount):
    return data[data[genre]==1].sort_values('rating', ascending = False)[0:amount]

In [1419]:
def convert_to_titles(best_movies, number_to_movie):
    i=0
    for movieID in best_movies['movieID']:
        best_movies['movieID'].iloc[i] = number_to_mov[int(movieID)]
        i+=1
        
    return best_movies

In [None]:
def get_avg_scores_for_genre(data, genre):
    data_genre = data[data[genre] == 1]
    create_dict = {}
    for movieID in data_genre['movieID']:
        if(movieID in create_dict):
            continue
        else:

            sum_of_rating = sum(data_genre[data_genre['movieID']==movieID]['rating'])
            amount_of_ratings = len(data_genre[data_genre['movieID']==movieID]['rating'])

            avg_rating = sum_of_rating/amount_of_ratings
            create_dict[(movieID)] = avg_rating
    return create_dict

In [1600]:
def get_best_movies_for_genre(data, genre, amount):
    avg_scores = get_avg_scores_for_genre(data, genre)
    df_avg_scores  = pd.DataFrame.from_records([avg_scores]).transpose()
    df_avg_scores.columns = ['rating']
    df_avg_scores['movieID'] = df_avg_scores.index
    df_avg_scores = df_avg_scores.sort_values('rating',ascending=False).iloc[0:amount]
    df_avg_scores = convert_to_titles(df_avg_scores, amount)
    return df_avg_scores

In [1601]:
def get_best_movies_for_genre2(data, genre, amount):
    avg_scores = get_avg_scores_for_genre(data, 'Action')
    avg_scores = sorted(avg_scores.items(), key=lambda kv: kv[1])
    avg_scores.reverse()
    avg_scores[0:amount]
    avg_scores = collections.OrderedDict(avg_scores)
    df_avg_scores  = pd.DataFrame.from_records([avg_scores]).transpose()
    df_avg_scores.columns = ['rating']
    df_avg_scores['movieID'] = df_avg_scores.index
    df_avg_scores = df_avg_scores.sort_values('rating',ascending=False).iloc[0:amount]
    df_avg_scores = convert_to_titles(df_avg_scores, amount)
    return df_avg_scores

<h3> Collaberative Filtering Implementation Examples </h3>

In [1571]:
# Convert int value to movie genres
int_to_genre = create_dict_movie_genres()
# Convert movie title to list of genres
movie_to_genre = genre_per_movie()
# Convert int value (original movie representation) to movie name
number_to_mov = number_to_movie()

In [1572]:
# Create DataFrames with predicted scores and 
df_final, df_predicted_scores = prepare_data()

create df_final
predict scores
build trainingset
training algo
start prediction
appending 0 10000000


In [1575]:
df_predicted_scores

Unnamed: 0,userID,movieID,rating
0,0,0,3.529860
1,0,1,3.891188
2,0,2,3.381856
3,0,3,3.179301
4,0,4,3.569666
...,...,...,...
1586121,942,1677,3.803723
1586122,942,1678,3.835867
1586123,942,1679,4.002668
1586124,942,1680,3.839924


In [1221]:
best_movies = best_movies_for_user(130,10,df_final, df_predicted_scores)
best_movies = best_movies.astype('str')
best_movies = convert_to_titles(best_movies, number_to_mov)

In [1576]:
best_movies

Unnamed: 0,Unnamed: 1,userID,movieID,rating
DF1,318,130,Schindler's List (1993),5.0
DF1,169,130,"Wrong Trousers, The (1993)",5.0
DF1,408,130,"Close Shave, A (1995)",5.0
DF1,483,130,Casablanca (1942),4.959193584271117
DF1,603,130,Rear Window (1954),4.921488596159989
DF1,178,130,12 Angry Men (1957),4.920400213892882
DF1,480,130,North by Northwest (1959),4.894424521050037
DF1,114,130,Wallace & Gromit: The Best of Aardman Animatio...,4.889080246577413
DF1,513,130,"Third Man, The (1949)",4.879865672655963
DF1,657,130,"Manchurian Candidate, The (1962)",4.852145285353883


In [1577]:
best_genre_for_user(148, 10, movie_to_genre, number_to_mov)

'Drama'

In [1224]:
best_movies_extended = create_genre_cols(df_predicted_scores, int_to_genre)

In [1163]:
data = best_movies_extended.apply(to_onehot, axis = 1)

In [1342]:
create_avg_scores_dict = get_avg_scores_for_movie()

In [1615]:
get_best_movies_for_genre(data,'Comedy',5)

Unnamed: 0,rating,movieID
408.0,4.467038,"Close Shave, A (1995)"
169.0,4.42352,"Wrong Trousers, The (1993)"
480.0,4.269971,North by Northwest (1959)
251.0,4.168339,Shall We Dance? (1996)
316.0,4.162157,As Good As It Gets (1997)


In [1611]:
start = timeit.default_timer()
test = get_best_movies_for_genre(data, 'Action', 1000)
stop = timeit.default_timer()
print(stop-start)

start=timeit.default_timer()
test2 = get_best_movies_for_genre2(data, 'Action', 1000)
stop = timeit.default_timer()
print(stop - start)

1.4644015000085346
1.6314131999970414
