# Exploration done on the EPFL recommender system

In [None]:
from IPython.core.display import display, HTML
from scipy.spatial.distance import pdist, squareform
from sklearn.model_selection import train_test_split
import configparser
import mysql.connector as sql
import pandas as pd
import numpy as np
%config InlineBackend.figure_format = 'retina'

# Load the MySQL host and login from the [mysql] section of credentials.ini,
# then open the connection used by every query in this notebook.
credentials = configparser.ConfigParser()
credentials.read('credentials.ini')
mysql_settings = {
    'host': credentials.get('mysql', 'url'),
    'database': 'semester_project_romain',
    'user': credentials.get('mysql', 'username'),
    'password': credentials.get('mysql', 'password'),
}
db_connection = sql.connect(**mysql_settings)

In [None]:
# Query: every distinct enrolment row of course_enrolments_with_info whose
# UnitName matches "%ommunication%" (first letter left out of the pattern,
# presumably so that both capitalisations match), at Master level, for the
# academic years 2010-2011 through 2015-2016.
# Open question: should the query also filter on PlanType = "PLAN_EXAMINE"?
all_info = """
            select distinct 
                PersonID, 
                PedagogicalCode, 
                StudyDomain, 
                UnitName, 
                UnitID,
                SubjectName, 
                SubjectID,
                SectionName, 
                YearName,
                CourseCode
            from 
                course_enrolments_with_info 
            where 
                UnitName like "%ommunication%" 
                and 
                LevelName = "Master"
                and (YearName = "2010-2011"
                or YearName = "2011-2012"
                or YearName = "2012-2013"
                or YearName = "2013-2014"
                or YearName = "2014-2015"
                or YearName = "2015-2016")
            """
# Run the query on the open MySQL connection and load the result into a DataFrame.
all_df = pd.read_sql(all_info, con=db_connection)
# Last expression of the cell: the notebook renders the DataFrame.
all_df

In [None]:
# Query: the distinct courses of the 2015-2016 academic year, with the same
# UnitName / Master-level filter as the full enrolment query, further
# restricted to rows whose PedagogicalCode starts with "MA".
current_courses = """
            select distinct 
                PedagogicalCode, 
                SubjectName, 
                SubjectID,
                YearName
            from 
                course_enrolments_with_info 
            where 
                UnitName like "%ommunication%" 
                and 
                LevelName = "Master"
                and left(PedagogicalCode, 2) = "MA"
                and YearName = "2015-2016"
            """
# Run the query on the open MySQL connection and load the result into a DataFrame.
current_courses_df = pd.read_sql(current_courses, con=db_connection)
# These are the current courses (latest data, 2015-2016) given in syscom @EPFL.
# Last expression of the cell: the notebook renders the DataFrame.
current_courses_df

### Cleaning up
We need a cleaner dataset of courses: many of them are not useful or are outdated, and should not be recommended.
Should we weed them out by hand for now?

In [None]:
# Most taken courses: count the rows of all_df per (course, year) pair and
# attach that count to every row as the `Registration` column.
registrations_df = all_df.set_index(['SubjectName', 'YearName'])
all_df_registrations = registrations_df.groupby(['SubjectName', 'YearName']).size()
registrations_df['Registration'] = all_df_registrations

# Clean-up pipeline, in order:
#   1. keep only rows that have a study domain (drops placeholder entries
#      such as projects, groups and minors);
#   2. drop the SHS (humanities and social sciences) programme;
#   3. drop the per-student / administrative columns, then collapse the
#      resulting duplicates so each remaining combination appears once;
#   4. index by (course, year) and sort the index.
registrations_df = (
    registrations_df
    .reset_index()
    .loc[lambda df: df.StudyDomain.notnull()]
    .loc[lambda df: df.StudyDomain != "Programme Sciences humaines et sociales"]
    .drop(['PersonID', "StudyDomain", "SectionName", "PedagogicalCode", "CourseCode"], axis=1)
    .drop_duplicates()
    .set_index(['SubjectName', 'YearName'])
    .sort_index()
)

# Most registered courses first.
registrations = registrations_df.sort_values(by='Registration', ascending=False)

# Registrations for the latest year only (rendered by the notebook).
registrations.xs('2015-2016', level='YearName')

In [None]:
# Hand-picked blacklist of SubjectName values that should never be used as
# recommendations. Judging by the names, these are administrative entries
# (admission, cycle, blocks, groups, minors) and projects rather than
# regular courses.
courses_to_remove = [
    "Admission année sup.",
    "Projet de master en systèmes de communication",
    "SHS : Introduction au projet",
    "Cycle master",
    "Projet de Master",
    "Groupe Core courses & options",
    "Bloc Projets et SHS",
    "Groupe 2 : Options",
    "Master SC",
    "Mineur",
    "Groupe 1",
    "Projet en systèmes de communication II",
    "Cours réservés spécifiquement aux étudiants s'inscrivant pour le mineur Area and Cultural Studies",
    "SHS : Projet",
    "Optional project in communication systems",
]

### Creating the binary matrix

In [None]:
# One row per distinct (student, course) pair, moved into the index so that
# a single value column can be attached to it afterwards.
courses_matrix = (
    all_df[['PersonID', 'SubjectName']]
    .drop_duplicates()
    .set_index(['PersonID', 'SubjectName'])
)

def series_to_integers(series):
    """Return `series` converted to numeric values, downcast to an integer dtype.

    The conversion is delegated to `pd.to_numeric` with
    `downcast='integer'`; values that cannot be held by an integer dtype
    (e.g. 0.5) keep their floating-point dtype.
    """
    downcast_kind = 'integer'
    return pd.to_numeric(series, downcast=downcast_kind)

# Every (student, course) pair present in the index was taken: flag it with 1.
courses_matrix['joined'] = 1

# Spread the pairs into a student x course table, use 0 for the pairs that
# never occurred, and convert each column back to integers (the NaN holes
# produced by the pivot are what made a conversion necessary).
courses_matrix = (
    courses_matrix
    .reset_index()
    .pivot(index='PersonID', columns='SubjectName', values='joined')
    .fillna(0)
    .apply(series_to_integers)
)

### Splitting test/train

In [None]:
# Flatten the student x course matrix into one row per (course, student)
# pair with its 0/1 value in the `Taken` column; this long table is what
# gets split into train and test.
registrations_total = (
    courses_matrix
    .unstack()
    .reset_index()
    .rename(columns={0: "Taken"})
)
# Earlier manual split attempt, kept for reference:
#test_set = registrations_total.sample(frac=0.2, replace=False)
#train_set = registrations_total - test_set
# 80% / 20% random split of the pairs (no seed is set, so every run differs).
train, test = train_test_split(registrations_total, test_size=0.2)
print("We have {} rows in total".format(registrations_total.shape[0]))
print("Train: {} rows".format(train.shape[0]))
print("Test: {} rows".format(test.shape[0]))

In [None]:
# Rebuild the student x course matrix from the training pairs only. Pairs
# that went to the test set are missing after the pivot and become 0.
courses_matrix = (
    train
    .pivot(index='PersonID', columns='SubjectName', values="Taken")
    .fillna(0)
    .apply(series_to_integers)
)
# Rendered by the notebook.
courses_matrix

### Collaborative filtering through correlation matrix
We use the Jaccard score to compute the course-to-course similarity matrix, and then apply it to the binary enrolment matrix to predict good courses to take.

In [None]:
# Course-to-course similarity = 1 - Jaccard distance between the columns of
# the training matrix (one column per course, hence the transpose).
# NOTE(review): squareform fills the diagonal with zeros, so a course gets a
# similarity of 0 with itself here rather than 1 -- confirm this is intended.
jaccard_distances = pdist(courses_matrix.T, 'jaccard')
corr_courses_matrix = squareform(1 - jaccard_distances)

# Alternative that was tried: Pearson correlation
#corr_courses_matrix = np.corrcoef(courses_matrix.T)

# Course names, in the same order as the rows/columns of corr_courses_matrix.
course_index = courses_matrix.columns

def get_course_similarity(course):
    '''Return the row of `corr_courses_matrix` that belongs to `course`.

    `course` is looked up by name in `course_index`; a name that is not
    there raises ValueError.
    '''
    all_courses = course_index.tolist()
    position = all_courses.index(course)
    return corr_courses_matrix[position]

def get_course_recommendations(user_courses):
    '''Score every course the user has not taken against the courses they took.

    Args:
        user_courses: iterable of course names (entries of `course_index`)
            that the user has already taken.

    Returns:
        A DataFrame with the columns `course_title` and `sum_similarity`
        (for each candidate course, the sum of its similarities with the
        courses in `user_courses`), sorted by decreasing `sum_similarity`.
        The courses in `user_courses` themselves are left out: a course
        the user already has is not a recommendation.

    Raises:
        ValueError: if a name in `user_courses` is not in `course_index`.
    '''
    # Materialise once: the names are iterated below and reused by `isin`.
    user_courses = list(user_courses)
    course_similarities = np.zeros(corr_courses_matrix.shape[0])
    for course_id in user_courses:
        course_similarities += get_course_similarity(course_id)
    similarities_df = pd.DataFrame({
        'course_title': course_index,
        'sum_similarity': course_similarities
        })
    already_taken = similarities_df.course_title.isin(user_courses)
    return similarities_df[~already_taken].sort_values(by=['sum_similarity'], ascending=False)


def recommend_row(user_row):
    '''Return the recommendations for one row of the student x course matrix.

    Args:
        user_row: Series indexed by course name; a value greater than 0
            marks a course the student has taken.

    Returns:
        The DataFrame produced by `get_course_recommendations` for the
        courses taken in `user_row`.
    '''
    # Only the courses actually taken describe the student; passing every
    # column would give all students exactly the same scores.
    taken_courses = list(user_row[user_row > 0].index)
    return get_course_recommendations(taken_courses)


# Let's try it out for one user
#sample_user = 1801481982
sample_user = 1892490156
row = courses_matrix.loc[sample_user]
sample_user_courses = list(row[row > 0].index)
recommendations = recommend_row(row)

print("The user {} has the following courses: \n{}\
      \nso we recommend him to pick:"
      .format(sample_user,
              "\n- ".join(sample_user_courses)))
# Only accept recommendations from latest data courses (2015-2016)
accepted_recommendations = recommendations[recommendations.course_title.isin(current_courses_df.SubjectName)]
# Remove the blacklisted entries (projects, groups, minors, ...)
accepted_recommendations = accepted_recommendations[~accepted_recommendations.course_title.isin(courses_to_remove)]
# Drop rows whose score is missing
accepted_recommendations = accepted_recommendations.dropna()

# Normalize the scores by the highest summed similarity. `assign` builds a new
# frame instead of writing into a slice of `recommendations`.
# NOTE(review): if the highest score is 0 the division yields NaN/inf.
top_score = accepted_recommendations.sum_similarity.max()
accepted_recommendations = accepted_recommendations.assign(
    sum_similarity=accepted_recommendations.sum_similarity / top_score)
accepted_recommendations

# Code used to predict one course:
#favoured_course = 'Distributed information systems'
#favoured_course_index = list(course_index).index(favoured_course)
#P = corr_courses_matrix[favoured_course_index]

# List the courses with a high correlation with the favoured course
#print(list(course_index[(P>0.3) & (P<1.0)]))

In [None]:
# Trying Surprise with KNN
from collections import defaultdict
from surprise.dataset import Reader
from surprise import SVD, KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline, Dataset, evaluate, print_perf, accuracy

def get_top_n(predictions, n=10):
    '''Group predictions by user and keep each user's n best-rated items.

    Args:
        predictions: iterable of 5-element predictions
            (user id, item id, true rating, estimated rating, details),
            such as the list returned by the test method of an algorithm.
        n(int): how many items to keep per user. Default is 10.

    Returns:
        A defaultdict mapping each user (raw) id to a list of at most n
        tuples (raw item id, rating estimation), highest estimation first.
        Items with equal estimations keep the order they had in
        `predictions`.
    '''

    def estimated_rating(item_and_estimate):
        return item_and_estimate[1]

    # Bucket the (item, estimate) pairs by user.
    per_user = defaultdict(list)
    for prediction in predictions:
        uid, iid, _true_rating, est, _details = prediction
        per_user[uid].append((iid, est))

    # Rank each bucket by estimate and truncate it to the n best entries.
    for uid in list(per_user):
        ranked = sorted(per_user[uid], key=estimated_rating, reverse=True)
        per_user[uid] = ranked[:n]

    return per_user

def k_fold(data, algorithm):
    '''Train and test `algorithm` on every fold of `data`.

    Args:
        data: dataset whose `folds()` method yields (trainset, testset) pairs.
        algorithm: object with `train(trainset)` and `test(testset)` methods.

    Returns:
        A list holding one RMSE per fold, in fold order.
    '''
    fold_rmses = []
    for fold_train, fold_test in data.folds():
        # Imported inside the loop, right where the capture is needed.
        from IPython.utils.io import capture_output
        # Swallow whatever the training step prints.
        with capture_output():
            algorithm.train(fold_train)
        fold_predictions = algorithm.test(fold_test)
        fold_rmses.append(accuracy.rmse(fold_predictions, verbose=False))
    return fold_rmses

# Drop the rows of the blacklisted entries (projects, groups, minors, ...).
# Selecting rows builds a new frame: registrations_total is left untouched
# and no row is kept with a missing (NaN) SubjectName.
used_data = registrations_total[~registrations_total.SubjectName.isin(courses_to_remove)]

# A reader is still needed, but only the rating_scale param is required.
reader = Reader(rating_scale=(0, 1))
# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(used_data[['PersonID', 'SubjectName', 'Taken']], reader)

In [None]:
import matplotlib.pyplot as plt 

# TODO: find a way to silence the output printed during training and testing
# (do `train` and `test` accept verbose=False?)
def try_knn(KNN_algo, sim_options, data, k_values=None):
    '''Cross-validate a KNN algorithm for several numbers of neighbours.

    Args:
        KNN_algo: algorithm class, instantiated as
            `KNN_algo(k=k, sim_options=sim_options)` for every k.
        sim_options: similarity options handed to every instance.
        data: dataset handed to `k_fold`.
        k_values: iterable of the k values to try. Defaults to every
            integer from 10 to 59, the range that used to be hard-coded.

    Returns:
        A list of (k, mean RMSE over the folds) tuples, in the order the
        k values were tried.
    '''
    if k_values is None:
        k_values = np.arange(10, 60)
    else:
        # Materialise so the length is known and generators are accepted.
        k_values = list(k_values)
    total = len(k_values)

    print("training: 0.00%", end='\r')
    results = []
    # Progress follows the position in k_values, so it stays correct for
    # ranges that are not contiguous or do not start at 10.
    for done, k in enumerate(k_values, start=1):
        algo = KNN_algo(k=k, sim_options=sim_options)
        errors = k_fold(data, algo)
        results.append((k, np.mean(errors)))
        print("training: {:.2f}%".format(done / total * 100), end='\r')
    print("\ndone.")
    return results

In [None]:
# Similarity settings passed to the KNN algorithms: Pearson-baseline
# similarity, and user_based=True, i.e. similarities are computed between
# users (False would compute them between items).
sim_options = dict(name='pearson_baseline', user_based=True)

# Sweep k with KNNBasic and plot the mean RMSE against k.
knn_basic_results = try_knn(KNNBasic, sim_options, data)
print("RMSE for KNN Basic:")
tested_k, mean_rmse = zip(*knn_basic_results)
plt.plot(tested_k, mean_rmse)
plt.show()

# Rank the (k, rmse) pairs by RMSE; the first one has the lowest error.
ranked_by_rmse = sorted(knn_basic_results, key=lambda pair: pair[1])
best_rmse_tuple = ranked_by_rmse[0]
print("The best RMSE is: {} for k = {}".format(best_rmse_tuple[1], best_rmse_tuple[0]))

In [None]:
# Sweep k with KNNWithMeans and plot the mean RMSE against k.
knn_means_results = try_knn(KNNWithMeans, sim_options, data)
print("RMSE for KNN with Means:")
tested_k, mean_rmse = zip(*knn_means_results)
plt.plot(tested_k, mean_rmse)
plt.show()

# Rank the (k, rmse) pairs by RMSE; the first one has the lowest error.
ranked_by_rmse = sorted(knn_means_results, key=lambda pair: pair[1])
best_rmse_tuple = ranked_by_rmse[0]
print("The best RMSE is: {} for k = {}".format(best_rmse_tuple[1], best_rmse_tuple[0]))

In [None]:
# Sweep k with KNNBaseline and plot the mean RMSE against k.
knn_baseline_results = try_knn(KNNBaseline, sim_options, data)
print("RMSE for KNN Baseline:")
tested_k, mean_rmse = zip(*knn_baseline_results)
plt.plot(tested_k, mean_rmse)
plt.show()

# Rank the (k, rmse) pairs by RMSE; the first one has the lowest error.
ranked_by_rmse = sorted(knn_baseline_results, key=lambda pair: pair[1])
best_rmse_tuple = ranked_by_rmse[0]
print("The best RMSE is: {} for k = {}".format(best_rmse_tuple[1], best_rmse_tuple[0]))

In [None]:
# Sweep k with KNNWithZScore and plot the mean RMSE against k.
knn_zscore_results = try_knn(KNNWithZScore, sim_options, data)
print("RMSE for KNN Z Score:")
tested_k, mean_rmse = zip(*knn_zscore_results)
plt.plot(tested_k, mean_rmse)
plt.show()

# Rank the (k, rmse) pairs by RMSE; the first one has the lowest error.
ranked_by_rmse = sorted(knn_zscore_results, key=lambda pair: pair[1])
best_rmse_tuple = ranked_by_rmse[0]
print("The best RMSE is: {} for k = {}".format(best_rmse_tuple[1], best_rmse_tuple[0]))

In [None]:
# Best configuration so far: KNNBaseline with k=22.
# NOTE(review): k=22 is presumably the best k found by the KNNBaseline sweep -- confirm.
algo = KNNBaseline(k=22, sim_options=sim_options)
# Train on the whole dataset rather than on cross-validation folds.
trainset = data.build_full_trainset()
algo.train(trainset)
# The test set is built from the training set itself, so the predictions
# below are made for (user, course) pairs the model was trained on.
testset = trainset.build_testset()
predictions = algo.test(testset)

# Keep the 10 highest estimated ratings of every user.
top_n = get_top_n(predictions, n=10)

# Print the recommended items for each user
for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])

# Evaluate performances of our algorithm on the dataset.
#perf = evaluate(algo, data, measures=['RMSE', 'MAE'])
#print_perf(perf)

In [None]:
# Manual check for one student: the courses flagged as taken in the training
# matrix, next to a hand-copied list of predicted items for that student.
inspected_row = courses_matrix.loc[946926890]
taken_by_inspected = list(inspected_row[inspected_row == 1].index)
print("Taken courses: ", "\n- ".join(taken_by_inspected))
print("predictions: ", ['Pattern classification and machine learning', 'TCP/IP networking', 'Mobile networks', 'Mineur : Management, technologie et entrepreneuriat', 'Cryptography and security', np.nan, np.nan, np.nan, np.nan, np.nan])

In [None]:
# Cross-validate Surprise's NMF algorithm (biased variant, verbose output on).
from surprise import NMF
algo = NMF(biased=True, verbose=True)
# One RMSE per fold, as returned by k_fold.
errors = k_fold(data, algo)
# Last expression of the cell: the notebook displays the mean RMSE over the folds.
np.mean(errors)

In [None]:
# Cross-validate Surprise's SlopeOne algorithm with its default settings.
from surprise import SlopeOne
algo = SlopeOne()
# One RMSE per fold, as returned by k_fold.
errors = k_fold(data, algo)
# Last expression of the cell: the notebook displays the mean RMSE over the folds.
np.mean(errors)

In [None]:
# Cross-validate Surprise's CoClustering algorithm. Going by the parameter
# names: 1 user cluster, 8 item clusters, 50 epochs, verbose output on.
from surprise import CoClustering
algo = CoClustering(n_cltr_u=1, n_cltr_i=8, n_epochs=50, verbose=True)
# One RMSE per fold, as returned by k_fold.
errors = k_fold(data, algo)
# Last expression of the cell: the notebook displays the mean RMSE over the folds.
np.mean(errors)

# Try out the following:
Jensen–Shannon divergence

Winner-take-all neural networks

Self-organizing maps

In [None]:
# Trying the Netflixprize solutions

In [None]:
# Find other ones

In [None]:
# Feature Engineering