# Exploration done on the EPFL recommender system

In [None]:
from IPython.core.display import display, HTML
from scipy.spatial.distance import pdist, squareform
from sklearn.model_selection import train_test_split
import configparser
import mysql.connector as sql
import pandas as pd
import numpy as np
%config InlineBackend.figure_format = 'retina'

# Read the confidential database credentials from a local ini file
# (kept out of the notebook so that no secret is committed).
credentials = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the
# list of files it actually parsed, so fail early with a clear message
# instead of an obscure NoSectionError further down.
if not credentials.read('credentials.ini'):
    raise FileNotFoundError(
        "credentials.ini not found or unreadable: it must provide a [mysql] "
        "section with 'url', 'username' and 'password'")
db_connection = sql.connect(host=credentials.get('mysql', 'url'),
                            database='semester_project_romain',
                            user=credentials.get('mysql', 'username'),
                            password=credentials.get('mysql', 'password'))

In [None]:
# Found courses that should be removed:
# SubjectName values that are filtered out of every query result below
# (rows whose SubjectName is in this list are dropped).
# NOTE(review): these look like administrative entries (projects, groups,
# minors, admission records) rather than real courses - confirm.
courses_to_remove = [
    "Admission année sup.",
    "Projet de master en systèmes de communication",
    "SHS : Introduction au projet",
    "Cycle master",
    "Projet de Master",
    "Groupe Core courses & options",
    "Bloc Projets et SHS",
    "Groupe 2 : Options",
    "Master SC",
    "Mineur",
    "Groupe 1",
    "Projet en systèmes de communication II",
    "Cours réservés spécifiquement aux étudiants s'inscrivant pour le mineur Area and Cultural Studies",
    "SHS : Projet",
    "Optional project in communication systems",
    "Mineur : Neurosciences computationnelles",
    "Stage d'ingénieur crédité avec le PDM (master en Systèmes de communication)",
]

# StudyDomain values that are filtered out of every query result below
# (the humanities and social sciences programme, in English and French).
domains_to_remove = [
    "Humanities and social sciences",
    "Programme Sciences humaines et sociales",
]

In [None]:
# PlanType = "PLAN_EXAMINE" ?????
# Distinct Master-level enrolment rows, academic years 2010-2011 to 2015-2016,
# for units whose name matches "%ommunication%".
all_info = """
            select distinct 
                PersonID, 
                PedagogicalCode, 
                StudyDomain, 
                UnitName, 
                UnitID,
                SubjectName, 
                SubjectID,
                SectionName, 
                YearName,
                CourseCode
            from 
                course_enrolments_with_info 
            where 
                UnitName like "%ommunication%" 
                and 
                LevelName = "Master"
                and (YearName = "2010-2011"
                or YearName = "2011-2012"
                or YearName = "2012-2013"
                or YearName = "2013-2014"
                or YearName = "2014-2015"
                or YearName = "2015-2016")
            """
all_df = pd.read_sql(all_info, con=db_connection)
# One pass: drop the blacklisted subjects and the SHS study domains.
is_excluded = (all_df.SubjectName.isin(courses_to_remove)
               | all_df.StudyDomain.isin(domains_to_remove))
all_df = all_df[~is_excluded]
all_df

In [None]:
# Courses of the latest academic year in the data (2015-2016) whose
# PedagogicalCode starts with "MA"; used later to restrict recommendations.
current_courses = """
            select distinct 
                PedagogicalCode, 
                SubjectName, 
                SubjectID,
                StudyDomain,
                YearName
            from 
                course_enrolments_with_info 
            where 
                UnitName like "%ommunication%" 
                and 
                LevelName = "Master"
                and left(PedagogicalCode, 2) = "MA"
                and YearName = "2015-2016"
            """
current_courses_df = pd.read_sql(current_courses, con=db_connection)
# These are the current courses (latest data) given in syscom @EPFL:
# apply the same subject / study-domain exclusions as for the full dataset.
is_excluded_course = (current_courses_df.SubjectName.isin(courses_to_remove)
                      | current_courses_df.StudyDomain.isin(domains_to_remove))
current_courses_df = current_courses_df[~is_excluded_course]
current_courses_df

### Most taken courses
We need to find a way to get a cleaner dataset of courses: many of them are not useful or are outdated, and should not be recommended.

In [None]:
# Taking a look at the most taken courses.
# Number of distinct students per (course, academic year). `all_df` is
# distinct over unit / section / code columns as well, so a student can
# presumably appear on several rows for the same course and year; counting
# rows (groupby().size()) would then overstate the enrolment, hence nunique().
all_df_registrations = (
    all_df.groupby(['SubjectName', 'YearName'])['PersonID']
    .nunique()
    .rename('Registration')
)

# Keep only rows that have a study domain (drops entries such as projects,
# groups and minors) and that are not in one of the excluded (SHS) domains.
has_valid_domain = (all_df.StudyDomain.notnull()
                    & ~all_df.StudyDomain.isin(domains_to_remove))
registrations_df = all_df[has_valid_domain]
# Remove per-student / non-essential columns, then collapse to one row per
# remaining combination before attaching the enrolment count.
registrations_df = registrations_df.drop([
    'PersonID', "StudyDomain", "SectionName", "PedagogicalCode",
    "CourseCode"], axis=1)
registrations_df = registrations_df.drop_duplicates()
registrations_df = registrations_df.join(all_df_registrations,
                                         on=['SubjectName', 'YearName'])
registrations_df = registrations_df.set_index(['SubjectName', 'YearName']).sort_index()
registrations = registrations_df.sort_values(ascending=False, by='Registration')

# Latest data registrations
registrations.xs('2015-2016', level='YearName')

### Creating the binary matrix

In [None]:
# Unique (student, course) pairs, indexed by both keys.
courses_matrix = (
    all_df[['PersonID', 'SubjectName']]
    .drop_duplicates()
    .set_index(['PersonID', 'SubjectName'])
)

def series_to_integers(series):
    """Return `series` converted to numeric and downcast to the smallest
    integer dtype that pandas.to_numeric can use for its values."""
    converted = pd.to_numeric(series, downcast='integer')
    return converted

# Binary student x course matrix: 1 where the pair exists (course taken),
# 0 everywhere else, stored with a compact integer dtype.
courses_matrix = (
    courses_matrix
    .assign(joined=1)
    .reset_index()
    .pivot(index='PersonID', columns='SubjectName', values='joined')
    .fillna(0)
    .apply(series_to_integers)
)
courses_matrix

### Splitting test/train

In [None]:
# Total matrix in long format (one row per student/course cell); it is split
# into train and test sets. Run this cell on the full binary matrix: a later
# cell rebinds `courses_matrix` to the training-only matrix.
SPLIT_SEED = 42  # fixed seed so that the split is reproducible across runs
registrations_total = (
    courses_matrix.unstack()
    .reset_index()
    .rename(columns={0: "Taken"})
)
train, test = train_test_split(registrations_total, test_size=0.2,
                               random_state=SPLIT_SEED)
print("We have {} rows in total".format(len(registrations_total)))
print("Train: {} rows".format(len(train)))
print("Test: {} rows".format(len(test)))

In [None]:
# Back to a student x course matrix, built from the training rows only.
# Cells that are absent from `train` are filled with 0.
courses_matrix = (
    train
    .pivot(index='PersonID', columns='SubjectName', values="Taken")
    .fillna(0)
    .apply(series_to_integers)
)
courses_matrix

### Collaborative filtering through correlation matrix
We compute a course-to-course similarity matrix (Jensen-Shannon similarity in the active code; Jaccard and Pearson alternatives are left commented out) and then apply it to the binary matrix to predict good courses to take.

#### Collaborative filtering with a course similarity matrix (Jensen-Shannon / Jaccard)

In [None]:
from scipy.stats import entropy
from numpy.linalg import norm
import numpy as np

def JSD(P, Q):
    """Jensen-Shannon divergence between two non-negative vectors.

    Both inputs are first scaled by their L1 norm; the result is the average
    of the two relative entropies (scipy.stats.entropy, natural logarithm)
    of each scaled vector with respect to their midpoint.
    """
    p_hat = P / norm(P, ord=1)
    q_hat = Q / norm(Q, ord=1)
    midpoint = 0.5 * (p_hat + q_hat)
    divergence_p = entropy(p_hat, midpoint)
    divergence_q = entropy(q_hat, midpoint)
    return 0.5 * (divergence_p + divergence_q)

# Using the Jensen-Shannon similarity.
# One row per course (columns of the student x course matrix), extracted once
# as a plain array instead of re-transposing the frame on every iteration.
course_vectors = courses_matrix.T.values
n_courses = course_vectors.shape[0]

# Fill a NumPy array with direct [i, j] writes: the previous chained
# `.iloc[i].iloc[j] = value` assigns into a temporary row object, which is not
# guaranteed to write back into the frame (and never does with copy-on-write).
jsd_values = np.zeros((n_courses, n_courses))
# A course that nobody took has a zero L1 norm, which yields NaN; silence the
# corresponding floating-point warnings and keep the NaN in the result.
with np.errstate(divide='ignore', invalid='ignore'):
    for i in range(n_courses):
        print("Computing similarity: {:.2f}%".format(100*i/n_courses), end="\r")
        # The divergence is symmetric, so only the upper triangle is computed.
        for j in range(i, n_courses):
            jsd_values[i, j] = jsd_values[j, i] = JSD(course_vectors[i], course_vectors[j])

# Same layout as before: a frame with positional (0..n-1) row and column labels.
corr_courses_matrix = pd.DataFrame(jsd_values)
# Turn the divergence into a similarity (infinite divergences count as 1).
corr_courses_matrix = 1 - corr_courses_matrix.replace([np.inf, -np.inf], 1)

print(corr_courses_matrix.shape)
print(corr_courses_matrix)

# Using Jaccard distance
#corr_courses_matrix = squareform(1 - pdist(courses_matrix.T, 'jaccard'))

# Using Pearson correlation
#corr_courses_matrix = np.corrcoef(courses_matrix.T)  

#corr_courses_matrix = squareform(pdist(courses_matrix.T, lambda x: JSD))

In [None]:
course_index = courses_matrix.columns

def get_course_similarity(course):
    '''Returns the similarity vector of `course` against every course.

    The position of `course` in `course_index` selects the matching entry of
    `corr_courses_matrix`. Raises KeyError when the course is unknown.
    '''
    course_idx = course_index.get_loc(course)
    return corr_courses_matrix[course_idx]

def get_course_recommendations(user_courses):
    '''Given the courses a user took, score all the other courses.

    The score of a course is the sum of its similarities with every course in
    `user_courses`. Courses in `user_courses` are excluded from the result
    (recommending something already taken is pointless).

    Returns a DataFrame with columns `course_title` and `sum_similarity`,
    sorted by decreasing `sum_similarity`.
    '''
    user_courses = list(user_courses)
    course_similarities = np.zeros(corr_courses_matrix.shape[0])
    for course_id in user_courses:
        course_similarities = course_similarities + get_course_similarity(course_id)
    similarities_df = pd.DataFrame({
        'course_title': course_index,
        # Plain array: rows are positional, in the same order as course_index.
        'sum_similarity': np.asarray(course_similarities)
        })
    similarities_df = similarities_df[~similarities_df.course_title.isin(user_courses)]
    similarities_df = similarities_df.sort_values(by=['sum_similarity'], ascending=False)
    return similarities_df

def recommend_row(user_row):
    '''Recommend courses for one row of the binary student x course matrix.

    Only the courses actually taken (value > 0) drive the recommendation;
    passing every column would give the same scores to every student.
    '''
    taken_courses = list(user_row[user_row > 0].index)
    return get_course_recommendations(taken_courses)

# Let's try it out for a sample user
#sample_user = 1801481982
sample_user = 1892490156

row = courses_matrix.loc[sample_user]
user_courses_as_list = list(row[row > 0].index)
recommendations = recommend_row(row)

print("The user {} has the following courses: \n- {}\
      \nso we recommend him to pick:"
      .format(sample_user,
              "\n- ".join(user_courses_as_list)))
# Only accept recommendations from latest data courses (2015-2016)
accepted_recommendations = recommendations[recommendations.course_title.isin(current_courses_df.SubjectName)]
# Removing courses without a usable score (NaN similarity) and, defensively,
# the courses that the user already took.
accepted_recommendations = accepted_recommendations.dropna()
accepted_recommendations = accepted_recommendations[~accepted_recommendations.course_title.isin(user_courses_as_list)]

# Normalizing the results by dividing by the maximum of the summed similarities
# (assign() returns a new frame instead of writing into a filtered slice).
accepted_recommendations = accepted_recommendations.assign(
    sum_similarity=accepted_recommendations.sum_similarity
    / accepted_recommendations.sum_similarity.max())
accepted_recommendations

# Code used to predict one course:
#favoured_course = 'Distributed information systems'
#favoured_course_index = list(courses_index).index(favoured_course)
#P = corr_courses_matrix[favoured_course_index]

# list the courses with a high correlation with the favoured course
#print(list(courses_index[(P>0.3) & (P<1.0)])  

#### Collaborative filtering with KNN

In [None]:
# Trying Surprise with KNN
from collections import defaultdict
from surprise.dataset import Reader
from surprise import SVD, KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline, Dataset, evaluate, print_perf, accuracy

def get_top_n(predictions, n=10):
    '''Group predictions by user and keep each user's n best estimates.

    Args:
        predictions: iterable of 5-item prediction tuples
            (user id, item id, true rating, estimated rating, details), as
            returned by the test method of an algorithm.
        n(int): maximum number of (item, estimate) pairs kept per user.
            Default is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size at most n, ordered by
        decreasing estimation.
    '''
    per_user = defaultdict(list)

    # Collect the (item, estimate) pairs of every user.
    for prediction in predictions:
        uid, iid, _true_r, est, _details = prediction
        per_user[uid].append((iid, est))

    # Rank each user's pairs by estimate and truncate to the n highest ones.
    for uid in per_user:
        ranked = sorted(per_user[uid], key=lambda pair: pair[1], reverse=True)
        per_user[uid] = ranked[:n]

    return per_user

def k_fold(data, algorithm):
    """Train and test `algorithm` on every fold of `data`.

    Returns the list of RMSE values, one per fold. Output produced while
    training is captured so that the notebook stays quiet.

    NOTE(review): `data.folds()` and `algorithm.train()` look like the API of
    older Surprise releases - confirm against the installed version.
    """
    from IPython.utils import io

    fold_rmse = []
    for trainset, testset in data.folds():
        # Silence the training output only; testing runs uncaptured.
        with io.capture_output():
            algorithm.train(trainset)
        fold_predictions = algorithm.test(testset)
        fold_rmse.append(accuracy.rmse(fold_predictions, verbose=False))
    return fold_rmse

# The full long-format table (every student/course cell) is used as is.
used_data = registrations_total

# Ratings are binary (0 = not taken, 1 = taken); the Reader only needs the
# rating scale.
reader = Reader(rating_scale=(0, 1))
# Surprise expects the columns in the order: user id, item id, rating.
rating_columns = ['PersonID', 'SubjectName', 'Taken']
data = Dataset.load_from_df(used_data[rating_columns], reader)

In [None]:
import matplotlib.pyplot as plt 

# how to make it shut up ?
# test and train with verbose=False ?
def try_knn(KNN_algo, sim_options, data, k_values=None):
    """Cross-validate a KNN algorithm for several neighbourhood sizes.

    Args:
        KNN_algo: algorithm class, instantiated as
            `KNN_algo(k=k, sim_options=sim_options)`.
        sim_options: similarity options forwarded to the algorithm.
        data: dataset handed to `k_fold`.
        k_values: iterable of k values to try. Defaults to 10..59, the range
            that was previously hard-coded.

    Returns:
        A list of `(k, mean RMSE over the folds)` tuples, in the order of
        `k_values`.
    """
    if k_values is None:
        k_values = np.arange(10, 60)
    k_values = list(k_values)

    print("training: 0.00%", end='\r')
    results = []
    # Progress is based on the position in k_values, so it stays correct for
    # ranges that are not made of consecutive integers.
    for done, k in enumerate(k_values, start=1):
        algo = KNN_algo(k=k, sim_options=sim_options)
        errors = k_fold(data, algo)
        results.append((k, np.mean(errors)))
        print("training: {:.2f}%".format(done / len(k_values) * 100), end='\r')
    print("\ndone.")
    return results

In [None]:
sim_options = {'name': 'pearson_baseline',
               'user_based': True  # similarities are computed between users (students), not items
               }

# K-fold to find the best k using KNNBaseline
knn_baseline_results = try_knn(KNNBaseline, sim_options, data)
print("RMSE for KNN Baseline:")
fig, ax = plt.subplots()
ax.plot(*zip(*knn_baseline_results))
# Label the figure so that it can be read on its own.
ax.set(xlabel="k (number of neighbours)",
       ylabel="mean RMSE over the folds",
       title="KNNBaseline")
plt.show()

# min() returns the first tuple with the lowest RMSE, like sorted(...)[0] did,
# without sorting the whole list.
best_rmse_tuple = min(knn_baseline_results, key=lambda x: x[1])
print("The best RMSE is: {} for k = {}".format(best_rmse_tuple[1], best_rmse_tuple[0]))

In [None]:
# 'user_based': True -> similarities are computed between users.
sim_options = dict(name='pearson_baseline', user_based=True)
# Best one yet, after doing a k-fold on all of the other models: k=22
algo = KNNBaseline(k=22, sim_options=sim_options)
# Fit on the complete dataset, then predict the very same (user, item) pairs.
trainset = data.build_full_trainset()
algo.train(trainset)
testset = trainset.build_testset()
predictions = algo.test(testset)

top_n = get_top_n(predictions, n=10)

# Print the recommended items for each user
for uid in top_n:
    recommended = [iid for (iid, _) in top_n[uid]]
    print(uid, recommended)

In [None]:
# Evaluate performances of our algorithm on the dataset (RMSE and MAE),
# using whatever `algo` is currently bound to.
# NOTE(review): `evaluate` / `print_perf` appear to belong to older Surprise
# releases - confirm against the installed version.
perf = evaluate(algo, data, measures=['RMSE', 'MAE'])
print_perf(perf)

In [None]:
# Courses flagged as taken (value 1) by one hard-coded student in the
# training matrix, next to a hard-coded list of predictions.
student_row = courses_matrix.loc[946926890]
taken_titles = list(student_row[student_row == 1].index)
print("Taken courses: ", "\n- ".join(taken_titles))
hardcoded_predictions = ['Pattern classification and machine learning', 'TCP/IP networking', 'Mobile networks', 'Mineur : Management, technologie et entrepreneuriat', 'Cryptography and security', np.nan, np.nan, np.nan, np.nan, np.nan]
print("predictions: ", hardcoded_predictions)

#### Collaborative filtering with NMF

In [None]:
# Trying with NMF (biased variant, verbose output enabled).
# k_fold returns one RMSE per fold; the cell displays their mean.
# Note that this rebinds `algo`, which the KNN evaluation cell also uses.
from surprise import NMF
algo = NMF(biased=True, verbose=True)
errors = k_fold(data, algo)
np.mean(errors)

#### Collaborative filtering with SlopeOne

In [None]:
# Trying with SlopeOne (default parameters).
# k_fold returns one RMSE per fold; the cell displays their mean.
# Note that this rebinds `algo`, which the KNN evaluation cell also uses.
from surprise import SlopeOne
algo = SlopeOne()
errors = k_fold(data, algo)
np.mean(errors)

#### Collaborative filtering with CoClustering

In [None]:
# Trying with co-clustering: 1 user cluster, 8 item clusters, 50 epochs,
# verbose output enabled.
# k_fold returns one RMSE per fold; the cell displays their mean.
# Note that this rebinds `algo`, which the KNN evaluation cell also uses.
from surprise import CoClustering
algo = CoClustering(n_cltr_u=1, n_cltr_i=8, n_epochs=50, verbose=True)
errors = k_fold(data, algo)
np.mean(errors)

#### Collaborative filtering with Collaborative Denoising Auto-Encoders

In [None]:
from keras.layers import Input, Dense, Embedding, Flatten, Dropout, merge, Activation
from keras.models import Model
from keras.regularizers import l2

def create(I, U, K, hidden_activation, output_activation, q=0.5, l=0.01):
    '''
    Build a Collaborative Denoising Auto-Encoder (CDAE).

    Reference:
      Yao Wu, Christopher DuBois, Alice X. Zheng, Martin Ester.
        Collaborative Denoising Auto-Encoders for Top-N Recommender Systems.
          The 9th ACM International Conference on Web Search and Data Mining (WSDM'16), p153--162, 2016.

    NOTE(review): the keyword arguments used below (W_regularizer,
    b_regularizer, merge(mode=...), Model(input=..., output=...)) look like
    the Keras 1 API - confirm against the installed version.

    :param I: number of items
    :param U: number of users
    :param K: number of units in hidden layer
    :param hidden_activation: activation function of hidden layer
        (no activation layer is added when it is falsy)
    :param output_activation: activation function of output layer
    :param q: drop probability
    :param l: regularization parameter of L2 regularization
    :return: CDAE
    :rtype: keras.models.Model
    '''
    # Item branch: dropout on the item vector, then an L2-regularised
    # projection to K units.
    item_input = Input((I,), name='x_item')
    item_dropped = Dropout(q)(item_input)
    item_hidden = Dense(K, W_regularizer=l2(l), b_regularizer=l2(l))(item_dropped)

    # User branch: a K-dimensional embedding of the user index.
    # dtype should be int to connect to Embedding layer
    user_input = Input((1,), dtype='int32', name='x_user')
    user_embedded = Embedding(input_dim=U, output_dim=K, input_length=1, W_regularizer=l2(l))(user_input)
    user_hidden = Flatten()(user_embedded)

    # Both branches are summed, optionally activated, then decoded back to
    # one output per item.
    hidden = merge([item_hidden, user_hidden], mode='sum')
    if hidden_activation:
        hidden = Activation(hidden_activation)(hidden)
    output = Dense(I, activation=output_activation)(hidden)

    return Model(input=[item_input, user_input], output=output)

def success_rate(pred, true):
    """Percentage of rows for which at least one predicted index is a hit.

    `pred[i]` holds candidate column indices for row i; a hit is an index
    whose entry in `true[i]` equals 1.
    """
    n_rows = pred.shape[0]
    hits = sum(
        1
        for row in range(n_rows)
        if np.intersect1d(pred[row], np.where(true[row] == 1)).size > 0
    )
    return hits * 100 / n_rows

HOLDOUT_SEED = 42       # fixed seed so that the hold-out is reproducible
HOLDOUT_FRACTION = 0.2  # share of the matrix cells moved to the test matrix

# Users are identified by their row position in the matrix; train and test
# matrices cover the same users.
train_users = np.arange(courses_matrix.shape[0])
test_users = np.arange(courses_matrix.shape[0])

# Plain integer array of the student x course matrix (`.values` replaces
# `.as_matrix()`, which no longer exists in recent pandas).
course_values = courses_matrix.values.astype(int)

# Draw exactly HOLDOUT_FRACTION of the cells, without replacement.
holdout_rng = np.random.RandomState(HOLDOUT_SEED)
n_cells = course_values.size
held_out_cells = holdout_rng.choice(
    n_cells, size=int(round(HOLDOUT_FRACTION * n_cells)), replace=False)
held_out = np.zeros(n_cells, dtype=bool)
held_out[held_out_cells] = True
held_out = held_out.reshape(course_values.shape)

# Held-out cells keep their value in test_x and are zeroed in train_x; all
# the other cells are 0 in test_x. Both arrays keep the row/column order of
# courses_matrix, so positions still map to its index and columns.
test_x = np.where(held_out, course_values, 0)
train_x = np.where(held_out, 0, course_values)


In [None]:
# User indices as (n_users, 1) int32 columns, the shape of the `x_user` input.
train_x_users = np.array(train_users, dtype=np.int32).reshape(-1, 1)
test_x_users = np.array(test_users, dtype=np.int32).reshape(-1, 1)

# model
n_items = train_x.shape[1]
n_user_slots = len(train_users) + 1
model = create(I=n_items, U=n_user_slots, K=50,
               hidden_activation='relu', output_activation='sigmoid',
               q=0.50, l=0.01)
model.compile(loss='mean_absolute_error', optimizer='adam')
model.summary()

# train: the network reconstructs train_x from train_x and the user index;
# the held-out matrix is passed as validation data.
validation_inputs = [test_x, test_x_users]
history = model.fit(x=[train_x, train_x_users], y=train_x,
                    batch_size=128, nb_epoch=1000, verbose=1,
                    validation_data=[validation_inputs, test_x])

In [None]:
# predict
user_column = np.array(train_users, dtype=np.int32).reshape(-1, 1)
raw_scores = model.predict(x=[train_x, user_column])
# Zero the score of every cell that is already non-zero in train_x, so that
# only unseen items can rank high.
unseen_scores = raw_scores * (train_x == 0)
# Column indices per user by increasing score: the best ones are at the end.
pred = np.argsort(unseen_scores)

for n in range(1, 11):
    top_n_indices = pred[:, -n:]
    sr = success_rate(top_n_indices, test_x)
    print("Success Rate at {:d}: {:f}".format(n, sr))

Next steps: hold out whole students rather than individual (student, course) pairs and test on them after training; remove k courses from a student and see which ones pop up in the recommendations. Test on other domains. Try with all the data. Compute the F1 score. Plot precision and recall. Read Boi Faltings' papers on top-k recommendations. Build a co-enrolment matrix, and weight the output probabilities by the student's chance of taking a course (e.g. obligatory courses).

In [None]:
# Long-format view of the matrix: one row per (course, student) cell.
flatten = courses_matrix.unstack().reset_index().rename(columns={0: "Taken"})

# Courses taken by one hard-coded student.
RANDOM_USER_ID = 974912207
took_course = flatten.Taken == 1
is_random_user = flatten.PersonID == RANDOM_USER_ID
random_user = flatten[took_course & is_random_user]

# The student's row position selects the last 15 ranked column indices in
# `pred`, which are then mapped back to course titles.
random_user_id = courses_matrix.index.get_loc(key=RANDOM_USER_ID)
random_user_predictions = pred[random_user_id, -15:]
predicted_courses = [courses_matrix.columns[position]
                     for position in random_user_predictions]

# Remove courses that were not given in the last year
last_year_courses = list(registrations.xs('2015-2016', level='YearName').index)
predicted_courses = [title for title in predicted_courses
                     if title in last_year_courses]

# The list is reversed so that the last (highest-ranked) entries come first.
message = "The random user picked the following courses: \n{} \nHence we propose the following: {}"
print(message.format(random_user.SubjectName, predicted_courses[::-1]))

In [None]:
# Trying the Netflixprize solutions

In [None]:
# Find other ones

In [None]:
# Feature Engineering