# Exploration done on the EPFL recommender system

# Data Retrieval

In [None]:
import configparser
import matplotlib.pyplot as plt
import mysql.connector as sql
import numpy as np
import pandas as pd
import seaborn as sns
%config InlineBackend.figure_format = 'retina'

# Load the database credentials from a local INI file. The file has to provide
# a [mysql] section with the `url`, `username` and `password` options, since
# those are the values read below.
credentials = configparser.ConfigParser()
# NOTE: ConfigParser.read() silently skips files it cannot open, so a missing
# credentials.ini only surfaces as an error on the .get() calls below.
credentials.read('credentials.ini')
# Connection handed to the pd.read_sql() calls further down.
db_connection = sql.connect(host=credentials.get('mysql', 'url'),
                            database='semester_project_romain',
                            user=credentials.get('mysql', 'username'),
                            password=credentials.get('mysql', 'password'))

In [None]:
# Subject names that are filtered out of the enrolment data with
# `SubjectName.isin(courses_to_remove)`. Judging by their names these are
# mostly master projects, internships, course groups/blocks and minors
# rather than regular courses.
# Exploration note: for "Génie nucléaire" and "Ingénierie financière" the
# recommendations are of little use because most courses are compulsory.
# A few names appear twice (e.g. "Projet de master en mathématiques"); this
# is harmless for the `isin` membership test.
courses_to_remove = [
    "Admission année sup.",
    "Projet de master en systèmes de communication",
    "SHS : Introduction au projet",
    "Cycle master",
    "Projet de Master",
    "Groupe Core courses & options",
    "Bloc Projets et SHS",
    "Groupe 2 : Options",
    "Master SC",
    "Mineur",
    "Groupe 1",
    "Projet en systèmes de communication II",
    "Projet en informatique II",
    "Projet de master en informatique",
    "Cours réservés spécifiquement aux étudiants s'inscrivant pour le mineur Area and Cultural Studies",
    "SHS : Projet",
    "Optional project in communication systems",
    "Optional project in computer science",
    "Mineur : Neurosciences computationnelles",
    "Stage d'ingénieur crédité avec le PDM (master en Systèmes de communication)",
    "Stage d'ingénieur crédité avec le PDM (master en Informatique)",
    "Cours UNIL - Faculté des hautes études commerciales HEC I (printemps)",
    "Chemical engineering of heterogenous reactions",
    "Process development I",
    "Chemical engineering lab & project",
    "Stage d'ingénieur (master en Génie chimique et Biotechnologie)",
    "Projet de master en génie chimique et biotechnologie",
    "Interdisciplinary project",
    "Projet de master en chimie moléculaire et biologique",
    "Project in molecular sciences",
    "Superstudio",
    "Enoncé théorique de master",
    "De la structure à l'ornement",
    "Projet de master en architecture",
    "Pré-étude projet de master",
    "Projet SIE/ENAC",
    "Projet de master en sciences et ingénierie de l'environnement",
    "Stage d'ingénieur crédité avec le PDM (master en Sciences et ingénierie de l'environnement)",
    "Projet de master en génie électrique et électronique",
    "Projet Génie mécanique II",
    "Projet Génie mécanique I",
    "Stage d'ingénieur crédité avec le PDM (master en Génie mécanique)",
    "Projet de master en génie mécanique",
    "Research project in materials I",
    "Projet de master en science et génie des matériaux",
    "Stage d'ingénieur crédité avec le PDM (master en Science et génie des matériaux)",
    "Projet microtechnique I",
    "Projet de master en microtechnique",
    "Stage d'ingénieur crédité avec le PDM (master en Microtechnique)",
    "Projet de master en mathématiques",
    "Projet de Mathématiques (master)",
    "Stage d'ingénieur (master en Ingénierie mathématique)",
    "Projet de master en mathématiques",
    "Projet de Mathématiques (master)",
    "Stage d'ingénieur crédité avec le PDM (master en Ingénierie mathématique)",
    "Stage d'ingénieur (master en Bioingénierie)",
    "Projet de master en bioingénierie et biotechnologie",
    "Stage d'ingénieur (master en Sciences et technologie du vivant)",
    "Projet de master en sciences et technologies du vivant",
    "Stage d'ingénieur (master en Génie nucléaire)",
    "Projet de master en génie nucléaire",
    "Stage d'ingénieur (master en Ingénierie physique)",
    "Projet de master en physique",
    "Stage d'ingénieur (master en Sciences et ingénierie computationnelles)",
    "Projet de master en science et ingénierie computationelles",
    "Projet CSE I",
    "Projet CSE II",
    "Project in energy management and sustainability I",
    "Stage d'ingénieur crédité avec le PDM (master en Gestion de l'énergie et construction durable)",
    "Stage d'ingénieur (master en Génie électrique et électronique)",
]

# Study domains whose enrolments are dropped with
# `StudyDomain.isin(domains_to_remove)` (the humanities / SHS programme,
# under both its English and French labels).
domains_to_remove = [
    "Humanities and social sciences",
    "Programme Sciences humaines et sociales",
]

In [None]:
# Open question: should the query also filter on PlanType = "PLAN_EXAMINE"?
#
# First version of the enrolment query: every enrolled master student for the
# academic years 2010-2011 to 2015-2016. It is currently unused: the read_sql
# call below is commented out and `all_info` is reassigned in a later cell.
all_info = """
            select distinct 
                PersonID, 
                PedagogicalCode, 
                StudyDomain, 
                UnitName, 
                SubjectName, 
                Course_Enrolments.SubjectID,
                SectionName, 
                CourseCode,
                YearName
            from 
                Course_Enrolments
                inner join 
                Course_Codes 
                    on Course_Codes.planid = course_enrolments.planid 
                    and Course_Codes.subjectid = course_enrolments.subjectid
            where 
                LevelName = "Master"
                and IsStudent = 1
                and IsEnrolled = 1
                and (YearName = "2010-2011"
                or YearName = "2011-2012"
                or YearName = "2012-2013"
                or YearName = "2013-2014"
                or YearName = "2014-2015"
                or YearName = "2015-2016")
                
            """
#all_df = pd.read_sql(all_info, con=db_connection)
#all_df = all_df[~all_df.SubjectName.isin(courses_to_remove)]
# Removing the SHS courses
#all_df = all_df[~(all_df.StudyDomain.isin(domains_to_remove))]
#print(all_df.UnitName.unique())
#all_df

In [None]:
# Open question: should the query also filter on PlanType = "PLAN_EXAMINE"?
#unit_name = '(UnitName like "%nform%" or UnitName like "%omm%")'
# Unit names that can be plugged into the `UnitName = "..."` filter of the
# enrolment query; one entry is selected by position when the query is built.
units = [
    "Génie chimique et biotechnologie",
    "Chimie moléculaire et biologique",
    "Informatique",
    "Architecture",
    "Génie civil",
    "Sciences et ingénierie de l'environnement",
    "Génie électrique et électronique",
    "Génie mécanique",
    "Science et génie des matériaux",
    "Microtechnique",
    "Systèmes de communication - master",
    "Mathématiques - master",
    "Ingénierie mathématique",
    "Bioingénierie",
    "Sciences et technologies du vivant - master",
    "Micro and Nanotechnologies for Integrated Systems",
    "Génie nucléaire",
    "Ingénierie financière",
    "Ingénierie physique",
    "Physique - master",
    "Science et ingénierie computationnelles",
    "Gestion de l'énergie et construction durable",
    "Management, technologie et entrepreneuriat",
]

# Enrolment query restricted to a single unit at master level. The `{}`
# placeholder receives a `UnitName = "..."` condition built from
# units[22], i.e. "Management, technologie et entrepreneuriat" (the last entry
# of the list). The unit name is pasted into the SQL text, which is only
# acceptable because `units` is a hard-coded list.
all_info = """
            select distinct 
                PersonID, 
                PedagogicalCode, 
                StudyDomain, 
                UnitName, 
                UnitID, 
                UnitCode,
                SubjectName, 
                Course_Enrolments.SubjectID,
                SectionName, 
                CourseCode,
                YearName
            from 
                Course_Enrolments
                inner join 
                Course_Codes 
                    on Course_Codes.planid = course_enrolments.planid 
                    and Course_Codes.subjectid = course_enrolments.subjectid
            where 
                {}
                and LevelName = "Master"
            """.format("UnitName = \"{}\"".format(units[22]))

all_df = pd.read_sql(all_info, con=db_connection)
# Drop the entries listed in courses_to_remove
all_df = all_df[~all_df.SubjectName.isin(courses_to_remove)]
# Removing the SHS courses
all_df = all_df[~(all_df.StudyDomain.isin(domains_to_remove))]
# Mapping of subject ids to subject names
subject_mapping = all_df[['SubjectID', 'SubjectName']].drop_duplicates()
all_df

In [None]:
# Query listing the 2015-2016 master-level subjects whose unit name contains
# "ommunication" and whose pedagogical code starts with "MA". Only the query
# text is defined here: every statement that would run it is commented out.
current_courses = """
            select distinct 
                PedagogicalCode, 
                SubjectName, 
                SubjectID,
                StudyDomain,
                YearName
            from 
                Course_Enrolments 
            where 
                UnitName like "%ommunication%" 
                and 
                LevelName = "Master"
                and left(PedagogicalCode, 2) = "MA"
                and YearName = "2015-2016"
            """
#current_courses_df = pd.read_sql(current_courses, con=db_connection)
# These are the current courses (latest data) given in syscom @EPFL
#current_courses_df = current_courses_df[~current_courses_df.SubjectName.isin(courses_to_remove)]
#current_courses_df = current_courses_df[~current_courses_df.StudyDomain.isin(domains_to_remove)]
#current_courses_df

## Creating the binary matrix

In [None]:
# One row per distinct (student, course) pair, indexed by that pair.
courses_matrix = (
    all_df[['PersonID', 'SubjectName']]
    .drop_duplicates()
    .set_index(['PersonID', 'SubjectName'])
)

def series_to_integers(series):
    """Return `series` converted to numbers, downcast to the smallest integer type."""
    downcasted = pd.to_numeric(series, downcast='integer')
    return downcasted

# If the course was taken, set it to 1
courses_matrix['joined'] = 1
# One row per student, one column per course; (student, course) pairs that
# are absent from the data come out of the pivot as NaN and are set to 0.
courses_matrix = courses_matrix.reset_index().pivot(index='PersonID', columns='SubjectName', values='joined')
courses_matrix = courses_matrix.fillna(0)
courses_matrix = courses_matrix.apply(series_to_integers)

# Keep only the students who took strictly more than MIN_COURSES_BY_STUDENT
# courses (i.e. at least 11 with the current value of 10).
MIN_COURSES_BY_STUDENT = 10
courses_matrix =courses_matrix[np.sum(courses_matrix == 1, axis=1) > MIN_COURSES_BY_STUDENT]
courses_matrix

# Co-enrolment matrix
## Most taken courses
We need to find a way to get a cleaner dataset of courses: a lot of them are not useful or are outdated, and should not be recommended.

In [None]:
# Taking a look at the most taken courses
registrations_df = all_df.set_index(['SubjectName', 'YearName'])
# Number of enrolment rows per (course, academic year)
all_df_registrations = registrations_df.groupby(['SubjectName', 'YearName']).size()

# Index-aligned assignment: each row receives the count of its (course, year)
registrations_df['Registration'] = all_df_registrations
registrations_df = registrations_df.reset_index()
# Keep only the courses that have a study domain; this discards non-course
# entries such as projects, groups, minors etc.
registrations_df = registrations_df[~registrations_df.StudyDomain.isnull()]
# Remove the SHS courses
registrations_df = registrations_df[~(registrations_df.StudyDomain == "Programme Sciences humaines et sociales")]
# Drop the per-student columns so that duplicates can be collapsed below
registrations_df = registrations_df.drop([
    'PersonID', "StudyDomain", "SectionName", "PedagogicalCode",
    "CourseCode"], axis=1)
registrations_df = registrations_df.drop_duplicates()
registrations_df = registrations_df.set_index(['SubjectName', 'YearName']).sort_index()
# Most registered (course, year) pairs first
registrations = registrations_df.sort_values(ascending=False, by='Registration')

# Latest data registrations
registrations.xs('2015-2016', level='YearName')

## Enrolments

In [None]:
# Square course-by-course counter, initialised to zero
co_enrolments = pd.DataFrame(data=0, columns=courses_matrix.columns, index=courses_matrix.columns)
for row in courses_matrix.iterrows():
    # `row` is an (index, Series) pair: keep the names of the courses marked 1
    taken_courses = row[1][row[1] == 1].index.tolist()
    for i,course in enumerate(taken_courses):
        # Pair each course only with the ones listed after it, so that every
        # unordered pair of a student is counted once
        co_enrolments.loc[course, taken_courses[i+1:]] += 1
    

# Copy the upper triangle matrix to lower triangle one
co_enrolments = co_enrolments + co_enrolments.T

# Transforming to probabilities: each column is divided by its own sum.
# Nothing is removed here; a column whose sum is 0 (a course never taken
# together with another one) turns into NaN.
co_enrolments = co_enrolments / co_enrolments.sum(axis=0)

def get_coenrolment(course, other_enrolments):
    """Mean co-enrolment score between `course` and `other_enrolments`.

    Reads row `course` of the global `co_enrolments` frame, restricted to the
    columns listed in `other_enrolments`, and averages those entries.
    """
    scores = co_enrolments.loc[course, other_enrolments]
    return scores.mean()

def training_weight_coenrolments(user_index):
    """Co-enrolment weight of every course for the student at `user_index`.

    `user_index` is a positional row index into the global `courses_matrix`.
    Returns a list with one value per column of `courses_matrix`, in column
    order: the mean co-enrolment of that course with the student's courses.
    """
    student_row = courses_matrix.iloc[user_index]
    courses_taken = student_row[student_row == 1].index.tolist()
    weights = []
    for candidate in courses_matrix.columns.tolist():
        weights.append(get_coenrolment(candidate, courses_taken))
    return weights

## Grade correlations between courses

In [None]:
def course_id_mapper(sub_id):
    """Translate a subject id into its name using the global `subject_mapping`.

    Returns the first matching SubjectName, or NaN when the id is unknown.
    """
    names = subject_mapping.loc[subject_mapping.SubjectID == sub_id, 'SubjectName']
    if names.empty:
        return np.nan
    return names.values[0]

# -5 is the marker used for a non-existent correlation
def correlation_series_mean(f_corr, s_corr):
    """Combine the two correlations of a course pair into a single value.

    Returns the mean of both values, or the larger of the two when one of
    them is the -5 "missing" marker. Raises when both are missing.
    """
    missing = (f_corr == -5, s_corr == -5)
    if all(missing):
        raise Exception("both correlations non-existent")
    if any(missing):
        return max(f_corr, s_corr)
    return np.mean([f_corr, s_corr])
    
# Retrieve courses correlations
grade_corr = pd.read_csv('data/correlation-subject-pair.csv')
grade_corr = grade_corr[['sub1', 'sub2', "cor1", "cor2"]]
# Collapse the two correlation columns of each pair into a single value
grade_corr['cor_mean'] = grade_corr[['cor1', 'cor2']].apply(lambda x: correlation_series_mean(x[0],x[1]), axis=1)
grade_corr = grade_corr[['sub1', 'sub2', 'cor_mean']]

# Use SubjectName instead of SubjectID; pairs with an unknown id get NaN from
# course_id_mapper and are dropped by dropna()
grade_corr['sub1_name'] = grade_corr.sub1.map(course_id_mapper)
grade_corr['sub2_name'] = grade_corr.sub2.map(course_id_mapper)
grade_corr = grade_corr.dropna()[['sub1_name', 'sub2_name', 'cor_mean']]

# In case there are no correlations, we set to the mean of all of them
# NOTE(review): grade_corr still holds the two string name columns here, so
# this presumably relies on .mean() skipping non-numeric columns — confirm
# with the pandas version in use.
mean_correlations = grade_corr.mean()

# Let's make it a matrix: rows are sub2_name, columns are (cor_mean, sub1_name)
grade_corr_matrix = grade_corr.set_index(["sub1_name", "sub2_name"]).unstack(level=0).fillna(mean_correlations)
# Rescale with (x + 1) / 2, which maps the [-1, 1] range onto [0, 1]
grade_corr_matrix = (grade_corr_matrix + 1)/2

# Set not found courses correlations to the mean of all correlations
# NOTE(review): these columns are filled with the raw mean, whereas the
# matrix above has already been rescaled by (x + 1) / 2 — confirm that this
# is intended.
no_corr_courses = [ c for c in courses_matrix.columns.tolist() if c not in grade_corr_matrix.index.tolist() ]
missing_correlations = pd.DataFrame(np.full(fill_value=mean_correlations, 
                                            shape=(grade_corr_matrix.shape[0], len(no_corr_courses))), 
                                    columns=no_corr_courses, 
                                    index=grade_corr_matrix.index.tolist())
# Drop the outer "cor_mean" column level so that columns are course names
grade_corr_matrix.columns = grade_corr_matrix.columns.droplevel()
grade_corr_matrix = pd.concat([grade_corr_matrix, missing_correlations], axis=1)

# Let's transform it into probabilistic: each column is divided by its sum
grade_corr_matrix = grade_corr_matrix / grade_corr_matrix.sum(axis=0)

def get_grades_corr(course, other_enrolments):
    """Mean grade-correlation weight between `course` and `other_enrolments`.

    Uses the global `grade_corr_matrix`. A course that is not one of its rows
    gets the uniform weight 1 / (number of columns).
    """
    known_courses = grade_corr_matrix.index.tolist()
    if course in known_courses:
        return grade_corr_matrix.loc[course, other_enrolments].mean()
    return 1/grade_corr_matrix.shape[1]

def training_weight_grade_corr(user_index):
    """Grade-correlation weight of every course for the student at `user_index`.

    `user_index` is a positional row index into the global `courses_matrix`.
    Returns a list with one value per column of `courses_matrix`, in column
    order.
    """
    student_row = courses_matrix.iloc[user_index]
    courses_taken = student_row[student_row == 1].index.tolist()
    weights = []
    for candidate in courses_matrix.columns.tolist():
        weights.append(get_grades_corr(candidate, courses_taken))
    return weights

# Display the final, column-normalised dataframe of course correlations
grade_corr_matrix

### Splitting test/train

In [None]:
def split_data(data, test_frac=0.2, random_state=None):
    """Hold out a fraction of each student's taken courses as a testing set.

    :param data: binary DataFrame, one row per student and one column per
        course, where 1 marks a taken course. Row labels are assumed to be
        unique; the index no longer has to be named 'PersonID'.
    :param test_frac: fraction of each student's taken courses moved to the
        testing set (0.2 is the value that used to be hard-coded).
    :param random_state: optional integer seed or ``np.random.RandomState``
        making the split reproducible. ``None`` keeps pandas' default random
        source, as before.
    :return: ``(training_set, testing_set)``, two frames shaped like ``data``
        with ``training_set + testing_set`` equal to ``data``.
    """
    # Built directly rather than through the deprecated DataFrame.applymap.
    testing_set = pd.DataFrame(0, index=data.index, columns=data.columns)

    # A single generator is shared by all students, so that one seed does not
    # produce the same draw for every student.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    for student, row in data.iterrows():
        taken = row[row == 1]
        if taken.empty:
            # Nothing to hold out for a student without any course.
            continue
        held_out = taken.sample(frac=test_frac, replace=False, random_state=rng).index
        testing_set.loc[student, held_out] = 1

    training_set = data - testing_set

    return training_set, testing_set

training_set, testing_set = split_data(courses_matrix)

# Numpify the data. DataFrame.as_matrix() was removed in pandas 1.0; `.values`
# returns the same 2-D array.
train_np = training_set.astype(int).values
test_np = testing_set.astype(int).values

# Column vector (one row per student) holding the positional index of each
# user; it is the integer input of the user embedding.
users = np.arange(courses_matrix.shape[0], dtype=np.int32).reshape(-1, 1)

### Metrics

In [None]:
# Precision: share of the N recommended items that the user really took,
# i.e. the number of matched predictions divided by N of Top-N.
def precision(y_true, y_pred):
    """
    Mean precision over all users.

    y_pred is a 2-D array of recommended item indices (one row per user),
    y_true the binary ground-truth array with the same row order.
    """
    top_n = y_pred.shape[1]
    per_user = [
        sum(y_true[row, recommended] == 1) / top_n
        for row, recommended in enumerate(y_pred)
    ]
    return np.mean(per_user)
    
# Recall is the percentage of good ones that are recommended.
def recall(y_true, y_pred):
    """
    Takes predictions as an np array of indices (one row per user),
    true ratings as a binary numpy array,
    returns the recall averaged over all users.

    A user without any relevant item contributes 0 instead of a 0/0 division,
    which used to turn the whole average into NaN.
    """
    recalls = []
    for i, user in enumerate(y_pred):
        nb_relevant = y_true[i].sum()
        if nb_relevant == 0:
            # Recall is undefined without relevant items: count it as 0.
            recalls.append(0.0)
            continue
        nb_right_pred = np.count_nonzero(y_true[i, user] == 1)
        recalls.append(nb_right_pred / nb_relevant)
    return np.mean(recalls)
    
def f1_score(y_true, y_pred):
    """
    Takes predictions as an np array of indices,
    true ratings as a numpy array,
    returns f1 score (harmonic mean of precision and recall).

    Returns 0.0 when precision and recall are both 0, instead of dividing
    by zero.
    """
    prec = precision(y_true, y_pred)
    rec = recall(y_true, y_pred)
    if prec + rec == 0:
        return 0.0
    return 2 * (prec * rec) / (prec + rec)

def mean_average_precision(y_true, y_pred):
    """
    Takes predictions as an np array of indices,
    true ratings as a numpy array,
    returns the mean average precision

    Each row of y_pred is expected in ascending score order (the slice of an
    np.argsort result), so its LAST entry is the top recommendation.

    For each user, AP@N = sum_k(precision@k * rel_k) / min(N, #relevant),
    where rel_k is 1 when the k-th best recommendation is relevant. A user
    without any relevant item contributes 0.
    """
    N = y_pred.shape[1]
    ranks = np.arange(1, N + 1)
    average_prec = []
    for i, user in enumerate(y_pred):
        nb_relevant = y_true[i].sum()
        if nb_relevant == 0:
            average_prec.append(0.0)
            continue
        # Relevance of each recommendation, best recommendation first
        rel = (y_true[i, user[::-1]] == 1).astype(float)
        # precision@k for k = 1..N: hits among the k best, divided by k
        prec_at_k = np.cumsum(rel) / ranks
        average_prec.append(np.sum(prec_at_k * rel) / min(N, nb_relevant))

    return np.mean(average_prec)

#### Collaborative filtering with Collaborative Denoising Auto-Encoders

In [None]:
from keras.layers import Input, Dense, Embedding, Flatten, Dropout, Activation
from keras.layers.merge import Add
from keras.models import Model
from keras.regularizers import l2

def create(I, U, K, hidden_activation, output_activation, q=0.5, l=0.01):
    '''
    Build a Collaborative Denoising Auto-Encoder (CDAE).

    Reference:
      Yao Wu, Christopher DuBois, Alice X. Zheng, Martin Ester.
        Collaborative Denoising Auto-Encoders for Top-N Recommender Systems.
          The 9th ACM International Conference on Web Search and Data Mining (WSDM'16), p153--162, 2016.

    The item vector goes through dropout and a dense layer of size K, the
    user index through an embedding of size K; both are summed, optionally
    activated, and decoded by a dense layer of size I.

    :param I: number of items
    :param U: number of users
    :param K: number of units in hidden layer
    :param hidden_activation: activation function of hidden layer
        (skipped when falsy)
    :param output_activation: activation function of output layer
    :param q: drop probability
    :param l: regularization parameter of L2 regularization
    :return: CDAE
    :rtype: keras.models.Model
    '''
    # Item branch: corrupt the input with dropout, then encode it
    x_item = Input((I,), name='x_item')
    corrupted_items = Dropout(q)(x_item)
    item_code = Dense(K, kernel_regularizer=l2(l), bias_regularizer=l2(l))(corrupted_items)

    # User branch: the Embedding layer needs an integer input
    x_user = Input((1,), dtype='int32', name='x_user')
    user_embedding = Embedding(input_dim=U, output_dim=K, input_length=1, embeddings_regularizer=l2(l))(x_user)
    user_code = Flatten()(user_embedding)

    # Hidden representation: sum of both branches
    hidden = Add()([item_code, user_code])
    if hidden_activation:
        hidden = Activation(hidden_activation)(hidden)
    reconstruction = Dense(I, activation=output_activation)(hidden)

    return Model(inputs=[x_item, x_user], outputs=reconstruction)

def success_rate(true, pred):
    """
    Percentage of users for whom at least one recommended item (a row of
    `pred`, given as item indices) is marked 1 in the matching row of `true`.
    """
    hits = 0
    for i, recommended in enumerate(pred):
        relevant = np.flatnonzero(true[i] == 1)  # true set
        if np.intersect1d(recommended, relevant).size > 0:
            hits += 1
    return hits * 100 / pred.shape[0]

In [None]:
# Hyper-parameters under test: dropout probability and hidden layer size
q_test = 0.998
k_test = 27

# model
# Q was 0.50, now 0.998 ?
# U is len(users)+1, i.e. one more embedding row than there are training users
model = create(I=train_np.shape[1], U=len(users)+1, K=k_test,
                    hidden_activation='relu', output_activation='sigmoid', q=q_test, l=0.01)
model.compile(loss='mean_absolute_error', optimizer='adam') 
#model.compile(loss='binary_crossentropy', optimizer='adam') 

#    model.summary()

# train: the auto-encoder learns to reconstruct the training interactions
history = model.fit(x=[train_np, users], y=train_np,
                    batch_size=128, epochs=2000, verbose=2,
                    #validation_data=[[test_np, users],
                    #test_np])
                    validation_split=0.20)

# Predict from the TRAINING interactions. Feeding test_np here would hand the
# model the very items it is evaluated on (the predictions are scored against
# test_np), which inflates every metric.
pred = model.predict(x=[train_np, users])
pred = pred * (train_np == 0) # remove watched items from predictions

In [None]:
def print_stats_prediction(pred):
    """
    Print and plot the top-n metrics of a matrix of prediction scores.

    `pred` holds one row of scores per user and one column per course. For
    n = 1..20 the n best-scored courses of each user are compared with the
    global `test_np`; success rate, MAP, precision, recall and F1 are
    printed. A bar plot of the MAP at each n and a precision-recall curve are
    then shown. Also reads the globals `k_test` and `q_test` for the header.
    """
    # Ascending argsort: the best-scored courses are in the last columns
    sorted_predictions = np.argsort(pred)

    mean_aps = []
    precisions, recalls, f1s = [], [], []

    # Take the N best recommendations
    N = 20
    print("For k =", k_test, ":")
    print("For q =", q_test, ":")
    print("\tdifference of success rates at 10 - 1:", success_rate(test_np, sorted_predictions[:,-10:]) - success_rate(test_np, sorted_predictions[:,-1:]))
    for n in range(1, N + 1):
        prediction_at_n = sorted_predictions[:, -n:]
        mean_ap = mean_average_precision(test_np, prediction_at_n)
        mean_aps.append(mean_ap)
        prec = precision(test_np, prediction_at_n)
        precisions.append(prec)
        rec = recall(test_np, prediction_at_n)
        recalls.append(rec)
        f1 = f1_score(test_np, prediction_at_n)
        f1s.append(f1)
        sr = success_rate(test_np, prediction_at_n)
        print("\n\tSuccess Rate at {:d}: {:f}".format(n, sr))
        print("\tMAP at {}: {}".format(n, mean_ap))
        print("\tPrecision at {}: {}".format(n, prec))
        print("\tRecall at {}: {}".format(n, rec))
        print("\tF1 score at {}: {}".format(n, f1))

    # Plotting the MAP at k
    map_df = pd.DataFrame(mean_aps)
    map_df.index.name = 'K'
    map_df.rename(columns={0:'Mean average precision'}, inplace=True)
    map_df = map_df.reset_index()
    # The list index starts at 0 whereas k starts at 1
    map_df['K'] = map_df['K'] + 1

    col_pal = sns.cubehelix_palette(N, reverse=True)
    sns.barplot(x="K", y="Mean average precision", data=map_df, palette=col_pal)
    plt.title("Mean average precision at k for top-k courses recommendations")
    plt.show()

    # Precision-recall curve (one point per n)
    plt.plot(recalls, precisions, color='b', alpha=0.2)
    plt.fill_between(recalls, precisions, step='post', alpha=0.2, color='b')

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall curve')
    plt.show()

## Prediction

In [None]:
# Each variant multiplies, element by element, the per-course network scores
# of a user (one row of `pred`) by one or both per-course weight lists, then
# prints the statistics. Only the first two variants are active.
# NOTE(review): both weight functions read the student's courses from
# `courses_matrix`, which presumably still contains the held-out test
# courses — confirm that this does not leak test data into the scores.

# Baseline + co-enrolment
preds = np.array([ np.array(training_weight_coenrolments(i)) * np.array(nn_weights) for i, nn_weights in enumerate(pred) ])
print_stats_prediction(preds)

# Baseline + grade correlation + co-enrolment
preds = np.array([ np.array(training_weight_coenrolments(i)) * np.array(training_weight_grade_corr(i)) * np.array(nn_weights) for i, nn_weights in enumerate(pred) ])
print_stats_prediction(preds)

# Only baseline
#print_stats_prediction(pred)

# Only co-enrolment
#preds = np.array([ np.array(training_weight_coenrolments(i)) for i, nn_weights in enumerate(pred) ])
#print_stats_prediction(preds)

# Only grade correlations
#preds = np.array([ np.array(training_weight_grade_corr(i)) for i, nn_weights in enumerate(pred) ])
#print_stats_prediction(preds)

# Baseline + grade correlations
#preds = np.array([ np.array(training_weight_grade_corr(i)) * np.array(nn_weights) for i, nn_weights in enumerate(pred) ])
#print_stats_prediction(preds)

# Grade correlations + co-enrolment
#preds = np.array([ np.array(training_weight_coenrolments(i)) * np.array(training_weight_grade_corr(i)) for i, nn_weights in enumerate(pred) ])
#print_stats_prediction(preds)

### Splitting into group1, group2 courses

In [None]:
# Courses forming "group 1"; every other column of courses_matrix is treated
# as "group 2".
group1_courses = ["Advanced algorithms", "Advanced computer architecture",
                 "Cryptography and security", "Advanced databases",
                 "Distributed algorithms", "Distributed information systems",
                 "Foundations of software", "Information theory and coding",
                 "Pattern classification and machine learning"]
# Thresholds are exclusive: a student is kept with strictly more courses
MIN_COURSES_BY_STUDENT_group1 = 2
MIN_COURSES_BY_STUDENT_group2 = 3

group1_courses_matrix = courses_matrix[group1_courses]
group1_courses_matrix = group1_courses_matrix[np.sum(group1_courses_matrix == 1, axis=1) > MIN_COURSES_BY_STUDENT_group1]
group2_courses_matrix = courses_matrix.drop(group1_courses, axis=1)
group2_courses_matrix = group2_courses_matrix[np.sum(group2_courses_matrix == 1, axis=1) > MIN_COURSES_BY_STUDENT_group2]

training_set_group1, testing_set_group1 = split_data(group1_courses_matrix)
training_set_group2, testing_set_group2 = split_data(group2_courses_matrix)

# Numpify the data. DataFrame.as_matrix() was removed in pandas 1.0; `.values`
# returns the same 2-D array.
train_np_1 = training_set_group1.astype(int).values
test_np_1 = testing_set_group1.astype(int).values
train_np_2 = training_set_group2.astype(int).values
test_np_2 = testing_set_group2.astype(int).values

# the indices of each user, as column vectors
users_group1 = np.array(np.arange(group1_courses_matrix.shape[0])[np.newaxis].T, dtype=np.int32)
users_group2 = np.array(np.arange(group2_courses_matrix.shape[0])[np.newaxis].T, dtype=np.int32)

### Prediction for a student

In [None]:
# Courses taken by the demo student
me = [ 
"Distributed information systems",
"Information theory and coding",
"Pattern classification and machine learning",
"Mobile networks",
"Statistical signal and data processing through applications",
"TCP/IP networking",
"Digital education & learning analytics"]
# Single-row binary frame with the same columns as the training matrix.
# NOTE(review): a name of `me` that is not a column of courses_matrix would
# presumably be added as a new column and change the input width — confirm
# that every name exists.
my_courses = pd.DataFrame(data=0, columns=courses_matrix.columns, index=["Romain"])
my_courses[me] = 1
taken_courses = my_courses.loc["Romain"][my_courses.loc["Romain"] == 1].index.tolist()

# DataFrame.as_matrix() was removed in pandas 1.0; `.values` returns the same
# 2-D array.
my_binary_courses = my_courses.values
# User index fed to the embedding layer.
# NOTE(review): index 1 looks like the slot of an existing training user
# rather than a dedicated one for this new student — confirm.
binary_courses_format = np.array([[1]], dtype=np.int32)

In [None]:
# Score every course for the demo student with the trained model
prediction_romain = model.predict(x=[my_binary_courses, binary_courses_format])
#preds_romain = np.array([ np.array(training_weight_coenrolments(i)) * np.array(training_weight_grade_corr(i)) * np.array(nn_weights) for i, nn_weights in enumerate(pred) ])
#print_stats_prediction(preds)
# Course positions ordered by ascending score (best course last)
prediction_romain = np.argsort(prediction_romain)

predicted_courses = [courses_matrix.columns[i] for i in prediction_romain[0]]
# Only recommend courses that had registrations in 2015-2016 and that the
# student has not taken yet
last_year_courses = list(registrations.xs('2015-2016', level='YearName').index)
predicted_courses = [c for c in predicted_courses if c in last_year_courses and c not in taken_courses]

# The list is reversed so that the best-scored courses come first; the 10
# best ones are printed
print("I picked the following courses: \n\t-{} \n\nHence we propose the following: \n\t-{}"
     .format("\n\t-".join(taken_courses), "\n\t-".join(predicted_courses[::-1][:10])))

### Done
- All data gives really bad results (discrimination by section)
- Results are different from one domain to another (BEFORE WEIGHING OUT THE OBLIGATORY COURSES, or courses that are bloat, e.g. laboratories; on a case-by-case basis?)
- [They talk a bit about top-N recommendation in this paper (Boi)](http://delivery.acm.org/10.1145/2810000/2800184/p179-maksai.pdf?ip=128.179.189.64&id=2800184&acc=ACTIVE%20SERVICE&key=FC66C24E42F07228%2E7E17DDD1CCA0F75B%2E4D4702B0C3E38B35%2E4D4702B0C3E38B35&CFID=1003867347&CFTOKEN=59100520&__acm__=1510214008_777b3b2b2f3763bb6c7aa57bcdcdf49a)
- Move to the new data
- Right precision and recall metrics
- Right test/train set
- Try on like 5 sections with same parameters if still good results
- Doing it by faculty gives same results (for IC)
- Split options and obligatory recommendations
- Quick hard-coded demo for taken courses
- Co-enrolment matrix
- Multiply the output probability of each course by the probability that the student takes it (mean co-enrolment between the predicted course and ALL the courses the student took) before the argsort. Then compare the baseline with the new system.
- Use the grade correlation of each pair of courses as a multiplicative weight, and put it in the pipeline. Then compare the baseline with this one.
- Try by faculty (all courses from ENAC for example, IC is not, and all good)
- Get results by faculties, wait all results to do model selection

### Questions

### To do
- Choose model based on f1 score probably
- Differentiate between metrics
- Success rate between taking one course and 2 courses
- Co-enrolment matrix is similar to collab filtering -> map the three to one class of existing algos
- Porting code to usable codebase for next coder ?
- Replace names by ids from DB by production code day
- Automatic best parameters detection ? Need a cluster ? Grid Search ?
- start working on demo, boxes for each course and recommend stuff