In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from tensorflow import keras
from keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

import matplotlib.pyplot as plt
import seaborn as sns

# Suppress any warnings: warnings.warn is swapped for a no-op, so code that
# calls warnings.warn afterwards emits nothing
import warnings


def warn(*args, **kwargs):
    """Accept any arguments and do nothing (stand-in for ``warnings.warn``)."""
    return None


warnings.warn = warn
import string


# Load the ratings table and the user / course embedding tables from the
# working directory
rating_df, user_emb, item_emb = (
    pd.read_csv(csv_name)
    for csv_name in ('ratings.csv', 'user_embeddings.csv', 'course_embeddings.csv')
)

In [None]:
# Replace the user and item ids in the ratings table with their embedding
# vectors: left-join the user embeddings on 'user', then the course embeddings
# on 'item'. Missing values are filled with 0 after each join.
merged_df = (
    rating_df
    .merge(user_emb, how='left', on='user').fillna(0)
    .merge(item_emb, how='left', on='item').fillna(0)
)

# Column labels of the 16 user-embedding and 16 course-embedding dimensions
u_features = [f"UFeature{i}" for i in range(16)]
c_features = [f"CFeature{i}" for i in range(16)]

# Slice the merged table into its user part, its course part and the target
user_embeddings = merged_df.loc[:, u_features]
course_embeddings = merged_df.loc[:, c_features]
ratings = merged_df['rating']

In [None]:
# Combine the user and course embeddings by element-wise addition. The course
# side is added as a raw array, so the sum is positional rather than aligned on
# column labels.
feature_names = [f"Feature{i}" for i in range(16)]
interaction_dataset = user_embeddings.add(course_embeddings.values)
interaction_dataset.columns = feature_names
# Append the target as the last column
interaction_dataset = interaction_dataset.assign(rating=ratings)

In [None]:
# Features are every column except the last; the last column holds the raw rating
X = interaction_dataset.iloc[:, :-1]
y_raw = interaction_dataset.iloc[:, -1]
# Encode each distinct rating value as an integer class label
rating_labels = y_raw.to_numpy().ravel()
label_encoder = LabelEncoder()
label_encoder.fit(rating_labels)
y = label_encoder.transform(rating_labels)

In [None]:
def build_model(input_shape, layers, units):
    """Build and compile a fully connected binary classifier.

    Parameters
    ----------
    input_shape : int
        Number of input features per sample.
    layers : int
        Number of hidden ``Dense`` layers (ReLU activation).
    units : int
        Number of units in each hidden layer.

    Returns
    -------
    keras.Sequential
        Model compiled with the Adam optimizer and binary cross-entropy loss.
        No additional metrics are registered at compile time.
    """
    model = keras.Sequential()
    # Declare the input shape once, up front, instead of repeating it on every
    # hidden layer; this also keeps the model well-defined when layers == 0
    model.add(keras.Input(shape=(input_shape,)))
    for _ in range(layers):
        model.add(keras.layers.Dense(units, activation='relu'))
    # A single sigmoid unit produces one value in (0, 1) per sample
    model.add(keras.layers.Dense(1, activation='sigmoid'))

    model.compile(optimizer='adam', loss='binary_crossentropy')

    return model

In [None]:
# Split the dataset into training (80%) and test (20%) sets; the fixed seed
# keeps the split identical across kernel restarts
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create a KerasClassifier object that wraps the model builder (built once; the
# 'layers' and 'units' arguments are supplied later by the parameter grid)
model_wrapper = KerasClassifier(build_fn=build_model, input_shape=X.shape[1])

In [None]:
# Hyper-parameter grid: number of hidden layers x units per hidden layer
param_grid = dict(
    layers=[1, 2, 3],
    units=[32, 64, 128],
)

In [None]:
# Three identically configured grid searches (3-fold CV, micro-averaged F1),
# one per model; all of them search the same parameter grid
search_settings = dict(
    estimator=model_wrapper,
    param_grid=param_grid,
    cv=3,
    scoring='f1_micro',
)

grid_search_1 = GridSearchCV(**search_settings)  # Model 1
grid_search_2 = GridSearchCV(**search_settings)  # Model 2
grid_search_3 = GridSearchCV(**search_settings)  # Model 3


In [None]:
# Model 1

# Fit the model to the training data
grid_search_1.fit(X_train, y_train, epochs = 20)

# best parameters and score
gs_best_param = grid_search_1.best_params_
gs_best_score = grid_search_1.best_score_

# Get the best model from the grid search
best_model = grid_search_1.best_estimator_

# Evaluate the best model on the test set using F1 score
y_pred = best_model.predict(X_test)
f1_model_1 = f1_score(y_test, y_pred)

In [None]:
# Model 2

# Fit the model to the training data
grid_search_2.fit(X_train, y_train, epochs = 50)

# best parameters and score
gs_best_param = grid_search_2.best_params_
gs_best_score = grid_search_2.best_score_

# Get the best model from the grid search
best_model = grid_search_2.best_estimator_

# Evaluate the best model on the test set using F1 score
y_pred = best_model.predict(X_test)
f1_model_2 = f1_score(y_test, y_pred)

In [None]:
# Model 3

# Fit the model to the training data
grid_search_3.fit(X_train, y_train, epochs = 100)

# best parameters and score
gs_best_param = grid_search_3.best_params_
gs_best_score = grid_search_3.best_score_

# Get the best model from the grid search
best_model = grid_search_3.best_estimator_

# Evaluate the best model on the test set using F1 score
y_pred = best_model.predict(X_test)
f1_model_3 = f1_score(y_test, y_pred)

In [None]:
# Collect the three test F1 scores and report the best, the worst and the
# score stored at the best model's position
f1_values = [f1_model_1, f1_model_2, f1_model_3]

maximum, minimum = max(f1_values), min(f1_values)
position = f1_values.index(maximum)
print(maximum, minimum, f1_values[position], sep='\n')

In [None]:
# Tabulate each model's F1 score next to its name
f1_names = ["f1_model_1", "f1_model_2", "f1_model_3"]
df_f1 = pd.DataFrame(
    list(zip(f1_values, f1_names)),
    columns=['f1_score (higher the better)', 'f1_model'],
)
df_f1

In [None]:
# Bar chart comparing the test F1 score of the three models
ax = sns.barplot(x='f1_model', y='f1_score (higher the better)', data=df_f1)
# Zoom the y-axis onto the observed scores, padded so the lowest bar keeps a
# visible height and identical scores do not collapse the axis to one value.
# F1 lies in [0, 1], so the limits are clamped to that interval.
padding = 0.1 * (maximum - minimum) or 0.01
ax.set_ylim(max(0.0, minimum - padding), min(1.0, maximum + padding))
ax.set_title('f1_values')
# Render the figure
plt.show()