In [32]:
import stellargraph as sg

In [4]:
print(stellargraph.__version__)

1.2.1


In [204]:
import numpy as np
import pandas as pd
import scipy
import matplotlib as plt
import networkx as nx
import itertools
import collections
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

import stellargraph as sg
from stellargraph import StellarGraph
from stellargraph.layer import GraphSAGE
from stellargraph.layer import HinSAGE
from stellargraph.mapper import HinSAGELinkGenerator
from stellargraph.mapper import GraphSAGENodeGenerator, FullBatchNodeGenerator, DirectedGraphSAGENodeGenerator
from stellargraph.layer import GCN
from stellargraph.data import EdgeSplitter
from stellargraph.mapper import GraphSAGELinkGenerator
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers, Model
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import mean_squared_error
import numpy as np

from tensorflow.keras.layers import Dense, Dot, Reshape, Input, Embedding, Lambda, Concatenate, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

# Final Project: Compare Graph Convolutional Network (GCN) and Sci-kit Learn's Surpise Library for Movie Recommendation

This project will compare the hand crafted recsys classes developed in Module 3 against the GCN model built from the StellarGraph library and the Sci-kit learn Surprise library. The dataset used is the MovieLens 100k dataset.

The motivation for this project is that despite best efforts, RMSE my Module 3 modules was still too high and I wanted to see if more effective models could be built. Part 1 will walk through the GCN model and Part 2 will walk through the Surprise model.

# Part 1: GCN Model

### Data Preparation

In [None]:
def convert_categorical(df_X, _X):
    values = np.array(df_X[_X])
    
    label_encoder = LabelEncoder()
    integer_encoded = label_encoder.fit_transform(values) # integer encode
    
    onehot_encoder = OneHotEncoder(sparse=False) # binary encode
    integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
    onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
    df_X = df_X.drop(_X, 1)
    for j in range(integer_encoded.max() + 1):
        df_X.insert(loc=j + 1, column=str(_X) + str(j + 1), value=onehot_encoded[:, j])
    return df_X

In [None]:
#This new file below was sourced from https://github.com/hadoov/GHRS/blob/main/datasets/ml-100k/ua.base - links users, movies and ratings
df_ua = pd.read_csv('data/ua.base', sep='\t', engine='python', names=['user_id', 'movie_id', 'rating', 'timestamp']).rename(columns={'user_id': 'uID', 'movie_id': 'mID'})

I want to acknowledge this example of preparing MovieLens data for GCN modeling: https://github.com/hadoov/GHRS/blob/main/datasets/ml-100k/ua.base. I used this as a starting point.

In [None]:
MV_users = pd.read_csv('data/users.csv').rename(columns={'accupation': 'job'})
MV_movies = pd.read_csv('data/movies.csv').drop(columns = ['title', 'year']).sort_values(by='mID')

#Not using these files
#test = pd.read_csv('data/test.csv')
#train = pd.read_csv('data/train.csv')

#from collections import namedtuple
#Data = namedtuple('Data', ['users','movies','train','test'])
#data = Data(MV_users, MV_movies, train, test)

In [None]:
MV_users = convert_categorical(MV_users, 'job')
MV_users = convert_categorical(MV_users, 'gender')
MV_users['bin'] = pd.cut(MV_users['age'], [0, 10, 20, 30, 40, 50, 100], labels=['1', '2', '3', '4', '5', '6'])
MV_users['age'] = MV_users['bin']

MV_users = MV_users.drop('bin', 1)
MV_users = convert_categorical(MV_users, 'age')
MV_users = MV_users.drop('zip', 1)

MV_users.set_index('uID', inplace=True)
MV_movies.set_index('mID', inplace=True)

# Adding a prefix to user and movie IDs
MV_users.index = 'u' + MV_users.index.astype(str)
MV_movies.index = 'm' + MV_movies.index.astype(str)

  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  
  # Remove the CWD from sys.path while we load stuff.
  


The intent of this data preparation is to ready the movie data for processing into a graph model with nodes and edges. User id and movie id were made into indexes and prefixed with 'u' and 'm' respectively. The user data was one-hot encoded for the better processing of the data. The movie data was also one-hot encoded for the better processing of the data.

### Graph Model Building

In [None]:
# Create node data dictionary
node_data = {
    'user': MV_users,
    'movie': MV_movies
}

# Create edges with ratings
edges = df_ua[['uID', 'mID', 'rating']].copy()
edges.columns = ['source', 'target', 'rating']
edges['source'] = 'u' + edges['source'].astype(str)
edges['target'] = 'm' + edges['target'].astype(str)

all_movie_ids = set(edges['target'])
available_movie_ids = set(MV_movies.index)
unmatched_movie_ids = all_movie_ids - available_movie_ids

edges = edges[~edges['target'].isin(unmatched_movie_ids)]

valid_sources = set(MV_users.index)
valid_targets = set(MV_movies.index)
edges = edges[edges['source'].isin(valid_sources) & edges['target'].isin(valid_targets)]

# Create the graph
G = sg.StellarGraph(nodes={'user': MV_users, 'movie': MV_movies}, edges=edges, edge_type_column='rating')

# Split the graph into training and test sets
edge_splitter = EdgeSplitter(G)
G_train, edge_ids_train, edge_labels_train = edge_splitter.train_test_split(
    p=0.9,  # 90% of edges for training
    method="global",
    keep_connected=True
)

# Now split the remaining edges for testing
G_test, edge_ids_test, edge_labels_test = edge_splitter.train_test_split(
    p=0.1 / 0.9,  # 10% of the remaining 10% of edges for testing
    method="global",
    keep_connected=True
)

# Create generators
batch_size = 50
num_samples = [10, 5]

# Create the generator for the training graph
train_generator = HinSAGELinkGenerator(G_train, batch_size, num_samples, head_node_types=["user", "movie"])

# Create the generator for the test graph
test_generator = HinSAGELinkGenerator(G_test, batch_size, num_samples, head_node_types=["user", "movie"])

** Sampled 80626 positive and 80626 negative edges. **
** Sampled 9953 positive and 9953 negative edges. **


### HINSAGE GCN Modeling

In [244]:
# Define the HinSAGE model
hinsage = HinSAGE(
    layer_sizes=[32, 32],  # Adjust layer sizes as needed
    generator=generator,
    bias=True,
    dropout=0.3,
)

# Create input and output tensors
x_inp, x_out = hinsage.in_out_tensors()

# Link prediction requires combining embeddings from both nodes
prediction = Dot(axes=1, normalize=False)(x_out)
prediction = Reshape((-1,))(prediction)

# Build the Keras model
model = Model(inputs=x_inp, outputs=prediction)


In [None]:
model.compile(
    optimizer=Adam(learning_rate=1e-3),
    loss='mean_squared_error',
)

Above, GCN was built on the graph model generated via StellarGraph, which handily splits the graph itself into training and test graphs. The model was then compiled and trained on the training data. 

In [None]:
# Filtering out only user-movie edges - needed because user-user, movie-movie edges were created as result of edge splitter
corrected_edges_train = np.array([
    [src, tgt] for src, tgt in edge_ids_train
    if src.startswith('u') and tgt.startswith('m')
])

corrected_labels_train = edge_labels_train[
    [i for i, (src, tgt) in enumerate(edge_ids_train) if src.startswith('u') and tgt.startswith('m')]
]

corrected_edges_test = np.array([
    [src, tgt] for src, tgt in edge_ids_test
    if src.startswith('u') and tgt.startswith('m')
])

corrected_labels_test = edge_labels_test[
    [i for i, (src, tgt) in enumerate(edge_ids_test) if src.startswith('u') and tgt.startswith('m')]
]

# Create the training, testing generator
train_gen = train_generator.flow(corrected_edges, corrected_labels, shuffle=True)
test_flow = test_generator.flow(corrected_edges_test, corrected_labels_test)


Before fitting train graph_gen to the model, I needed to clean up the graph edges, because the edge split function created edges between users and users, movies and movies, which are not needed for this model. I filtered out only the user-movie edges and their corresponding labels. I then created the training and testing generators using the flow method of the train_generator and test_generator objects. The flow method returns a generator that yields batches of samples for training or testing. The flow method takes the edge IDs and labels as input and shuffles the data if needed. The train_gen and test_flow objects are used to fit the model to the training data and evaluate the model on the test data, respectively.

In [237]:
history = model.fit(
    train_gen,
    epochs=20,
    verbose=2,
    validation_data=None,  # Add validation data if available
)


Epoch 1/20
43/43 - 12s - loss: 0.0365
Epoch 2/20
43/43 - 11s - loss: 0.0170
Epoch 3/20
43/43 - 11s - loss: 0.0098
Epoch 4/20
43/43 - 11s - loss: 0.0064
Epoch 5/20
43/43 - 12s - loss: 0.0038
Epoch 6/20
43/43 - 14s - loss: 0.0031
Epoch 7/20
43/43 - 13s - loss: 0.0022
Epoch 8/20
43/43 - 11s - loss: 0.0016
Epoch 9/20
43/43 - 11s - loss: 0.0013
Epoch 10/20
43/43 - 11s - loss: 9.6936e-04
Epoch 11/20
43/43 - 11s - loss: 7.7805e-04
Epoch 12/20
43/43 - 11s - loss: 6.1872e-04
Epoch 13/20
43/43 - 11s - loss: 5.7659e-04
Epoch 14/20
43/43 - 11s - loss: 4.8017e-04
Epoch 15/20
43/43 - 11s - loss: 4.0003e-04
Epoch 16/20
43/43 - 11s - loss: 3.0638e-04
Epoch 17/20
43/43 - 11s - loss: 2.8416e-04
Epoch 18/20
43/43 - 11s - loss: 2.7480e-04
Epoch 19/20
43/43 - 11s - loss: 2.3669e-04
Epoch 20/20
43/43 - 11s - loss: 1.9941e-04


In [247]:
# Evaluate the model's performance on the test set
test_metrics = model.evaluate(test_flow)
print("Test Metrics:", test_metrics)


Test Metrics: 0.07835771888494492


In [248]:
# Assuming 'corrected_labels_test' contains the actual ratings for the test edges
test_targets = corrected_labels_test
# Make predictions using the trained model on the test set
y_pred = model.predict(test_flow)

from sklearn.metrics import mean_squared_error
import numpy as np

rmse = np.sqrt(mean_squared_error(test_targets, y_pred.flatten()))  # Ensure y_pred is correctly shaped
print("Test RMSE:", rmse)

Test RMSE: 0.2799489922067964


The RMSE result of Test RMSE: 0.2799489922067964 is remarkably good. HINSAGE was chosen over GRAPHSAGE because my graph has 2 node tyoes: users, movies, and GRAPHSAGE can only handle graphs with 1 node type. This was a relatively small data set, on much larger data sets, I am not sure how HINSAGE would perform. Of course, with such a low RMSE the question of overfitting is highly relevant.

# Part 2. Sci-kit Surpise Ensemble Learning

In [258]:
from surprise import Dataset
from surprise import Reader
from surprise import SVD, SVDpp ,accuracy
from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNWithZScore
from surprise.model_selection import GridSearchCV
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split

### Data Preparation

In [268]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df_ua[['uID', 'mID','rating']], reader)
pd.DataFrame(data.__dict__['raw_ratings'], columns=['uID', 'mID','rating','timestamp'])

Unnamed: 0,uID,mID,rating,timestamp
0,1,1,5.0,
1,1,2,3.0,
2,1,3,4.0,
3,1,4,3.0,
4,1,5,3.0,
...,...,...,...,...
90565,943,1047,2.0,
90566,943,1074,4.0,
90567,943,1188,3.0,
90568,943,1228,3.0,


Surprise has as specific format for the data, so above I converted the data to that format. I reused the same data imported at top of notebook.

### Model Building

In [None]:
benchmark = []

for algo in [SVD(), SVDpp(), KNNBasic(), KNNWithMeans(), KNNWithZScore()]:
    result = cross_validate(algo, data, cv=5)
    results= pd.DataFrame.from_dict(result).mean(axis=0)
    results = results.append(pd.Series([str(algo).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(results)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

I chose SVD(), SVDpp(), KNNBasic(), KNNWithMeans(), KNNWithZScore() as the algorithms to compare, in order to provide a unique set of results as opposed to benchmarking the content-based and collaborative filtering algorithms. I also want to acknowledge this excellent Surpise example that inspired me: https://github.com/jadecebeci/Movie-Recommender-System/blob/main/Movie_Recommender_Modeling.ipynb. The results are as follows:

In [270]:
surprise_results = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

In [272]:
surprise_results

Unnamed: 0_level_0,test_rmse,test_mae,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
SVDpp,0.921214,0.722533,142.534636,2.856097
SVD,0.938674,0.739939,3.695797,0.198495
KNNWithZScore,0.95143,0.746179,0.487074,2.862678
KNNWithMeans,0.951798,0.749973,0.397178,2.842386
KNNBasic,0.983156,0.776521,0.435661,2.673625


# Conclusions

### Discussion
Below is summary of rec sys models tested and their performance:

|Method|RMSE|
|:----|:--------:|
|Baseline, $Y_p$= |1.2585510334053043 |
|Baseline, $Y_p=\mu_u$|1.0352910334228647 |
|Content based, item-item|1.012502820366462 |
|Collaborative, cosine|1.0023580719424758 |
|Collaborative, jaccard, $M_r\geq 3$|1.0400814568358727  |
|Collaborative, jaccard, $M_r\geq 1$|1.0399281016619402  |
|Collaborative, jaccard, $M_r$|1.0399281016619402  |
|SVDpp (Surprise)|0.921214 |
|SVD (Surprise) |0.938674 |
|KNNWithZScore	(Surprise)|0.951430 |
|KNNWithMeans (Surprise)|0.951798 |
|KNNBasic (Surprise)|0.983156 |
|StellarGraph HINSAGE GCN |0.2799489922067964|
