In [373]:

import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import os
import mlflow
import warnings
import pandas as pd
import pymongo

from pymongo import MongoClient
from sklearn.decomposition import NMF
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix
from sklearn.base import BaseEstimator, TransformerMixin
from config import mongo_host, mongo_port, mongoUsername, mongoPassword

warnings.filterwarnings("ignore")

## Dataset Split Configuration

This code configures the dataset split for training and testing:

- `use_full_training`: Toggle between the full dataset (`True`) and a smaller subset (`False`) for experiments.
- `full_train_percentage` (0.8): Fraction of the full dataset used for training when using the full dataset.
- `mini_train_percentage` (0.02) and `mini_test_percentage` (0.01): Specify the portions of the dataset for training and testing, respectively, when working with a smaller subset for quicker iterations.


In [None]:
# --- Dataset split settings ---------------------------------------------
# True  -> train/test on the full dataset
# False -> train/test on a small subset for quicker experiments
use_full_training = False

# Fraction of the data used for training when the full dataset is used.
full_train_percentage = 0.8

# Fractions used for training / testing when working on the small subset.
mini_train_percentage, mini_test_percentage = 0.02, 0.01

# --- NMF hyper-parameters -----------------------------------------------
n_components = 2500  # number of components requested from the factorization
max_iter = 100       # maximum number of solver iterations

## NMF Recommender System Overview

This class implements a recommender system using Non-negative Matrix Factorization (NMF) to predict user preferences for items. Key components:

- **Initialization (`__init__`)**: Configures the NMF model with `n_components` (number of features) and `max_iter` (maximum iterations for the solver).
- **Fit (`fit`)**: Learns the user feature matrix (`W`) and item feature matrix (`H`) from the input matrix `X` (user-item interactions).
- **Transform (`transform`)**: Transforms the input matrix `X` to the feature space, useful for finding similar items or users.
- **Predict (`predict`)**: Generates predictions by computing the dot product of `W` and `H`, representing the estimated user-item interactions.

The same cell also opens a MongoDB connection to access the `movies` and `users` collections, and selects the MLflow experiment named "moviewise" for tracking.


In [375]:
class NMFRecommender(BaseEstimator, TransformerMixin):
    """Recommender based on scikit-learn's Non-negative Matrix Factorization.

    The input matrix ``X`` is factorized as ``X ~ W @ H``; ``W`` is the
    matrix returned by ``NMF.fit_transform`` and ``H`` is ``NMF.components_``.

    Parameters
    ----------
    n_components : int
        Number of components passed to ``NMF``. Defaults to the notebook-level
        ``n_components`` value at the time the class is defined.
    max_iter : int
        Maximum number of solver iterations passed to ``NMF``. Defaults to the
        notebook-level ``max_iter`` value at the time the class is defined.

    Attributes
    ----------
    model : NMF
        The underlying scikit-learn estimator.
    W, H : ndarray
        Factor matrices, available after ``fit``.
    """

    # The defaults used to be written ``{n_components}`` / ``{max_iter}``,
    # which are *set* literals; ``NMFRecommender()`` therefore handed sets to
    # NMF instead of integers. Plain values are used now.
    def __init__(self, n_components=n_components, max_iter=max_iter):
        self.n_components = n_components
        self.max_iter = max_iter
        self.model = NMF(n_components=self.n_components, max_iter=self.max_iter)

    def fit(self, X, y=None):
        """Factorize ``X`` and store the factors in ``self.W`` and ``self.H``.

        ``y`` is accepted for scikit-learn API compatibility and is unused.
        Returns ``self``.
        """
        self.W = self.model.fit_transform(X)
        self.H = self.model.components_
        return self

    def transform(self, X):
        """Project ``X`` with the fitted model (delegates to ``NMF.transform``)."""
        return self.model.transform(X)

    def predict(self, X):
        """Return the reconstruction ``W @ H`` of the matrix seen by ``fit``.

        NOTE(review): ``X`` is not used; the result always corresponds to the
        training matrix. Confirm this is the intended contract before calling
        it with new data.
        """
        return np.dot(self.W, self.H)

# Connection string assembled from the settings imported from `config`.
mongo_uri = f"mongodb://{mongoUsername}:{mongoPassword}@{mongo_host}:{mongo_port}/"
client = MongoClient(mongo_uri)
db = client['Movielens']

# Handles on the two collections; `movies` / `users` are rebound to
# DataFrames by the data-loading cell, the `*_collection` names are not.
movies, users = db['movies'], db['users']
movies_collection, users_collection = db['movies'], db['users']

# Select the MLflow experiment used for tracking; as the last expression of
# the cell, the returned Experiment object is displayed.
mlflow.set_experiment("moviewise")

<Experiment: artifact_location='file:///home/lokman/simplon/00.briefs/11.MovieLensAi/mlruns/684583501083562460', creation_time=1707990507228, experiment_id='684583501083562460', last_update_time=1707990507228, lifecycle_stage='active', name='moviewise', tags={}>

## Data Preparation Workflow

This segment handles the preparation of movie and user data. It checks for the existence of `movies.csv` and `users.csv` files:

- **Data Fetching**: If either file is missing, it fetches data from MongoDB, processes it, and saves it to CSV. This includes:
  - Retrieving movies and user interactions from MongoDB collections.
  - Normalizing user data, unrolling nested `movies` entries, and merging them into a unified DataFrame.
  - Saving the processed DataFrames to CSV files for future use, avoiding repeated database queries.

- **Data Loading**: If both CSV files exist, it directly loads the DataFrames from these files, significantly speeding up the initialization process.

This approach ensures efficient data management by leveraging local storage for quick access and reducing reliance on database queries.


In [376]:
movies_csv_path = 'movies.csv'
users_csv_path = 'users.csv'

# Both CSV files must be present to skip the database round-trip.
csv_cache_ready = os.path.exists(movies_csv_path) and os.path.exists(users_csv_path)

if csv_cache_ready:
    movies = pd.read_csv(movies_csv_path)
    users = pd.read_csv(users_csv_path)
    print('Data loaded from CSV files.')
else:
    # Movies: only the id and the title are fetched.
    movies = pd.DataFrame(movies_collection.find({}, {"_id": 1, "title": 1}))

    # Users: each document carries a nested `movies` list of rating entries.
    data = list(users_collection.find({}, {"movies.movieid": 1, "_id": 1, "movies.rating": 1, "movies.timestamp": 1}))

    # One row per (user, rating entry), then expand each entry into columns.
    users = pd.json_normalize(data).explode('movies')
    users_tmp = users['movies'].apply(pd.Series)
    users = pd.concat([users, users_tmp], axis=1).drop('movies', axis=1)

    # Cache both frames locally so later runs take the CSV branch.
    movies.to_csv(movies_csv_path, index=False)
    users.to_csv(users_csv_path, index=False)
    print('Data fetched from MongoDB and saved to CSV.')

print('Size of movies:', len(movies))
print('Size of users:', len(users))
users.head()

Data loaded from CSV files.
Size of movies: 3883
Size of users: 1000209


Unnamed: 0,_id,movieid,rating,timestamp
0,6040,573,4,956704056
1,6040,589,4,956704996
2,6040,1,3,957717358
3,6040,2068,4,997453982
4,6040,592,2,956716016


## Data Merging and Preprocessing

This section combines movie and user datasets (`movies` and `users`) into a unified DataFrame `merged_df` and performs preprocessing steps:

1. **Merging Datasets**: Merges `movies` and `users` DataFrames based on the `_id` and `movieid` columns.

2. **Column Operations**:
    - **Drop Column**: Removes the redundant `movieid` column.
    - **Rename Columns**: Renames `_id_x` to `movie_id` and `_id_y` to `user_id` for clarity.

3. **Sorting by Timestamp**: Orders the DataFrame by the `timestamp` column to organize data chronologically.

4. **Reset Index**: Reindexes the DataFrame after sorting to ensure a clean index sequence.

5. **Data Cleaning**: Drops the first row due to dataset irregularities.

These steps ensure the data is properly merged, organized, and prepared for further analysis or model training.


In [377]:
# Join ratings onto movies, tidy the column names and order chronologically.
merged_df = (
    movies
    .merge(users, left_on='_id', right_on='movieid')
    .drop(columns=['movieid'])                       # duplicate of the movie key
    .rename(columns={"_id_x": "movie_id", "_id_y": "user_id"})
    .sort_values(by=['timestamp'])
    .reset_index(drop=True)
)

# Discard the first (oldest) row of the sorted frame.
merged_df = merged_df.drop(merged_df.index[0])

print('Taille de merged_df :', len(merged_df))
merged_df.tail(10)

Taille de merged_df : 1000208


Unnamed: 0,movie_id,title,user_id,rating,timestamp
1000199,3098,"Natural, The (1984)",5948,4,1046437932
1000200,3267,"Mariachi, El (1992)",5312,4,1046444711
1000201,2453,"Boy Who Could Fly, The (1986)",4958,4,1046454260
1000202,2043,Darby O'Gill and the Little People (1959),4958,1,1046454282
1000203,3489,Hook (1991),4958,4,1046454320
1000204,2399,Santa Claus: The Movie (1985),4958,1,1046454338
1000205,1407,Scream (1996),4958,5,1046454443
1000206,2634,"Mummy, The (1959)",4958,3,1046454548
1000207,3264,Buffy the Vampire Slayer (1992),4958,4,1046454548
1000208,1924,Plan 9 from Outer Space (1958),4958,4,1046454590


## Data Filtering by Ratings

This block of code analyzes the distribution of ratings per movie and per user in the `merged_df` DataFrame, and then applies filtering thresholds to remove movies and users with insufficient ratings:

1. **Number of Ratings per Movie**:
    - Computes the frequency of ratings for each movie in the dataset.
    - Provides descriptive statistics summarizing the distribution of movie ratings.

2. **Number of Ratings per User**:
    - Calculates the frequency of ratings given by each user.
    - Presents descriptive statistics summarizing the distribution of user ratings.

3. **Threshold Definition**:
    - Sets thresholds for dropping movies (`movies_threshold`) and users (`users_threshold`) based on the desired minimum number of ratings.

4. **Filtering Operations**:
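    - Keeps only movies with more than 33 ratings and users with more than 44 ratings; movies and users at or below their threshold are dropped. Both counts are computed on the unfiltered data, before any row is removed.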
    - Filters the `merged_df` DataFrame based on the established thresholds.

These filtering steps help ensure that only movies and users with a sufficient number of ratings are retained for further analysis or modeling. The final size of `merged_df` reflects the dataset after applying these filters.


In [378]:
# Rating counts per movie and per user, both taken before any filtering.
movies_counts = merged_df['movie_id'].value_counts()
cusers_counts = merged_df['user_id'].value_counts()

print(movies_counts.describe())
print('\n')
print(cusers_counts.describe())

movies_threshold = 33
users_threshold = 44

# Ids whose count is strictly above the corresponding threshold.
kept_movie_ids = movies_counts.loc[movies_counts.gt(movies_threshold)].index
kept_user_ids = cusers_counts.loc[cusers_counts.gt(users_threshold)].index

# A row survives only if both its movie and its user are kept.
row_is_kept = merged_df['movie_id'].isin(kept_movie_ids) & merged_df['user_id'].isin(kept_user_ids)
merged_df = merged_df[row_is_kept]

print('\n')
print('Nouvelle taille de merged_df :', len(merged_df))

count    3706.000000
mean      269.888829
std       384.046465
min         1.000000
25%        33.000000
50%       123.500000
75%       350.000000
max      3428.000000
Name: count, dtype: float64


count    6040.000000
mean      165.597351
std       192.746879
min        20.000000
25%        44.000000
50%        96.000000
75%       208.000000
max      2314.000000
Name: count, dtype: float64


Nouvelle taille de merged_df : 942215


## Dataset Splitting for Training and Testing

This code snippet divides the dataset (`merged_df`) into training and testing sets based on the specified percentages and configuration:

1. **Configuration Flags**:
    - `use_full_training`: Determines whether to use the full dataset (`True`) or a mini dataset (`False`) for training and testing.

2. **Calculation of Training and Testing Sizes**:
    - If using a mini dataset configuration:
        - Calculates the sizes of the training and testing sets based on the specified percentages (`mini_train_percentage` and `mini_test_percentage`).
        - Slices the `merged_df` DataFrame accordingly to create the training and testing sets (`df_train` and `df_test`).
    - If using a full dataset configuration:
        - Computes the sizes of the training and testing sets based on the specified percentage for training (`full_train_percentage`).
        - Slices the `merged_df` DataFrame accordingly to generate the training and testing sets (`df_train` and `df_test`).

3. **Output**:
    - Displays the sizes of the training and testing sets to provide insight into the distribution of data for model training and evaluation.


In [379]:
# Positional split of `merged_df`: leading rows go to train, following rows
# go to test.
if use_full_training:
    train_size = int(full_train_percentage * len(merged_df))
    df_train = merged_df.iloc[:train_size]
    df_test = merged_df.iloc[train_size:]
else:
    train_size = int(mini_train_percentage * len(merged_df))
    test_size = int(mini_test_percentage * len(merged_df))
    df_train = merged_df.iloc[:train_size]
    df_test = merged_df.iloc[train_size:train_size + test_size]

print("Training Set Size:", len(df_train))
print("Testing Set Size:", len(df_test))



Training Set Size: 18844
Testing Set Size: 9422


In [380]:
# Ids that appear in the training set.
films_df_train = df_train['movie_id'].unique()
users_df_train = df_train['user_id'].unique()

# Keep only the test rows whose movie AND user both occur in the training set.
seen_in_train = df_test['movie_id'].isin(films_df_train) & df_test['user_id'].isin(users_df_train)
df_test = df_test[seen_in_train]

print(len(df_train))
print(len(df_test))

18844
1831


In [381]:
# User x movie table of ratings; pairs absent from df_train are NaN.
ratings_train = df_train.pivot(
    values='rating',
    index='user_id',
    columns='movie_id',
)

ratings_train

movie_id,1,2,3,4,5,6,7,8,10,11,...,3593,3598,3600,3602,3604,3605,3606,3608,3610,3614
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5880,,,,,,,,,,,...,,,,,,,,,,3.0
5881,,,,,,,,,,3.0,...,,,,,,,,,,
5884,,,,,,,,,,,...,,,,,,,,,,
5885,,,,,,,,,4.0,,...,,,,,,,,,,
5886,4.0,2.0,,,,4.0,,,3.0,3.0,...,,,,,,,,3.0,,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6035,4.0,,1.0,2.0,1.0,,3.0,,,4.0,...,,,,,,,,,,
6036,,,,2.0,,3.0,,,,3.0,...,,,,,,,,,,
6037,,,,,,,,,,,...,,,,,,,,,,
6039,,,,,,,,,,,...,,,,,,,,,,


In [382]:
# Missing ratings become 0, then users whose ratings sum to 0 are removed.
ratings_train = (
    ratings_train
    .fillna(0)
    .loc[lambda frame: frame.sum(axis=1) > 0]
)

# Sparse float representation with 0 as the fill value.
sparse_float = pd.SparseDtype("float", 0)
ratings_train_sparse = ratings_train.astype(sparse_float)

ratings_train_sparse

movie_id,1,2,3,4,5,6,7,8,10,11,...,3593,3598,3600,3602,3604,3605,3606,3608,3610,3614
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5880,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3.0
5881,0,0,0,0,0,0,0,0,0,3.0,...,0,0,0,0,0,0,0,0,0,0
5884,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5885,0,0,0,0,0,0,0,0,4.0,0,...,0,0,0,0,0,0,0,0,0,0
5886,4.0,2.0,0,0,0,4.0,0,0,3.0,3.0,...,0,0,0,0,0,0,0,3.0,0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6035,4.0,0,1.0,2.0,1.0,0,3.0,0,0,4.0,...,0,0,0,0,0,0,0,0,0,0
6036,0,0,0,2.0,0,3.0,0,0,0,3.0,...,0,0,0,0,0,0,0,0,0,0
6037,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6039,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [383]:
# Fixed seed so the factorization can be reproduced after Restart & Run All.
# Without it, any randomly initialised run (scikit-learn falls back to random
# initialisation when n_components exceeds the matrix dimensions) yields
# different factors, and therefore different MSE values, on every execution.
nmf_random_state = 42

nmf = NMF(
    n_components=n_components,
    max_iter=max_iter,
    random_state=nmf_random_state,
    verbose=True,
)

# U_train: output of fit_transform, one row per row of ratings_train_sparse.
# M: fitted components, one column per column of ratings_train_sparse.
U_train = nmf.fit_transform(ratings_train_sparse)
M = nmf.components_

# Reconstructed matrix, same shape as the training matrix.
pred_matrix = np.dot(U_train, M)


violation: 1.0
violation: 0.0954606032233988
violation: 4243524227.3878026
violation: 2169778521.965833
violation: 2496181202.8422008
violation: 1723575216.4554896
violation: 1483910597.7688498
violation: 1164193045.4505284
violation: 885368607.2131099
violation: 683535138.0612382
violation: 590815431.0054493
violation: 553382497.2670192
violation: 457364176.68162626
violation: 386030052.52926105
violation: 331323584.88743514
violation: 280287604.5475339
violation: 244067552.96510854
violation: 210520104.0255444
violation: 180453795.00480846
violation: 153467193.608261
violation: 132354356.07009766
violation: 113851032.60822406
violation: 97720600.89703509
violation: 83692841.05909899
violation: 71373254.01242858
violation: 58143609.158185914
violation: 45870017.3275452
violation: 38561711.38803037
violation: 32892316.139885228
violation: 26084707.907532103
violation: 22720730.172420073
violation: 19953592.431425434
violation: 17134032.47135455
violation: 14391006.89584091
violation: 1

### "Unpivot" the returned matrix to get one prediction per (user, movie) pair

In [None]:
# Label the reconstructed matrix with the users (rows) and movies (columns)
# of the training table it was computed from.
pred_df = pd.DataFrame(
    data=pred_matrix,
    index=ratings_train.index,
    columns=ratings_train.columns,
)

pred_df

### Stack the dataframe

In [None]:
# Long format: one row per (user, movie) pair with its reconstructed value.
long_predictions = pred_df.stack().reset_index()
long_predictions.columns = ['user_id', 'movie_id', 'user_movie_position']  # Rename columns
pred_df = long_predictions

### Merge the train and test dataframes with the predictions dataframe

In [None]:
# Attach the predicted value to every observed rating, matching on the
# (user, movie) pair.
join_keys = ['user_id', 'movie_id']
train_pred_df = df_train.merge(pred_df, on=join_keys)
test_pred_df = df_test.merge(pred_df, on=join_keys)

test_pred_df

In [None]:
# Mean squared error between observed ratings and predicted values.
mse_train = mean_squared_error(
    y_true=train_pred_df['rating'],
    y_pred=train_pred_df['user_movie_position'],
)
mse_test = mean_squared_error(
    y_true=test_pred_df['rating'],
    y_pred=test_pred_df['user_movie_position'],
)
pred_df

In [None]:
# Record the run in MLflow: fitted model, hyper-parameters and both errors.
with mlflow.start_run() as run:
    # Log the fitted estimator `nmf`. The class object `NMFRecommender` was
    # passed here before, which stores no trained factors at all.
    mlflow.sklearn.log_model(nmf, "NMFRecommender", registered_model_name="NMFRecommenderModel")
    mlflow.log_params({"n_components": n_components, "max_iter": max_iter})
    mlflow.log_metric("Training MSE", mse_train)
    mlflow.log_metric("Test MSE", mse_test)

In [None]:
# Rank each user's test rows by predicted value (highest first) and keep the
# first 10 rows per user.
test_pred_df = test_pred_df.sort_values(by=['user_id', 'user_movie_position'], ascending=[True, False]).reset_index(drop=True)
top_10_df = test_pred_df.groupby('user_id').head(10)

# `grouped` is reused by the Pearson cell below.
grouped = top_10_df.groupby('user_id')

# Spearman correlation between actual ratings and predicted values, per user.
# The frame is built once from a list of records instead of being grown with
# `.loc[len(df)] = ...`, which re-allocates on every appended row.
spearman_records = [
    {
        'user_id': user,
        'spearman_corr': group['rating'].corr(group['user_movie_position'], method='spearman'),
    }
    for user, group in grouped
]
spearman_results = pd.DataFrame(spearman_records, columns=['user_id', 'spearman_corr'])

spearman_results


### For each group, calculate the Pearson correlation coefficient and add it to the results dataframe


In [None]:
# Pearson correlation between actual ratings and predicted values, per user,
# over the `grouped` object created in the Spearman cell.
# The frame is built once from a list of records instead of being grown with
# `.loc[len(df)] = ...`, which re-allocates on every appended row.
pearson_records = [
    {
        'user_id': user,
        'pearson_corr': group['rating'].corr(group['user_movie_position'], method='pearson'),
    }
    for user, group in grouped
]
pearson_results = pd.DataFrame(pearson_records, columns=['user_id', 'pearson_corr'])

pearson_results

In [16]:
from normalize_matrix import normalize_matrix


def generate_dummy_matrix():
    """Return a random 3x3 integer matrix with entries drawn from [1, 10)."""
    return np.random.randint(1, 10, (3, 3))


# Demo input for normalize_matrix.
matrix = generate_dummy_matrix()

# Range settings passed to normalize_matrix.
options = dict(min_range=1, max_range=5)
normalize_matrix(matrix, options)

array([[5., 3., 2.],
       [2., 4., 3.],
       [4., 1., 1.]])