In [1]:
import surprise
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, recall_score, precision_score, f1_score, roc_auc_score, average_precision_score
from surprise import SVDpp, Reader, Dataset
from surprise.model_selection import train_test_split as surprise_train_test_split
from surprise import accuracy
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import precision_recall_curve
from sklearn.preprocessing import LabelEncoder

In [2]:
df=pd.read_csv('data_movies.csv')
df.head()

Unnamed: 0,UserID,Cr1,Cr2,Cr3,Cr4,Overall rating,MovieID,Frequency
0,1.0,6.0,6.0,8.0,12.0,8.0,2.0,1.0
1,,,,,,,,
2,1.0,9.0,11.0,10.0,9.0,10.0,26.0,2.0
3,,,,,,,,
4,1.0,6.0,10.0,9.0,8.0,7.0,61.0,3.0


In [3]:
df = df.dropna()

In [4]:
df_=df[['UserID','MovieID','Cr1','Cr2','Cr3','Cr4','Overall rating']]
df_.head()

Unnamed: 0,UserID,MovieID,Cr1,Cr2,Cr3,Cr4,Overall rating
0,1.0,2.0,6.0,6.0,8.0,12.0,8.0
2,1.0,26.0,9.0,11.0,10.0,9.0,10.0
4,1.0,61.0,6.0,10.0,9.0,8.0,7.0
6,1.0,86.0,6.0,6.0,6.0,5.0,5.0
8,1.0,132.0,10.0,11.0,10.0,9.0,10.0


In [5]:
# most popular projects based on the overall rating.
# by doing this, we want to reduce the noise, the new_df will be used for recommedation
new_df=df_.groupby('MovieID').filter(lambda x:x['Overall rating'].count() >=50)
new_df.head()

Unnamed: 0,UserID,MovieID,Cr1,Cr2,Cr3,Cr4,Overall rating
0,1.0,2.0,6.0,6.0,8.0,12.0,8.0
2,1.0,26.0,9.0,11.0,10.0,9.0,10.0
4,1.0,61.0,6.0,10.0,9.0,8.0,7.0
10,1.0,163.0,11.0,12.0,12.0,12.0,12.0
12,1.0,180.0,9.0,13.0,11.0,13.0,11.0


In [6]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50589 entries, 0 to 124310
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   UserID          50589 non-null  float64
 1   MovieID         50589 non-null  float64
 2   Cr1             50589 non-null  float64
 3   Cr2             50589 non-null  float64
 4   Cr3             50589 non-null  float64
 5   Cr4             50589 non-null  float64
 6   Overall rating  50589 non-null  float64
dtypes: float64(7)
memory usage: 3.1 MB


In [7]:
# Label encode UserID starting from 0
user_id_encoder = LabelEncoder()
new_df['UserID'] = user_id_encoder.fit_transform(new_df['UserID'])

item_encoder = LabelEncoder()
new_df['MovieID'] = item_encoder.fit_transform(new_df['MovieID'])

# Create a central split
train_indices, test_indices = train_test_split(np.arange(len(new_df)), test_size=0.2, random_state=42)


In [27]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Dense, Flatten, concatenate, Lambda, Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam, SGD, AdamW
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dot, Add
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
import pandas as pd
import inspect
import math

# Load your dataset into a pandas DataFrame
df = new_df.copy()

# Define model parameters
num_users = df['UserID'].nunique()
num_items = df['MovieID'].nunique()
embedding_dim = 30

# Use the indices to split the data
df_train = df.iloc[train_indices]
df_test = df.iloc[test_indices]

# Separate input dimensions for categorical variables
user_input_dim = num_users + 1
item_input_dim = num_items + 1


def DeepFM():
    # Define input layers
    inputs = [Input(shape=(1,), name='user_id'),
              Input(shape=(1,), name='MovieID'),
              Input(shape=(1,), name='Cr1'),
              Input(shape=(1,), name='Cr2'),
              Input(shape=(1,), name='Cr3'),
              Input(shape=(1,), name='Cr4')]

    # Define embedding layers with l2 regularization
    embedding_dims = [user_input_dim, item_input_dim]
    embedding_layers = []
    
    for i, input_layer in enumerate(inputs[:2]):
        embedding_layer = Embedding(input_dim=embedding_dims[i], output_dim=embedding_dim,
                                    input_length=1, embeddings_regularizer=l2(0.01))(input_layer)
        embedding_layers.append(embedding_layer)

    embedding_layers_flat = [Flatten()(embedding_layer) for embedding_layer in embedding_layers]

    # Flatten the embedding layers for Cr1, Cr2, Cr3, Cr4
    embedding_layers_flat += [Flatten()(embedding_layer) for embedding_layer in embedding_layers[2:]]

    # Include bias terms for user and item embeddings
    user_bias = Embedding(input_dim=user_input_dim, output_dim=1, input_length=1)(inputs[1])
    item_bias = Embedding(input_dim=item_input_dim, output_dim=1, input_length=1)(inputs[2])
    user_bias = Flatten()(user_bias)
    item_bias = Flatten()(item_bias)

    # Concatenate the embeddings and additional rating features
    concatenated = concatenate(embedding_layers_flat + inputs[2:])

    # Define a dense layer for the concatenated inputs and add L2 reguarization to the layers
    dense = Dense(units=128, activation='relu', kernel_regularizer=l2(0.01))(concatenated)
    
    # Include user and item bias terms in FM component
    user_item_bias_dot = Dot(axes=1)([user_bias, item_bias])
    
    # FM component with bias terms
    fm_components = []
    for i in range(len(embedding_layers)):
        for j in range(i + 1, len(embedding_layers)):
            dot_product = Dot(axes=1)([embedding_layers_flat[i], embedding_layers_flat[j]])
            fm_components.append(dot_product)
    
    fm_components.append(user_item_bias_dot)  # Add user-item bias term
    # FM component: Pairwise interactions of embeddings
    fm_components = []
    
    for i in range(len(embedding_layers)):
        for j in range(i + 1, len(embedding_layers)):
            dot_product = Dot(axes=1)([embedding_layers_flat[i], embedding_layers_flat[j]])
            fm_components.append(dot_product)
    
    fm_part = Add()(fm_components)
    
    # Combine FM and DNN components
    combined_output = concatenate([fm_part, dense]) 
    
    # Output layer
    overall_output = Dense(1, activation='linear', name='overall_rating')(combined_output)

    # Clip the predicted ratings to the desired range (1 to 13)
    rating_min = 1
    rating_max = 13
    clipped_output = Lambda(lambda x: tf.clip_by_value(x, rating_min, rating_max))(overall_output)

    return Model(inputs=inputs, outputs=clipped_output)



# Prepare your data in the necessary format
def prepare_data(df):
    # Convert app_rating, data_rating, and ease_rating to float
    df[['Cr1', 'Cr2', 'Cr3', 'Cr4']] = df[['Cr1', 'Cr2', 'Cr3', 'Cr4']].astype(float)
    
    user_ids = df['UserID'].values
    item_ids = df['MovieID'].values
    Cr1 = df['Cr1'].values
    Cr2 = df['Cr2'].values
    Cr3 = df['Cr3'].values
    Cr4 = df['Cr4'].values
    
    X = np.column_stack((user_ids, item_ids, Cr1, Cr2, Cr3, Cr4))
    y = None
    if 'Overall rating' in df.columns:
        y = df['Overall rating'].values
    
    return X, y

X_train, y_train = prepare_data(df_train)
X_test, y_test = prepare_data(df_test)

# Cross Validation
kf = KFold(n_splits=10, shuffle=True, random_state=42)
mae_scores = []
rmse_scores = []
best_model = None
best_val_loss = float('inf')

for train_index, val_index in kf.split(X_train):
    X_fold_train, X_val = X_train[train_index], X_train[val_index]
    y_fold_train, y_val = y_train[train_index], y_train[val_index]
   
    model_ = DeepFM()
    model_.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])

    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.0005)
    early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    history = model_.fit([X_fold_train[:, 0], X_fold_train[:, 1], X_fold_train[:, 2], X_fold_train[:, 3], 
                         X_fold_train[:, 4], X_fold_train[:, 5]], y_fold_train, 
                        validation_data=([X_val[:, 0], X_val[:, 1], X_val[:, 2], X_val[:, 3],
                                          X_val[:, 4], X_val[:, 5]], y_val),
                        epochs=50, batch_size=128, verbose=2, callbacks=[early_stop, reduce_lr])

    # saving the best model based on validation loss
    val_loss = min(history.history['val_loss'])
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = model_

# Now evaluate the best model on the test data
test_loss, test_mae = best_model.evaluate(
    [X_test[:, i] for i in range(X_test.shape[1])], y_test, verbose=2)
print(f'Test MAE: {test_mae:.4f}')

y_pred = best_model.predict(
    [X_test[:, i] for i in range(X_test.shape[1])])
rmse = math.sqrt(mean_squared_error(y_test, y_pred))
print(f'Test RMSE: {rmse:.4f}')

#saving predictions part to use X_test
original_user_ids = X_test[:, 0]
original_item_ids = X_test[:, 1]
overall_rating_predictions = pd.DataFrame({
    'UserID': original_user_ids,  
    'MovieID': original_item_ids,  
    'Overall_rating_test': y_pred.flatten()  
})

overall_rating_predictions.to_csv(f'DeepFM_prediction_cv10.csv', index=False)

Epoch 1/50


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


285/285 - 1s - loss: 87.8387 - mae: 8.7030 - val_loss: 88.2114 - val_mae: 8.7520 - lr: 0.0010 - 1s/epoch - 5ms/step
Epoch 2/50
285/285 - 1s - loss: 87.6022 - mae: 8.7030 - val_loss: 88.2105 - val_mae: 8.7520 - lr: 0.0010 - 654ms/epoch - 2ms/step
Epoch 3/50
285/285 - 1s - loss: 87.6021 - mae: 8.7030 - val_loss: 88.2105 - val_mae: 8.7520 - lr: 0.0010 - 519ms/epoch - 2ms/step
Epoch 4/50
285/285 - 0s - loss: 87.6021 - mae: 8.7030 - val_loss: 88.2105 - val_mae: 8.7520 - lr: 0.0010 - 487ms/epoch - 2ms/step
Epoch 5/50
285/285 - 1s - loss: 87.6021 - mae: 8.7030 - val_loss: 88.2105 - val_mae: 8.7520 - lr: 0.0010 - 534ms/epoch - 2ms/step
Epoch 6/50
285/285 - 1s - loss: 87.6021 - mae: 8.7030 - val_loss: 88.2105 - val_mae: 8.7520 - lr: 0.0010 - 563ms/epoch - 2ms/step
Epoch 7/50
285/285 - 0s - loss: 87.6021 - mae: 8.7030 - val_loss: 88.2105 - val_mae: 8.7520 - lr: 0.0010 - 489ms/epoch - 2ms/step
Epoch 8/50
285/285 - 1s - loss: 87.6021 - mae: 8.7030 - val_loss: 88.2105 - val_mae: 8.7520 - lr: 5.0000

Epoch 2/50
285/285 - 0s - loss: 1.3033 - mae: 0.6458 - val_loss: 1.1924 - val_mae: 0.6316 - lr: 0.0010 - 496ms/epoch - 2ms/step
Epoch 3/50
285/285 - 0s - loss: 1.2984 - mae: 0.6482 - val_loss: 1.1854 - val_mae: 0.6353 - lr: 0.0010 - 466ms/epoch - 2ms/step
Epoch 4/50
285/285 - 0s - loss: 1.2903 - mae: 0.6434 - val_loss: 1.1813 - val_mae: 0.6304 - lr: 0.0010 - 468ms/epoch - 2ms/step
Epoch 5/50
285/285 - 0s - loss: 1.2892 - mae: 0.6443 - val_loss: 1.1885 - val_mae: 0.6450 - lr: 0.0010 - 461ms/epoch - 2ms/step
Epoch 6/50
285/285 - 0s - loss: 1.2873 - mae: 0.6450 - val_loss: 1.1994 - val_mae: 0.6753 - lr: 0.0010 - 465ms/epoch - 2ms/step
Epoch 7/50
285/285 - 1s - loss: 1.2857 - mae: 0.6447 - val_loss: 1.1754 - val_mae: 0.6279 - lr: 0.0010 - 509ms/epoch - 2ms/step
Epoch 8/50
285/285 - 0s - loss: 1.2878 - mae: 0.6465 - val_loss: 1.1764 - val_mae: 0.6291 - lr: 0.0010 - 446ms/epoch - 2ms/step
Epoch 9/50
285/285 - 0s - loss: 1.2828 - mae: 0.6453 - val_loss: 1.1753 - val_mae: 0.6279 - lr: 0.0010 -

Epoch 8/50
285/285 - 0s - loss: 87.7656 - mae: 8.7150 - val_loss: 86.7386 - val_mae: 8.6442 - lr: 5.0000e-04 - 454ms/epoch - 2ms/step
Epoch 9/50
285/285 - 0s - loss: 87.7656 - mae: 8.7150 - val_loss: 86.7386 - val_mae: 8.6442 - lr: 5.0000e-04 - 473ms/epoch - 2ms/step
Epoch 10/50
285/285 - 0s - loss: 87.7656 - mae: 8.7150 - val_loss: 86.7386 - val_mae: 8.6442 - lr: 5.0000e-04 - 444ms/epoch - 2ms/step
Epoch 11/50
285/285 - 0s - loss: 87.7656 - mae: 8.7150 - val_loss: 86.7386 - val_mae: 8.6442 - lr: 5.0000e-04 - 447ms/epoch - 2ms/step
Epoch 12/50
285/285 - 0s - loss: 87.7656 - mae: 8.7150 - val_loss: 86.7386 - val_mae: 8.6442 - lr: 5.0000e-04 - 444ms/epoch - 2ms/step
Epoch 1/50
285/285 - 1s - loss: 87.9168 - mae: 8.7105 - val_loss: 87.4931 - val_mae: 8.6847 - lr: 0.0010 - 1s/epoch - 4ms/step
Epoch 2/50
285/285 - 0s - loss: 87.6820 - mae: 8.7105 - val_loss: 87.4922 - val_mae: 8.6847 - lr: 0.0010 - 467ms/epoch - 2ms/step
Epoch 3/50
285/285 - 0s - loss: 87.6819 - mae: 8.7105 - val_loss: 87.4

Epoch 47/50
285/285 - 0s - loss: 1.2562 - mae: 0.6439 - val_loss: 1.1405 - val_mae: 0.6077 - lr: 5.0000e-04 - 455ms/epoch - 2ms/step
Epoch 48/50
285/285 - 0s - loss: 1.2555 - mae: 0.6447 - val_loss: 1.1606 - val_mae: 0.6135 - lr: 5.0000e-04 - 447ms/epoch - 2ms/step
Epoch 49/50
285/285 - 0s - loss: 1.2550 - mae: 0.6436 - val_loss: 1.1646 - val_mae: 0.6618 - lr: 5.0000e-04 - 443ms/epoch - 2ms/step
Epoch 50/50
285/285 - 0s - loss: 1.2547 - mae: 0.6416 - val_loss: 1.1399 - val_mae: 0.6078 - lr: 5.0000e-04 - 482ms/epoch - 2ms/step
Epoch 1/50
285/285 - 1s - loss: 87.6708 - mae: 8.6922 - val_loss: 89.6636 - val_mae: 8.8493 - lr: 0.0010 - 1s/epoch - 4ms/step
Epoch 2/50
285/285 - 0s - loss: 87.4408 - mae: 8.6922 - val_loss: 89.6627 - val_mae: 8.8493 - lr: 0.0010 - 472ms/epoch - 2ms/step
Epoch 3/50
285/285 - 0s - loss: 87.4408 - mae: 8.6922 - val_loss: 89.6627 - val_mae: 8.8493 - lr: 0.0010 - 449ms/epoch - 2ms/step
Epoch 4/50
285/285 - 0s - loss: 87.4408 - mae: 8.6922 - val_loss: 89.6627 - val_m

Epoch 36/50
285/285 - 0s - loss: 1.2525 - mae: 0.6421 - val_loss: 1.2168 - val_mae: 0.6733 - lr: 5.0000e-04 - 450ms/epoch - 2ms/step
Epoch 37/50
285/285 - 0s - loss: 1.2532 - mae: 0.6436 - val_loss: 1.2000 - val_mae: 0.6373 - lr: 5.0000e-04 - 445ms/epoch - 2ms/step
Epoch 38/50
285/285 - 0s - loss: 1.2574 - mae: 0.6482 - val_loss: 1.2079 - val_mae: 0.6224 - lr: 5.0000e-04 - 474ms/epoch - 2ms/step
Epoch 39/50
285/285 - 0s - loss: 1.2517 - mae: 0.6412 - val_loss: 1.2043 - val_mae: 0.6226 - lr: 5.0000e-04 - 468ms/epoch - 2ms/step
Epoch 40/50
285/285 - 1s - loss: 1.2516 - mae: 0.6408 - val_loss: 1.2016 - val_mae: 0.6423 - lr: 5.0000e-04 - 538ms/epoch - 2ms/step
Epoch 41/50
285/285 - 0s - loss: 1.2520 - mae: 0.6422 - val_loss: 1.2014 - val_mae: 0.6221 - lr: 5.0000e-04 - 494ms/epoch - 2ms/step
Epoch 42/50
285/285 - 0s - loss: 1.2490 - mae: 0.6398 - val_loss: 1.1962 - val_mae: 0.6256 - lr: 5.0000e-04 - 452ms/epoch - 2ms/step
Epoch 43/50
285/285 - 0s - loss: 1.2496 - mae: 0.6412 - val_loss: 1.2

In [9]:
from surprise import Dataset, Reader, SVDpp
from surprise.model_selection import GridSearchCV
import pandas as pd

# We'll extract each criterion rating and train a separate SVD++ model for each of them.
new_df = new_df.copy()

criterion_names = ['Cr1', 'Cr2', 'Cr3', 'Cr4']

# Create a reader object with rating_scale (assuming ratings are between 1 to 13)
reader = Reader(rating_scale=(1, 13))

# Placeholder for best estimators for each criterion
best_estimators = {}
predictions_dfs = {}

for criterion in criterion_names:
    # Convert the DataFrame to Surprise dataset format using only 'UserID', 'Item', and the current criterion
    data_surprise = Dataset.load_from_df(new_df[['UserID', 'MovieID', criterion]], reader)

    # Convert the entire dataset into a trainset format to facilitate manual train/test splitting
    full_trainset = data_surprise.build_full_trainset()

    # Manual train/test split using the given indices
    train_rows = new_df.iloc[train_indices]
    test_rows = new_df.iloc[test_indices]
    trainset = Dataset.load_from_df(train_rows[['UserID', 'MovieID', criterion]], reader).build_full_trainset()
    testset = [(row['UserID'], row['MovieID'], row[criterion]) for _, row in test_rows.iterrows()]

    # Define the parameter grid for hyperparameter tuning
    param_grid = {
        'n_epochs': [10, 20, 30, 40, 50],
        'lr_all': [0.0005, 0.005, 0.008, 0.01, 0.05],
        'reg_all': [0.01, 0.02, 0.04, 0.06, 0.1, 0.2]
    }

    # Initialize GridSearchCV with the SVD++ algorithm and the parameter grid
    gs = GridSearchCV(SVDpp, param_grid, measures=['rmse'], cv=10)

    # Fit the GridSearchCV to the training data
    gs.fit(data_surprise)

    # Get the best parameters and estimator from the grid search
    best_params = gs.best_params['rmse']
    best_estimator = gs.best_estimator['rmse']

    # Save the best estimator for the current criterion
    best_estimators[criterion] = best_estimator

    # Print the best parameters for the current criterion
    print(f'Best Parameters for {criterion} using svdpp: {best_params}')

    # Train the best estimator on the training set
    best_estimator.fit(trainset)
    
    # Make predictions for the test set
    predictions_svdpp = best_estimator.test(testset)

    # Evaluate the model
    from surprise import accuracy
    accuracy.rmse(predictions_svdpp)
    # Define the threshold RMSE values

    # Create a DataFrame from predictions_svd
    predictions_df = pd.DataFrame(predictions_svdpp, columns=["UserID", "MovieID", "Rating", "Est", "Details"])
    
    # Store the DataFrame for the current criterion
    predictions_dfs[criterion] = predictions_df

    # Save to CSV
    predictions_df.to_csv(f'predictions_cv10_{criterion}.csv', index=False)
    
    print(f'Saved predictions for {criterion} to predictions_{criterion}.csv')


Best Parameters for Cr1 using svdpp: {'n_epochs': 10, 'lr_all': 0.008, 'reg_all': 0.1}
RMSE: 2.7636
Saved predictions for Cr1 to predictions_Cr1.csv
Best Parameters for Cr2 using svdpp: {'n_epochs': 10, 'lr_all': 0.008, 'reg_all': 0.1}
RMSE: 2.5141
Saved predictions for Cr2 to predictions_Cr2.csv
Best Parameters for Cr3 using svdpp: {'n_epochs': 10, 'lr_all': 0.008, 'reg_all': 0.1}
RMSE: 2.7326
Saved predictions for Cr3 to predictions_Cr3.csv
Best Parameters for Cr4 using svdpp: {'n_epochs': 10, 'lr_all': 0.008, 'reg_all': 0.1}
RMSE: 2.5636
Saved predictions for Cr4 to predictions_Cr4.csv


In [28]:
# Retrieve the single criterion ratings predicted by svd++ 
rating_cr1 = pd.read_csv("predictions_cv10_Cr1.csv")
rating_cr2 = pd.read_csv("predictions_cv10_Cr2.csv")
rating_cr3 = pd.read_csv("predictions_cv10_Cr3.csv")
rating_cr4 = pd.read_csv("predictions_cv10_Cr4.csv")

print(rating_cr1.shape, rating_cr2.shape, rating_cr3.shape, rating_cr4.shape)

(10118, 5) (10118, 5) (10118, 5) (10118, 5)


In [29]:
rating_cr1 = rating_cr1.drop(['Rating','Details'], axis = 1)
rating_cr2 = rating_cr2.drop(['Rating','Details'], axis = 1)
rating_cr3 = rating_cr3.drop(['Rating','Details'], axis = 1)
rating_cr4= rating_cr4.drop(['Rating','Details'], axis = 1)

In [30]:
rating_cr1 = rating_cr1.rename(columns={'Est':'Cr1'})
rating_cr2 = rating_cr2.rename(columns={'Est':'Cr2'})
rating_cr3 = rating_cr3.rename(columns={'Est':'Cr3'})
rating_cr4= rating_cr4.rename(columns={'Est':'Cr4'})
rating_cr4.head()

Unnamed: 0,UserID,MovieID,Cr4
0,1603.0,344.0,12.014371
1,3182.0,351.0,11.201449
2,986.0,34.0,9.011319
3,4877.0,78.0,9.740197
4,440.0,212.0,10.882823


In [31]:
merged_df = pd.merge(rating_cr1, rating_cr2, on=['UserID','MovieID'], how='inner')
merged_df = pd.merge(merged_df, rating_cr3, on=['UserID','MovieID'], how='inner')
merged_df = pd.merge(merged_df, rating_cr4, on=['UserID','MovieID'], how='inner')

merged_df.shape

(10118, 6)

In [32]:
merged_df.tail(10)

Unnamed: 0,UserID,MovieID,Cr1,Cr2,Cr3,Cr4
10108,2914.0,180.0,11.167881,12.458519,11.606266,12.232017
10109,3414.0,243.0,10.670886,9.800635,9.037264,10.238819
10110,1933.0,143.0,10.21206,10.547109,9.611898,9.550647
10111,2801.0,123.0,10.547446,10.184538,10.247824,11.404097
10112,4413.0,12.0,12.129064,11.578515,11.787942,12.10587
10113,5147.0,303.0,12.667224,12.327421,12.97446,13.0
10114,5492.0,72.0,6.636187,4.800916,3.774197,5.004955
10115,5798.0,120.0,7.369267,9.453452,8.403788,9.84
10116,1067.0,317.0,11.930094,12.649473,12.530102,12.320379
10117,1407.0,356.0,8.49769,8.147343,7.903776,9.537482


In [33]:
overall_rating = pd.read_csv("DeepFM_prediction_cv10.csv")
overall_rating.head()

Unnamed: 0,UserID,MovieID,Overall_rating_test
0,1603.0,344.0,12.177798
1,3182.0,351.0,9.431672
2,986.0,34.0,9.985678
3,4877.0,78.0,11.696533
4,440.0,212.0,12.531322


In [34]:
def predicted_aggregated_ratings(y_true, df, deepfm_model):
    # Prepare the data using your prepare_data method
    X, _ = prepare_data(df)  # You only need the feature matrix X
    
    X = np.array(X)
    
    # Ensure these are flattened as your model expects 1D arrays
    user_ids, item_ids, Cr1, Cr2, Cr3, Cr4 = [X[:, i] for i in range(6)]

    # Prepare inputs for your model
    inputs = [user_ids, item_ids, Cr1, Cr2, Cr3, Cr4]
    
    # Predict the ratings
    predictions = deepfm_model.predict(inputs)
    
    # Get the last prediction, which corresponds to the overall rating
    predicted_ratings = predictions.flatten()
    item_ids = df['MovieID'].values
    user_ids = df['UserID'].values
    ratings = y_true['Overall rating'].values
 
    predicted_ratings_df = pd.DataFrame({'UserID': user_ids, 'MovieID': item_ids, 'Predicted_Rating':predicted_ratings, 'Overall rating':ratings})
    
    return predicted_ratings_df


In [35]:
y_true=df_test[['UserID','MovieID','Overall rating']]
y_pred=predicted_aggregated_ratings(y_true, merged_df, model_)

y_pred.head()



Unnamed: 0,UserID,MovieID,Predicted_Rating,Overall rating
0,1603.0,344.0,10.859381,12.0
1,3182.0,351.0,9.992766,9.0
2,986.0,34.0,7.345503,10.0
3,4877.0,78.0,10.59069,12.0
4,440.0,212.0,11.772307,13.0


In [36]:
pred_df = pd.merge(y_pred, overall_rating, on=['UserID','MovieID'], how='inner')
pred_df=pred_df[['UserID','MovieID', 'Overall rating', 'Overall_rating_test', 'Predicted_Rating']]

# Check the distribution of features and target variable in the new data
pred_df.describe()

Unnamed: 0,UserID,MovieID,Overall rating,Overall_rating_test,Predicted_Rating
count,10118.0,10118.0,10118.0,10118.0,10118.0
mean,3005.177901,187.807867,9.671575,9.715253,9.884755
std,1751.481112,103.928728,3.470368,3.341748,2.12964
min,0.0,0.0,1.0,1.0,1.0
25%,1490.0,103.0,8.0,8.274809,8.811824
50%,3014.0,189.0,11.0,10.848649,10.457968
75%,4498.5,275.0,12.0,12.238329,11.365645
max,6071.0,367.0,13.0,13.0,13.0


In [37]:
# Combine the original DeepFM predictions made on the test set (Overall_rating) with the new DeepFM predictions made on the SVD++ output.
pred_df['Avg_Predicted_Rating'] = (pred_df['Overall_rating_test']+ pred_df['Predicted_Rating'])/2
pred_df.head()

Unnamed: 0,UserID,MovieID,Overall rating,Overall_rating_test,Predicted_Rating,Avg_Predicted_Rating
0,1603.0,344.0,12.0,12.177798,10.859381,11.518589
1,3182.0,351.0,9.0,9.431672,9.992766,9.712219
2,986.0,34.0,10.0,9.985678,7.345503,8.665591
3,4877.0,78.0,12.0,11.696533,10.59069,11.143611
4,440.0,212.0,13.0,12.531322,11.772307,12.151815


In [38]:
def get_top_n_recommendations(y_pred, n):
    user_ids = y_pred['UserID'].values
    
    predicted_ratings_df = y_pred
    
    # This will sort the predictions within each group, then take the top n for each group
    top_n_recommendations = predicted_ratings_df.groupby('UserID').apply(lambda x: x.nlargest(n, 'Avg_Predicted_Rating')).reset_index(drop=True)
    return top_n_recommendations


# Get the top N recommendations (e.g., top 10 recommendations)
N = 5  # Replace with the number of top recommendations you want   
# Usage:
y_pred = pred_df
top_N_recommendations= get_top_n_recommendations(y_pred, N)
top_N_recommendations

Unnamed: 0,UserID,MovieID,Overall rating,Overall_rating_test,Predicted_Rating,Avg_Predicted_Rating
0,0.0,70.0,11.0,10.912377,10.805765,10.859071
1,0.0,91.0,10.0,9.249566,9.804258,9.526912
2,0.0,82.0,9.0,9.279712,9.404637,9.342175
3,0.0,329.0,9.0,8.484217,9.620522,9.052369
4,0.0,356.0,5.0,6.779758,8.451222,7.615490
...,...,...,...,...,...,...
9179,6067.0,165.0,11.0,11.060246,11.840624,11.450435
9180,6069.0,158.0,13.0,13.000000,11.835013,12.417507
9181,6069.0,356.0,13.0,13.000000,11.222615,12.111308
9182,6069.0,137.0,10.0,10.673219,10.342790,10.508004


In [39]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, recall_score, precision_score, f1_score, auc, average_precision_score
from sklearn.metrics import roc_curve

def evaluation_metrics(y_pred, n):
    top_n = get_top_n_recommendations(y_pred, n)
    true_ratings = top_n['Overall rating'].values
    prediction_ratings = top_n['Avg_Predicted_Rating'].values
    
    # Convert ratings to binary (e.g., positive/negative) for classification metrics
    binary_true_ratings = (true_ratings > 9).astype(int)
    binary_predictions = (np.array(prediction_ratings) > 9).astype(int)


    rmse = np.sqrt(mean_squared_error(true_ratings, prediction_ratings))

    # MAE (Mean Absolute Error)
    mae = mean_absolute_error(true_ratings, prediction_ratings)

    # Recall
    recall = recall_score(binary_true_ratings, binary_predictions)

    # Precision
    precision = precision_score(binary_true_ratings, binary_predictions)


    # F1-score
    f1 = f1_score(binary_true_ratings, binary_predictions)


    prediction_ratings = np.array(prediction_ratings)
    # Create a mask for pairs where true_rating1 < true_rating2
    mask = true_ratings[:, np.newaxis] < true_ratings

    # Calculate FCP using vectorized operations
    concordant_pairs = np.sum(mask * (prediction_ratings[:, np.newaxis] < prediction_ratings))
    discordant_pairs = np.sum(mask * (prediction_ratings[:, np.newaxis] > prediction_ratings))

    # Calculate FCP
    fcp = concordant_pairs / (concordant_pairs + discordant_pairs)


    # MAP (Mean Average Precision)
    avg_precision = average_precision_score(binary_true_ratings, prediction_ratings)

    # AUC (Area Under the Curve)
    fpr, tpr, _ = roc_curve(binary_true_ratings, prediction_ratings)
    auc_score = auc(fpr, tpr)



    print(f'RMSE: {rmse:.4f}')
    print(f'MAE: {mae:.4f}')
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'F-Measure: {f1:.4f}')
    print(f'MAP: {avg_precision:.4f}')
    print(f"AUC: {auc_score: .4f}")
    print(f'FCP: {fcp: .4f}')
    #print(f'NDCG: {ndcg: .4f}')

    
evaluation_metrics(y_pred, n=10)

RMSE: 1.6465
MAE: 1.1844
Precision: 0.8743
Recall: 0.9660
F-Measure: 0.9178
MAP: 0.9735
AUC:  0.9569
FCP:  0.8896


### Top 10 5-Fold
RMSE: 1.6419
MAE: 1.2071
Precision: 0.8812
Recall: 0.9652
F-Measure: 0.9213
MAP: 0.9751
AUC:  0.9597
FCP:  0.8959

### Top 20 5-Fold
RMSE: 1.6434
MAE: 1.2092
Precision: 0.8802
Recall: 0.9636
F-Measure: 0.9200
MAP: 0.9746
AUC:  0.9601
FCP:  0.8971

### Top 10 10-Fold
RMSE: 1.6521
MAE: 1.2136
Precision: 0.8822
Recall: 0.9640
F-Measure: 0.9213
MAP: 0.9747
AUC:  0.9595
FCP:  0.8961

RMSE: 1.6571
MAE: 1.2222
Precision: 0.8795
Recall: 0.9654
F-Measure: 0.9204
MAP: 0.9749
AUC:  0.9596
FCP:  0.8966

### Top 20 10-Fold
RMSE: 1.6530
MAE: 1.2156
Precision: 0.8810
Recall: 0.9628
F-Measure: 0.9201
MAP: 0.9741
AUC:  0.9599
FCP:  0.8972




In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

predicted_ratings = pred_df['Avg_Predicted_Rating']
original_ratings = pred_df['Overall rating']

# Compute the correlation Coefficient
corr = original_ratings.corr(predicted_ratings)
plt.annotate(f'Correlation: {corr:.3f}', xy=(0.5, 0.1), xycoords='axes fraction')

# Create a scatter plot
plt.scatter(original_ratings, predicted_ratings)
plt.title("Correlation Between the Original Ratings and DeepFM_SVD++_MCRS Predicted Ratings")
plt.xlabel("Original Ratings")
plt.ylabel("DeepFM_SVD++_MCRS Predicted Ratings")
plt.grid(True)

# You can add a diagonal line for reference (perfect correlation)
plt.plot([min(original_ratings), max(original_ratings)], [min(original_ratings), max(original_ratings)], color='red', linestyle='--')

# Show the plot
plt.show()