In [1]:
import pandas as pd
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy
from typing import List

def preprocess(dataset_name: str = 'Dataset.csv') -> pd.DataFrame: 
    # Load the dataset (excluding Description and InvoiceDate columns)
    df = pd.read_csv(dataset_name, usecols=['CustomerID', 'StockCode', 'Quantity'])

    # Preprocess the data (drop NaN rows and convert to integer)
    df.dropna(subset=['CustomerID', 'StockCode', 'Quantity'], inplace=True)
    # drop StockCode with non-numeric values
    df = df[df['StockCode'].str.isnumeric()]

    df = df.astype({'CustomerID': int, 'StockCode': int, 'Quantity': int})
    return df


def split_df(df: pd.DataFrame, test_size: float = 0.2) -> (pd.DataFrame, pd.DataFrame):
    users = df['CustomerID'].unique()
    test_len = test_size*(len(users))
    test_customers = users[:int(test_len)]
    train_customers = users[int(test_len):]

    # make the train df have unique customers from test df to avoid data leakage
    df_train = df[df['CustomerID'].isin(train_customers)]
    
    df_test = df[df['CustomerID'].isin(test_customers)]
    
    # Create  Surprise Dataset
    reader = Reader(rating_scale=(1, 5))
    trainset = Dataset.load_from_df(df_train[['CustomerID', 'StockCode', 'Quantity']], reader)
    testset = Dataset.load_from_df(df_test[['CustomerID', 'StockCode', 'Quantity']], reader)

    # the folowing 2 lines were just work arounds in order to make itterable objects suitable for the SVD fit method
    trainset, _ = train_test_split(trainset, test_size=0.01, random_state=42)
    _, testset = train_test_split(testset, test_size=0.09, random_state=42)

    return trainset, testset

# Function to get item recommendations for a given user
def get_item_recommendations(user_id:int, dataset_path:str, model, num_recommendations=5)->List[int]:
    # Load the dataset
    df =preprocess(dataset_path)
    items_already_purchased = df[df['CustomerID'] == user_id]['StockCode'].tolist()
    
    # Get all item IDs
    item_ids = df['StockCode'].unique()
    
    # Remove items already purchased
    items_to_predict = [item_id for item_id in item_ids if item_id not in items_already_purchased]
    
    # Predict ratings for items
    predictions = [model.predict(user_id, item_id) for item_id in items_to_predict]
    
    # Sort predictions by predicted rating
    predictions.sort(key=lambda x: x.est, reverse=True)
    
    # Get top N recommended items
    top_recommendations = [prediction.iid for prediction in predictions[:num_recommendations]]
    return top_recommendations

df = preprocess('Dataset.csv')
trainset, testset = split_df(df)

# Train the model
model = SVD(n_factors=50, random_state=42)
model.fit(trainset)

# Make predictions
predictions = model.test(testset)

# Evaluate the model
accuracy.rmse(predictions)



RMSE: 44.1370


44.13696102551999

In [12]:
from surprise.accuracy import mae
from surprise.model_selection import cross_validate
from sklearn.metrics import precision_score, recall_score, f1_score

# Calculate MAE using Surprise's built-in function
mae_score = mae(predictions)

# Convert predictions to binary recommendations (1 for predicted ratings >= 3, 0 otherwise)

# the est is the estimate of the matrix that came from the SVD
binary_predictions = [1 if prediction.est >= 3 else 0 for prediction in predictions]
# the r_ui is the actual rating
binary_actuals = [1 if prediction.r_ui >= 3 else 0 for prediction in predictions]

# Calculate Precision, Recall, and F1-Score using scikit-learn
precision = precision_score(binary_actuals, binary_predictions)
recall = recall_score(binary_actuals, binary_predictions)
f1 = f1_score(binary_actuals, binary_predictions)

print("MAE:", mae_score)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)


MAE:  11.9272
MAE: 11.927197700980031
Precision: 0.6513153046938325
Recall: 1.0
F1-Score: 0.7888442659526997


In [3]:


# Get item recommendations for a specific user
user_id = 17850  # Replace with a valid CustomerID
recommendations = get_item_recommendations(user_id,'Dataset.csv',model)
print("Item recommendations for User ID:", user_id)
print(recommendations)

Item recommendations for User ID: 17850
[84879, 22745, 22748, 22749, 22310]


In [8]:
from surprise import dump

# Assuming you have trained the model as 'model'
model_filename = 'trained_model.pkl'  # Specify the desired file name for the model

# Save the model to the current working directory
dump.dump(model_filename, algo=model)

In [10]:
loaded_model = dump.load('trained_model.pkl')[1]
recommendations = get_item_recommendations(user_id,'Dataset.csv',loaded_model)
print(recommendations)

[84879, 22745, 22748, 22749, 22310]
