In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.sparse import csr_matrix
from sklearn.metrics import ndcg_score

In [4]:
data = pd.read_json("Industrial_and_Scientific_5.json",lines = True)
print(data.columns)

Index(['overall', 'verified', 'reviewTime', 'reviewerID', 'asin', 'style',
       'reviewerName', 'reviewText', 'summary', 'unixReviewTime', 'vote',
       'image'],
      dtype='object')


In [None]:
def split(value):
    # shuffled_group = value.sample(frac=1, random_state = 42)
    # shuffle the df around
    shuffled_group = value.sample(frac=1)
    # get size of 80% of the df
    train_size = round(len(value)*0.8)
    # use the size to get last size elements and remaining 
    train = shuffled_group.iloc[:train_size]
    test = shuffled_group.iloc[train_size:]
    return train, test


train_list = []
test_list = []

# group each of the users and split them into training and testing
for i, j in data.groupby('reviewerID'):
    result = split(j)
    train_list.append(result[0])
    test_list.append(result[1])

training = pd.concat(train_list)
testing = pd.concat(test_list)

print(f'(test:{testing.shape}\n training:{training.shape})')



(test:(14764, 12)
 training:(62307, 12))


In [6]:
# prepare the utility matrix
def create_utility_matrix(df):
    return df.pivot(index='reviewerID', columns='asin', values='overall')

# calculate item-item similarity matrix
def compute_item_similarity(utility_matrix):
    item_similarity = cosine_similarity(utility_matrix.fillna(0).T)
    return pd.DataFrame(item_similarity, index=utility_matrix.columns, columns=utility_matrix.columns)

# predict ratings using item-item collaborative filtering
def predict_rating(user, item, utility_matrix, similarity_matrix, k=5):
    # handle missing items, default to global mean
    if item not in similarity_matrix.columns:
        return utility_matrix.mean().mean()  
    
    # get k most similar items
    similar_items = similarity_matrix[item].drop(item, errors='ignore').nlargest(k)
    user_ratings = utility_matrix.loc[user, similar_items.index].dropna()

    # handle cases where the user hasn't rated similar items
    if user_ratings.empty:
        return utility_matrix[item].mean() if item in utility_matrix else utility_matrix.mean().mean()
    
    # Weighted average prediction
    weighted_sum = (similar_items[user_ratings.index] * user_ratings).sum()
    normalization = similar_items[user_ratings.index].sum()
    return weighted_sum / normalization


def predict_test_data(train_data, test_data):
    train_data = train_data.groupby(['reviewerID', 'asin'], as_index=False).agg({'overall': 'mean'})
    # create utility matrix
    utility_matrix = create_utility_matrix(train_data)
    
    # compute similarity matrix
    similarity_matrix = compute_item_similarity(utility_matrix)

    # predict ratings for the test data
    predictions = []
    for i, row in test_data.iterrows():
        user, item = row['reviewerID'], row['asin']
        if user in utility_matrix.index:
            pred = predict_rating(user, item, utility_matrix, similarity_matrix)
        else:
    # global mean for unknown users
            pred = train_data['overall'].mean()  
        predictions.append(pred)

    return predictions

def evaluate_predictions(test_data, predictions):
    test_ratings = test_data['overall']
    rmse = np.sqrt(mean_squared_error(test_ratings, predictions))
    mae = mean_absolute_error(test_ratings, predictions)
    return rmse, mae

In [7]:
predictions = predict_test_data(training, testing)
print(predictions)

[4.446428571428571, 4.777777777777778, 5.0, 4.842105263157895, 3.9411764705882355, 4.5, 5.0, 5.0, 4.866666666666666, 4.571428571428571, 4.617647058823529, 4.625, 4.9411764705882355, 4.632, 4.717391304347826, 4.785714285714286, 4.6, 4.675, 4.180722891566265, 5.0, 5.0, 4.666666666666667, 5.0, 4.571428571428571, 4.4021739130434785, 4.734177215189874, 4.5, 4.25, 4.642424242424243, 4.75, 4.146341463414634, 4.583333333333333, 4.529411764705882, 4.3061224489795915, 4.666666666666667, 3.7, 4.717391304347826, 4.666666666666667, 5.0, 5.0, 4.928571428571429, 3.7777777777777777, 4.785714285714286, 5.0, 4.25, 4.75, 4.5, 4.631578947368421, 4.615384615384615, 4.75, 4.727891156462585, 4.695652173913044, 4.163934426229508, 4.529411764705882, 4.7631578947368425, 4.133333333333334, 4.8, 5.0, 4.615384615384615, 4.261904761904762, 4.666666666666667, 5.0, 4.45, 4.25, 4.428571428571429, 4.6, 4.666666666666667, 4.794444444444444, 4.6, 5.0, 5.0, 4.6982248520710055, 5.0, 4.25, 4.636363636363637, 5.0, 4.82608695

In [9]:
rmse, mae = evaluate_predictions(testing, predictions)
print(f"RMSE: {rmse:.4f}, MAE: {mae:.4f}")

RMSE: 0.9146, MAE: 0.5796


In [35]:
# create a user-item matrix
def create(data):
    item_map = {item: idx for idx, item in enumerate(data['asin'].unique())}
    user_map = {user: idx for idx, user in enumerate(data['reviewerID'].unique())}
    
    item = data['asin'].map(item_map)
    user = data['reviewerID'].map(user_map)
    
    sparse_matrix = csr_matrix(
        (data['overall'], (user, item)),
        shape=(len(user_map), len(item_map))
    )
    
    # convert to float
    sparse_matrix = sparse_matrix.astype(np.float32)
    
    return sparse_matrix, user_map, item_map

sparse_user_item_matrix, user_map, item_map = create(training)

def SVD(sparse_matrix, k=250):
    dense_matrix = sparse_matrix.toarray()  
    # perform full SVD
    U, Sigma_full, Vt = np.linalg.svd(dense_matrix, full_matrices=False) 
    # keep only top-k  values 
    Sigma = np.diag(Sigma_full[:k])  
    # truncate U
    U = U[:, :k]
    # truncate Vt
    Vt = Vt[:k, :]  
    return U, Sigma, Vt

U, Sigma, Vt = SVD(sparse_user_item_matrix, k=250)

# predict ratings for a specific user
def predict(user_id, user_map, item_map, sparse_matrix, U, Sigma, Vt, top_n=10):

    user_idx = user_map[user_id]
    user_ratings = np.dot(np.dot(U[user_idx, :], Sigma), Vt)
    
    # items the user has already rated
    user_rated_items = sparse_matrix[user_idx].nonzero()[1]
    
    # ignore already rated items
    user_ratings[user_rated_items] = -np.inf  
    
    # top-n items
    top_item_indices = np.argsort(user_ratings)[::-1][:top_n]
    reverse_item_mapping = {idx: item for item, idx in item_map.items()}
    return [reverse_item_mapping[idx] for idx in top_item_indices]

# recommendations for all users
def recommendations(sparse_matrix, user_map, item_map, U, Sigma, Vt, top_n=10):    
    reverse_user_mapping = {idx: user for user, idx in user_map.items()}
    rec = {}
    
    for i, uid in enumerate(reverse_user_mapping.values()):
        recs = predict(uid, user_map, item_map, sparse_matrix, U, Sigma, Vt, top_n)
        rec[uid] = recs
    
    return rec


In [None]:
# generate recommendations for all users
rec = recommendations(sparse_user_item_matrix, user_map, item_map, U, Sigma, Vt, top_n=10 )

In [37]:
# convert to df
recommendations_df = pd.DataFrame([
    {"reviewerID": user, "recommendations": recs}
    for user, recs in rec.items()
])

print(recommendations_df)

                 reviewerID                                    recommendations
0      A0096681Y127OL1H8W3U  [B001E5ZWT4, B0015E2FDK, B002WJHE7E, B004Y960M...
1      A0196552RI15HI7JB9PW  [B00IP87H5U, B0013RVDG4, B005W42SW2, B00191IZB...
2      A0289048PRWFY7ZXQKCD  [B0061OT1A4, B0013TVEWA, B0014DEXNM, B00DXPQIF...
3      A0455940O5EUXQDU46QL  [B00YQB85PG, B00YSOZFQI, B00YQBF0ZY, B00YQBBZ8...
4      A07936821FOVJO6NP4Q8  [B00E8JPEMS, B004QXUR46, B00E8JN1DC, B000BODTK...
...                     ...                                                ...
11036         AZXS6P5QWNMLC  [B0019QXACO, B00O47ILVA, B0013AX6FC, B00002NC3...
11037         AZY0M1ANDSEPL  [B00ME7424W, B00ME7A1II, B00ME74ZJY, B00MAZFQ7...
11038         AZYIBG912W011  [B00002NC3K, B000BODTKI, B00137PW6Q, B00137PA9...
11039         AZYPAWSYSCISH  [B0013CQ6DY, B00004WCCP, B00AMGUZ70, B0013CCCA...
11040         AZZGB87A5N9QI  [B00MEZEEJ2, B00AA5AICU, B00YQB85PG, B00ZAUQZT...

[11041 rows x 2 columns]


In [None]:
def calculate_metrics(test_data, recommendations, top_n=10):
    precision_list = []
    recall_list = []
    ndcg_list = []
    
    # group test data by user
    test_data_grouped = test_data.groupby('reviewerID')['asin'].apply(set).to_dict()

    for user, recommended_items in recommendations.items():
        # get the actual items from the test set for user
        actual_items = test_data_grouped.get(user, set())
        if not actual_items: 
            continue
        # convert recommended items to a set for intersection calculation
        recommended_set = set(recommended_items[:top_n])
        
        # calculate precision and recall
        relevant_items = recommended_set.intersection(actual_items)
        precision = len(relevant_items) / top_n
        recall = len(relevant_items) / len(actual_items)
        
        # calculate NDCG
        relevance = [1 if item in actual_items else 0 for item in recommended_items[:top_n]]
        ndcg = ndcg_score([relevance], [list(range(len(relevance), 0, -1))])
        
        #append to list
        precision_list.append(precision)
        recall_list.append(recall)
        ndcg_list.append(ndcg)
    
    # calculate F-measure
    f_measure = (2 * avg_precision * avg_recall) / (avg_precision + avg_recall) if (avg_precision + avg_recall) > 0 else 0
    
    avg_precision = np.mean(precision_list) if precision_list else 0
    avg_recall = np.mean(recall_list) if recall_list else 0
    avg_ndcg = np.mean(ndcg_list) if ndcg_list else 0
    
    return {
        "Precision": avg_precision,
        "Recall": avg_recall,
        "F-measure": f_measure,
        "NDCG": avg_ndcg
    }

metrics = calculate_metrics(testing, rec, top_n=10)

print("Evaluation Metrics:")
print(f"Precision: {metrics['Precision']:.3f}")
print(f"Recall: {metrics['Recall']:.3f}")
print(f"F-measure: {metrics['F-measure']:.3f}")
print(f"NDCG: {metrics['NDCG']:.3f}")


Evaluation Metrics:
Precision: 0.010
Recall: 0.074
F-measure: 0.017
NDCG: 0.067
