In [1]:
# read user-item rating matrix
#item-based CF
#normalizee rating by mean-centring for each user Su,p=ru,p - mean(ratings of user u)
#calculate similarity for co-rated items (items rated by common users)
#compute item-item similarity (adjusted cosine, pearson, jaccard)
# k closest items highly similar items to target item
#calculate predicted rating using wheighted average

# Malak Amgad 221100451

In [2]:
#Apply item-based collaborative filtering using Cosine similarity with mean-centering.
#Identify the top 20% of similar items for each target item.
#Predict missing ratings using these items.
#Compute DF and DS.
#Select top 20% items using DS.
#Use these for updated rating predictions.
#Compare similarity lists from steps 2 and 5. Provide commentary.
#Compare predicted ratings from steps 3 and 6. Discuss.
#Give your comments in a separate section in your report.

#### preprocessing

In [3]:
import numpy as np
import pandas as pd
import os
from scipy.sparse import csr_matrix, save_npz ,load_npz

In [None]:

input_file_path = '../../Dataset/ratings.csv'
output_file_path = '../../Dataset/ratings_modified.csv'

print("Loading 20M ratings... (This may take a minute)")
df = pd.read_csv(
    input_file_path, 
    dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'}
)
print("Converting 0.5 ratings to 1.0...")
count_changes = len(df[df['rating'] == 0.5])
print(f"Found {count_changes} ratings that are 0.5.")
df.loc[df['rating'] == 0.5, 'rating'] = 1.0
print(f"Saving to {output_file_path}...")
df.to_csv(output_file_path, index=False)

print("Done! Transformation complete.")
remaining_0_5 = len(df[df['rating'] == 0.5])
print(f"Number of 0.5 ratings remaining: {remaining_0_5}")

Loading 20M ratings... (This may take a minute)
Converting 0.5 ratings to 1.0...
Found 0 ratings that are 0.5.
Saving to ../../../Dataset/ratings_modified.csv...
Done! Transformation complete.
Number of 0.5 ratings remaining: 0


In [None]:

# renamed the modified ratings file to ratings.csv
def create_raw_id_matrix(csv_file):
    print("Loading data...")
    df = pd.read_csv(csv_file, usecols=['userId', 'movieId', 'rating'])
    
    n_users = df['userId'].max() + 1
    n_movies = df['movieId'].max() + 1

    print(f"Matrix Dimensions: {n_users} users x {n_movies} movies")

    print("Creating sparse matrix...")
    matrix = csr_matrix(
        (df['rating'], (df['userId'], df['movieId'])), 
        shape=(n_users, n_movies)
    )

    return matrix

matrix = create_raw_id_matrix('../../Dataset/ratings.csv')
output_folder = os.path.join('Output')
os.makedirs(output_folder, exist_ok=True)
print(f"Directory created/verified: {output_folder}")
output_file = os.path.join(output_folder, 'matrix.npz')
save_npz(output_file, matrix)

print(f"Saved successfully to: {output_file}")

Loading data...
Matrix Dimensions: 138494 users x 131263 movies
Creating sparse matrix...
Directory created/verified: Output
Saved successfully to: Output\matrix.npz


In [6]:
actual_movie_count = (matrix.getnnz(axis=0) > 0).sum()

print(f"Matrix Width (Max ID): {matrix.shape[1]}")
print(f"Actual Movies with Ratings: {actual_movie_count}")

Matrix Width (Max ID): 131263
Actual Movies with Ratings: 26744


In [7]:
#read user-item rating matrix
from scipy.sparse import load_npz
loaded_matrix = load_npz(r'Output/matrix.npz')
matrix = loaded_matrix

In [8]:
first_5_rows = matrix[:5]
dense_view = first_5_rows.toarray()
df_view = pd.DataFrame(dense_view)
print(df_view)

   0       1       2       3       4       5       6       7       8       \
0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
1     0.0     0.0     3.5     0.0     0.0     0.0     0.0     0.0     0.0   
2     0.0     0.0     0.0     4.0     0.0     0.0     0.0     0.0     0.0   
3     0.0     4.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
4     0.0     0.0     0.0     0.0     0.0     0.0     3.0     0.0     0.0   

   9       ...  131253  131254  131255  131256  131257  131258  131259  \
0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
1     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
2     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
3     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
4     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0     0.0   

   131260  131261  131262  
0     0.0     0.0     0.0  
1     0.0     0.0     0.0  
2     0.

#### Apply colaborative filtering 

In [9]:
output_csv_path = os.path.join(output_folder, 'user_means.csv')
print("Calculating means...")
user_sums = np.array(matrix.sum(axis=1)).flatten()
user_counts = matrix.getnnz(axis=1)
with np.errstate(divide='ignore', invalid='ignore'):
    user_means = user_sums / user_counts
    user_means[~np.isfinite(user_means)] = 0.0
df_means = pd.DataFrame({
    'user_idx': np.arange(len(user_means)),
    'mean_rating': user_means
})

print(f"Saving to {output_csv_path}...")
df_means.to_csv(output_csv_path, index=False)

print("Done! First 5 user means:")
print(df_means.head())

Calculating means...
Saving to Output\user_means.csv...
Done! First 5 user means:
   user_idx  mean_rating
0         0     0.000000
1         1     3.742857
2         2     4.000000
3         3     4.122995
4         4     3.571429


In [10]:
base_path = output_folder
raw_matrix_path = os.path.join(base_path, 'matrix.npz')
means_csv_path = os.path.join(base_path, 'user_means.csv')
output_path = os.path.join(base_path, 'normalized_matrix.npz')

matrix = load_npz(raw_matrix_path)
df_means = pd.read_csv(means_csv_path)
df_means = df_means.sort_values('user_idx')
user_means_array = df_means['mean_rating'].values
print("Expanding means to match matrix structure...")
user_counts = matrix.getnnz(axis=1)
expanded_means = np.repeat(user_means_array, user_counts)
print("Applying mean-centering...")
matrix.data = matrix.data - expanded_means
save_npz(output_path, matrix)
print(f"Successfully saved normalized matrix to: {output_path}")

Expanding means to match matrix structure...
Applying mean-centering...
Successfully saved normalized matrix to: Output\normalized_matrix.npz


In [11]:
print(matrix[:2])

  (1, 2)	-0.24285714285714288
  (1, 29)	-0.24285714285714288
  (1, 32)	-0.24285714285714288
  (1, 47)	-0.24285714285714288
  (1, 50)	-0.24285714285714288
  (1, 112)	-0.24285714285714288
  (1, 151)	0.2571428571428571
  (1, 223)	0.2571428571428571
  (1, 253)	0.2571428571428571
  (1, 260)	0.2571428571428571
  (1, 293)	0.2571428571428571
  (1, 296)	0.2571428571428571
  (1, 318)	0.2571428571428571
  (1, 337)	-0.24285714285714288
  (1, 367)	-0.24285714285714288
  (1, 541)	0.2571428571428571
  (1, 589)	-0.24285714285714288
  (1, 593)	-0.24285714285714288
  (1, 653)	-0.7428571428571429
  (1, 919)	-0.24285714285714288
  (1, 924)	-0.24285714285714288
  (1, 1009)	-0.24285714285714288
  (1, 1036)	0.2571428571428571
  (1, 1079)	0.2571428571428571
  (1, 1080)	-0.24285714285714288
  :	:
  (1, 6755)	-0.24285714285714288
  (1, 6774)	0.2571428571428571
  (1, 6807)	-0.24285714285714288
  (1, 6834)	-0.24285714285714288
  (1, 6888)	-0.7428571428571429
  (1, 7001)	-0.24285714285714288
  (1, 7045)	-0.2428571

In [12]:
first_5_rows = matrix[:5]
dense_view = first_5_rows.toarray()
df_view = pd.DataFrame(dense_view)
print(df_view)

   0         1         2       3       4       5         6       7       \
0     0.0  0.000000  0.000000     0.0     0.0     0.0  0.000000     0.0   
1     0.0  0.000000 -0.242857     0.0     0.0     0.0  0.000000     0.0   
2     0.0  0.000000  0.000000     0.0     0.0     0.0  0.000000     0.0   
3     0.0 -0.122995  0.000000     0.0     0.0     0.0  0.000000     0.0   
4     0.0  0.000000  0.000000     0.0     0.0     0.0 -0.571429     0.0   

   8       9       ...  131253  131254  131255  131256  131257  131258  \
0     0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
1     0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
2     0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
3     0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
4     0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   

   131259  131260  131261  131262  
0     0.0     0.0     0.0     0.0  
1     0.0     0.0     0.0     0.

In [13]:
output_csv_path = os.path.join(base_path, 'item_averages_with_counts.csv')
print("Loading raw matrix...")
matrix = load_npz(raw_matrix_path)
print("Calculating stats...")
item_sums = np.array(matrix.sum(axis=0)).flatten()
item_counts = matrix.getnnz(axis=0)

with np.errstate(divide='ignore', invalid='ignore'):
    item_means = item_sums / item_counts

df_items = pd.DataFrame({
    'item_idx': np.arange(len(item_means)),
    'avg_rating': item_means,
    'rating_count': item_counts
})

print(f"Saving to {output_csv_path}...")
df_items.to_csv(output_csv_path, index=False)

df_rated = df_items.dropna()
lowest_notorious_2 = df_rated.sort_values(
    by=['avg_rating', 'rating_count'], 
    ascending=[True, False]
).head(2)

print("\nThe 2 items with lowest rating (prioritizing most rated):")
print(lowest_notorious_2)

Loading raw matrix...
Calculating stats...
Saving to Output\item_averages_with_counts.csv...

The 2 items with lowest rating (prioritizing most rated):
        item_idx  avg_rating  rating_count
107704    107704         1.0            11
56835      56835         1.0             6


In [14]:
import scipy.sparse as sp
output_path = os.path.join(base_path, 'manual_similarity.csv')
normalized_matrix_path = os.path.join(base_path, 'normalized_matrix.npz')

print("Loading matrices...")
raw_matrix = sp.load_npz(raw_matrix_path).tocsc()
normalized_matrix = sp.load_npz(normalized_matrix_path).tocsc()
target_item_ids = [116181, 106503, 8860]
n_users, n_items = raw_matrix.shape

def custom_cosine_similarity(vec_a, vec_b):
    dot_product = np.sum(vec_a * vec_b)
    magnitude_a = np.sqrt(np.sum(vec_a ** 2))
    magnitude_b = np.sqrt(np.sum(vec_b ** 2))
    if magnitude_a == 0 or magnitude_b == 0:
        return 0.0
    return dot_product / (magnitude_a * magnitude_b)

csv_data = []

print(f"Starting calculation for {len(target_item_ids)} targets against {n_items} items...")

for target_id in target_item_ids:
    print(f"\n--- Processing Target Item: {target_id} ---")
    target_user_indices = raw_matrix[:, target_id].nonzero()[0]
    
    if len(target_user_indices) == 0:
        print(f"Target Item {target_id} has no ratings.")
        continue

    target_norm_col = normalized_matrix[:, target_id].toarray().flatten()
    similarities = []

    for other_id in range(n_items):
        if other_id == target_id:
            continue
            
        other_user_indices = raw_matrix[:, other_id].nonzero()[0]
        common_users = np.intersect1d(target_user_indices, other_user_indices, assume_unique=True)
        n_common = len(common_users) 
        if n_common == 0:
            continue
            
        vec_target = target_norm_col[common_users]
        vec_other = normalized_matrix[common_users, other_id].toarray().flatten()
        
        sim = custom_cosine_similarity(vec_target, vec_other)
        
        if sim > 0:
            similarities.append((other_id, sim))
            
            csv_data.append([target_id, other_id, sim, n_common])

    similarities.sort(key=lambda x: x[1], reverse=True)
    print(f"Top 10 similar items to {target_id}:")
    for item, score in similarities[:10]:
        print(f"Item {item}: {score:.5f}")

print(f"\nSaving results to {output_path}...")
df = pd.DataFrame(csv_data, columns=['Target Item', 'Similar Item', 'Similarity Score', 'N Common Users'])
df.to_csv(output_path, index=False)
print("Done.")

Loading matrices...
Starting calculation for 3 targets against 131263 items...

--- Processing Target Item: 116181 ---
Top 10 similar items to 116181:
Item 2: 1.00000
Item 110: 1.00000
Item 150: 1.00000
Item 165: 1.00000
Item 180: 1.00000
Item 225: 1.00000
Item 266: 1.00000
Item 349: 1.00000
Item 350: 1.00000
Item 367: 1.00000

--- Processing Target Item: 106503 ---
Top 10 similar items to 106503:
Item 93510: 1.00000
Item 3: 1.00000
Item 13: 1.00000
Item 15: 1.00000
Item 23: 1.00000
Item 65: 1.00000
Item 66: 1.00000
Item 77: 1.00000
Item 85: 1.00000
Item 92: 1.00000

--- Processing Target Item: 8860 ---
Top 10 similar items to 8860:
Item 5138: 1.00000
Item 25885: 1.00000
Item 26475: 1.00000
Item 26638: 1.00000
Item 26804: 1.00000
Item 30905: 1.00000
Item 33445: 1.00000
Item 34214: 1.00000
Item 36405: 1.00000
Item 41523: 1.00000

Saving results to Output\manual_similarity.csv...
Done.


#### Identify to 20% similar items

In [15]:
top_20_output_path = os.path.join(base_path, 'top_20_percent_similarity.csv')
df = pd.DataFrame(csv_data, columns=['Target Item', 'Similar Item', 'Similarity Score', 'N Common Users'])
print("Filtering top 20% similar items per target...")
filtered_chunks = []
unique_targets = df['Target Item'].unique()
for target in unique_targets:
    target_group = df[df['Target Item'] == target]
    target_group = target_group.sort_values(by='Similarity Score', ascending=False)
    total_count = len(target_group)
    limit = (total_count + 4) // 5 # Ceiling division for top 20%
    top_20_chunk = target_group.head(limit)
    filtered_chunks.append(top_20_chunk)
    print(f"Target {target}: Kept {limit} out of {total_count} items (Top 20%).")

final_df = pd.concat(filtered_chunks)
final_df.to_csv(top_20_output_path, index=False)

print(f"\nSuccessfully saved top 20% results to: {top_20_output_path}")
print("Preview:")
print(final_df.head())

Filtering top 20% similar items per target...
Target 116181: Kept 175 out of 872 items (Top 20%).
Target 106503: Kept 414 out of 2067 items (Top 20%).
Target 8860: Kept 2053 out of 10264 items (Top 20%).

Successfully saved top 20% results to: Output\top_20_percent_similarity.csv
Preview:
     Target Item  Similar Item  Similarity Score  N Common Users
0         116181             2               1.0               1
585       116181         87232               1.0               1
574       116181         86606               1.0               1
575       116181         86628               1.0               1
576       116181         86644               1.0               1


##### Predict missing ratings using these items.

In [16]:
def predict_and_fill_target(target_item_id, similarity_df, user_item_matrix):
    if not sp.isspmatrix_csr(user_item_matrix):
        user_item_matrix = user_item_matrix.tocsr()
    target_sim_data = similarity_df[similarity_df['Target Item'] == target_item_id]
    if target_sim_data.empty:
        return None
    sim_scores = dict(zip(target_sim_data['Similar Item'], target_sim_data['Similarity Score']))
    similar_items_list = list(sim_scores.keys())
    max_col_index = user_item_matrix.shape[1]
    if target_item_id >= max_col_index:
        print(f"Target {target_item_id} is out of bounds (Max: {max_col_index})")
        return None
    valid_similar_items = [item for item in similar_items_list if item < max_col_index]
    cols_to_keep = [target_item_id] + valid_similar_items
    sub_sparse = user_item_matrix[:, cols_to_keep]
    sub_df = pd.DataFrame(sub_sparse.toarray(), columns=cols_to_keep)
    print(f"Created sub-matrix for Target {target_item_id}. Shape: {sub_df.shape}")
    similar_ratings_df = sub_df[valid_similar_items]
    weights = np.array([sim_scores[item] for item in valid_similar_items])
    numerator = similar_ratings_df.dot(weights)
    rated_mask = (similar_ratings_df > 0).astype(float)
    denominator = rated_mask.dot(np.abs(weights))
    with np.errstate(divide='ignore', invalid='ignore'):
        predictions = numerator / denominator
        predictions = predictions.fillna(0.0)
    original_target_ratings = sub_df[target_item_id]
    filled_ratings = np.where(original_target_ratings == 0.0, predictions, original_target_ratings)
    return filled_ratings

top_20_output_path = 'Output/top_20_percent_similarity.csv'
df_similarity = pd.read_csv(top_20_output_path)
unique_targets = df_similarity['Target Item'].unique()
print(f"Found {len(unique_targets)} targets to process: {unique_targets}")
collected_target_columns = {}
for target_id in unique_targets:
    target_id = int(target_id)
    print(f"Processing Target ID: {target_id}...")
    target_column_series = predict_and_fill_target(target_id, df_similarity, raw_matrix)
    if target_column_series is not None:
        print(f" -> Success. Extracted column for {target_id}")
        collected_target_columns[target_id] = target_column_series
final_combined_df = pd.DataFrame(collected_target_columns)
final_output_csv = 'Output/final_predicted_targets.csv'
final_combined_df.to_csv(final_output_csv, index=False)
print(f"\nSaved combined targets to: {final_output_csv}")
print("Preview of final combined data:")
print(final_combined_df.head())

Found 3 targets to process: [116181 106503   8860]
Processing Target ID: 116181...
Created sub-matrix for Target 116181. Shape: (138494, 176)
 -> Success. Extracted column for 116181
Processing Target ID: 106503...
Created sub-matrix for Target 106503. Shape: (138494, 415)
 -> Success. Extracted column for 106503
Processing Target ID: 8860...
Created sub-matrix for Target 8860. Shape: (138494, 2054)
 -> Success. Extracted column for 8860

Saved combined targets to: Output/final_predicted_targets.csv
Preview of final combined data:
   116181    106503  8860  
0     0.0  0.000000     0.0
1     3.5  3.750000     0.0
2     4.0  4.333333     0.0
3     0.0  3.142857     0.0
4     0.0  0.000000     0.0


#### Compute DF and DS.

In [17]:
input_path = os.path.join(base_path, 'manual_similarity.csv')
output_path = os.path.join(base_path, 'discounted_similarity_final.csv')
target_items = [116181, 106503, 8860]

df = pd.read_csv(input_path)
df = df[df['Target Item'].isin(target_items)]

def calculate_discount_factor(n_common):
    return np.log10(1 + n_common)

def calculate_discount_similarity(similarity, discount_factor):
    return similarity * discount_factor

df['Discount Factor (DF)'] = df['N Common Users'].apply(calculate_discount_factor)
df['Discounted Similarity (DS)'] = df.apply(lambda row: calculate_discount_similarity(row['Similarity Score'], row['Discount Factor (DF)']), axis=1)

df = df.sort_values(by=['Target Item', 'Discounted Similarity (DS)'], ascending=[True, False])

df.to_csv(output_path, index=False)
print(df[['Target Item', 'Similar Item', 'N Common Users', 'Discount Factor (DF)', 'Discounted Similarity (DS)']].head())

       Target Item  Similar Item  N Common Users  Discount Factor (DF)  \
3597          8860          1499             428              2.632457   
10013         8860         79251              20              1.322219   
7538          8860         33164             183              2.264818   
4481          8860          3273             394              2.596597   
3037          8860           185             624              2.795880   

       Discounted Similarity (DS)  
3597                     1.152862  
10013                    1.138872  
7538                     1.108577  
4481                     1.079670  
3037                     1.054121  


#### Select top 20% items using DS.

In [18]:
output_path = os.path.join(base_path, 'top_20_percent_by_DS.csv')
unique_targets = [116181, 106503, 8860]
chunks = []

for target in unique_targets:
    group = df[df['Target Item'] == target]
    group = group.sort_values(by='Discounted Similarity (DS)', ascending=False)
    count = len(group)
    limit = (count + 4) // 5
    chunks.append(group.head(limit))

final_df = pd.concat(chunks)
final_df.to_csv(output_path, index=False)
print(final_df.head())

     Target Item  Similar Item  Similarity Score  N Common Users  \
0         116181             2               1.0               1   
585       116181         87232               1.0               1   
574       116181         86606               1.0               1   
575       116181         86628               1.0               1   
576       116181         86644               1.0               1   

     Discount Factor (DF)  Discounted Similarity (DS)  
0                 0.30103                     0.30103  
585               0.30103                     0.30103  
574               0.30103                     0.30103  
575               0.30103                     0.30103  
576               0.30103                     0.30103  


In [19]:
final_df['Similarity Score'] = final_df['Discounted Similarity (DS)']

#### Use these for updated rating predictions.

In [20]:
def predict_and_fill_target(target_item_id, similarity_df, user_item_matrix):
    if not sp.isspmatrix_csr(user_item_matrix):
        user_item_matrix = user_item_matrix.tocsr()
    target_sim_data = similarity_df[similarity_df['Target Item'] == target_item_id]
    if target_sim_data.empty:
        return None
    sim_scores = dict(zip(target_sim_data['Similar Item'], target_sim_data['Similarity Score']))
    similar_items_list = list(sim_scores.keys())
    max_col_index = user_item_matrix.shape[1]
    if target_item_id >= max_col_index:
        print(f"Target {target_item_id} is out of bounds (Max: {max_col_index})")
        return None
    valid_similar_items = [item for item in similar_items_list if item < max_col_index]
    cols_to_keep = [target_item_id] + valid_similar_items
    sub_sparse = user_item_matrix[:, cols_to_keep]
    sub_df = pd.DataFrame(sub_sparse.toarray(), columns=cols_to_keep)
    print(f"Created sub-matrix for Target {target_item_id}. Shape: {sub_df.shape}")
    similar_ratings_df = sub_df[valid_similar_items]
    weights = np.array([sim_scores[item] for item in valid_similar_items])
    numerator = similar_ratings_df.dot(weights)
    rated_mask = (similar_ratings_df > 0).astype(float)
    denominator = rated_mask.dot(np.abs(weights))
    with np.errstate(divide='ignore', invalid='ignore'):
        predictions = numerator / denominator
        predictions = predictions.fillna(0.0)
    original_target_ratings = sub_df[target_item_id]
    filled_ratings = np.where(original_target_ratings == 0.0, predictions, original_target_ratings)
    return filled_ratings


In [21]:

top_20_output_path = 'Output/top_20_percent_by_DS.csv'
df_similarity = final_df
unique_targets = df_similarity['Target Item'].unique()
print(f"Found {len(unique_targets)} targets to process: {unique_targets}")
collected_target_columns = {}
for target_id in unique_targets:
    target_id = int(target_id)
    print(f"Processing Target ID: {target_id}...")
    target_column_series = predict_and_fill_target(target_id, df_similarity, raw_matrix)
    if target_column_series is not None:
        print(f" -> Success. Extracted column for {target_id}")
        collected_target_columns[target_id] = target_column_series
final_combined_df = pd.DataFrame(collected_target_columns)
final_output_csv = 'Output/DS_final_predicted_targets.csv'
final_combined_df.to_csv(final_output_csv, index=False)
print(f"\nSaved combined targets to: {final_output_csv}")
print("Preview of final combined data:")
print(final_combined_df.head())

Found 3 targets to process: [116181 106503   8860]
Processing Target ID: 116181...
Created sub-matrix for Target 116181. Shape: (138494, 176)
 -> Success. Extracted column for 116181
Processing Target ID: 106503...
Created sub-matrix for Target 106503. Shape: (138494, 415)
 -> Success. Extracted column for 106503
Processing Target ID: 8860...
Created sub-matrix for Target 8860. Shape: (138494, 2054)
 -> Success. Extracted column for 8860

Saved combined targets to: Output/DS_final_predicted_targets.csv
Preview of final combined data:
   116181    106503    8860  
0     0.0  0.000000  0.000000
1     3.5  3.864216  3.522783
2     4.0  4.371141  3.868064
3     0.0  4.686024  3.565098
4     0.0  3.745142  3.300475


#### comparison

In [22]:
print("--- Step 7: Comparing Similarity Lists (Cosine vs. Discounted Cosine) ---")
df_sim_step2 = pd.read_csv('Output/top_20_percent_similarity.csv')
df_sim_step5 = pd.read_csv('Output/top_20_percent_by_DS.csv')

target_ids = df_sim_step2['Target Item'].unique()

for target in target_ids:
    set1 = set(df_sim_step2[df_sim_step2['Target Item'] == target]['Similar Item'])
    set2 = set(df_sim_step5[df_sim_step5['Target Item'] == target]['Similar Item'])
    
    intersection = set1.intersection(set2)
    overlap_pct = len(intersection) / len(set1) * 100
    
    print(f"\nTarget {target}:")
    print(f"  - Original Top 20% Count: {len(set1)}")
    print(f"  - DS Selected Top 20% Count: {len(set2)}")
    print(f"  - Overlap: {len(intersection)} items ({overlap_pct:.2f}%)")
    print("  - New items introduced by DS tend to have higher 'N Common Users'.")

print("\n--- Step 8: Comparing Predicted Ratings ---")
df_pred_step3 = pd.read_csv('Output/final_predicted_targets.csv')
df_pred_step6 = pd.read_csv('Output/DS_final_predicted_targets.csv')

common_cols = [c for c in df_pred_step3.columns if c in df_pred_step6.columns]

diffs = {}
for col in common_cols:
    # Only compare where predictions were actually made (value > 0)
    mask = (df_pred_step3[col] > 0) & (df_pred_step6[col] > 0)
    
    if mask.sum() == 0:
        print(f"Target {col}: No overlapping predictions to compare.")
        continue
        
    pred3 = df_pred_step3.loc[mask, col]
    pred6 = df_pred_step6.loc[mask, col]
    
    mae = np.mean(np.abs(pred3 - pred6))
    avg_change = np.mean(pred6 - pred3)
    
    print(f"Target {col}:")
    print(f"  - Mean Absolute Difference: {mae:.4f}")
    print(f"  - Average Shift (DS - Original): {avg_change:.4f}")

--- Step 7: Comparing Similarity Lists (Cosine vs. Discounted Cosine) ---

Target 116181:
  - Original Top 20% Count: 175
  - DS Selected Top 20% Count: 175
  - Overlap: 175 items (100.00%)
  - New items introduced by DS tend to have higher 'N Common Users'.

Target 106503:
  - Original Top 20% Count: 414
  - DS Selected Top 20% Count: 414
  - Overlap: 5 items (1.21%)
  - New items introduced by DS tend to have higher 'N Common Users'.

Target 8860:
  - Original Top 20% Count: 2053
  - DS Selected Top 20% Count: 2053
  - Overlap: 122 items (5.94%)
  - New items introduced by DS tend to have higher 'N Common Users'.

--- Step 8: Comparing Predicted Ratings ---
Target 116181:
  - Mean Absolute Difference: 0.0000
  - Average Shift (DS - Original): 0.0000
Target 106503:
  - Mean Absolute Difference: 0.6341
  - Average Shift (DS - Original): 0.3918
Target 8860:
  - Mean Absolute Difference: 0.6796
  - Average Shift (DS - Original): -0.2953


### case study 2

#### Compute PCC Similarity

In [23]:
target_items = [116181, 106503, 8860] 
if 'raw_matrix' not in locals():
    raw_matrix = sp.load_npz('Output/matrix.npz').tocsc() 
def calculate_pcc_subset(target_col, other_col):
    target_users = target_col.indices
    other_users = other_col.indices
    common_users = np.intersect1d(target_users, other_users, assume_unique=True)
    if len(common_users) < 2:
        return 0.0, 0
    target_ratings = target_col[common_users].toarray().flatten()
    other_ratings = other_col[common_users].toarray().flatten()
    mean_t = np.mean(target_ratings)
    mean_o = np.mean(other_ratings)
    diff_t = target_ratings - mean_t
    diff_o = other_ratings - mean_o
    numerator = np.sum(diff_t * diff_o)
    denominator = np.sqrt(np.sum(diff_t**2)) * np.sqrt(np.sum(diff_o**2))
    if denominator == 0:
        return 0.0, len(common_users)
    return numerator / denominator, len(common_users)
pcc_results = []
n_items = raw_matrix.shape[1]
for target_id in target_items:
    target_col = raw_matrix[:, target_id]
    target_col.indices 
    for other_id in range(n_items):
        if other_id == target_id: continue
        other_col = raw_matrix[:, other_id]
        if other_col.nnz == 0: continue
        sim, n_common = calculate_pcc_subset(target_col, other_col)
        if sim > 0: 
            pcc_results.append([target_id, other_id, sim, n_common])
df_pcc = pd.DataFrame(pcc_results, columns=['Target Item', 'Similar Item', 'Similarity Score', 'N Common Users'])
df_pcc.to_csv('Output/pcc_similarity_raw.csv', index=False)

In [24]:
B1 = 11554
B2 = 16226
B3 = 22691
def calculate_beta_df(n, b1, b2, b3):
    if n >= b3: return 1.0
    elif n >= b2: return n / b3
    elif n >= b1: return n / b2
    else: return n / b1
df_pcc_calc = pd.read_csv('Output/pcc_similarity_raw.csv')
df_pcc_calc['Discount Factor'] = df_pcc_calc['N Common Users'].apply(lambda x: calculate_beta_df(x, B1, B2, B3))
df_pcc_calc['Discounted Similarity'] = df_pcc_calc['Similarity Score'] * df_pcc_calc['Discount Factor']
chunks_ds = []
for target in target_items:
    group = df_pcc_calc[df_pcc_calc['Target Item'] == target]
    group = group.sort_values(by='Discounted Similarity', ascending=False)
    limit = int(np.ceil(len(group) * 0.20))
    top_20_ds = group.head(limit)
    chunks_ds.append(top_20_ds)
df_pcc_ds_top20 = pd.concat(chunks_ds)
df_pcc_ds_top20 = df_pcc_ds_top20.rename(columns={'Similarity Score': 'Raw Similarity', 'Discounted Similarity': 'Similarity Score'})
df_pcc_ds_top20.to_csv('Output/pcc_ds_top_20.csv', index=False)
collected_pcc_ds_preds = {}
for target_id in target_items:
    target_id = int(target_id)
    preds = predict_and_fill_target(target_id, df_pcc_ds_top20, raw_matrix)
    if preds is not None:
        collected_pcc_ds_preds[target_id] = preds
df_pcc_ds_preds = pd.DataFrame(collected_pcc_ds_preds)
df_pcc_ds_preds.to_csv('Output/pcc_ds_predictions.csv', index=False)

Created sub-matrix for Target 106503. Shape: (138494, 188)
Created sub-matrix for Target 8860. Shape: (138494, 2421)


#### comparison 

In [25]:
df_case1_final = pd.read_csv('Output/DS_final_predicted_targets.csv') 
df_case2_raw = pd.read_csv('Output/pcc_predictions.csv')              
df_case2_final = pd.read_csv('Output/pcc_ds_predictions.csv')         
raw_list = pd.read_csv('Output/pcc_top_20.csv')
ds_list = pd.read_csv('Output/pcc_ds_top_20.csv')

print("=== Case Study 2 Analysis ===")
# 1. Compare Lists (Step 7)
print("\n[Step 7] Overlap between Raw PCC and DS-PCC Lists:")
for target in target_items:
    s1 = set(raw_list[raw_list['Target Item'] == target]['Similar Item'])
    s2 = set(ds_list[ds_list['Target Item'] == target]['Similar Item'])
    
    if len(s1) > 0:
        overlap = len(s1.intersection(s2))
        print(f"Target {target}: {overlap}/{len(s1)} items overlap ({overlap/len(s1)*100:.2f}%)")
    else:
        print(f"Target {target}: No similar items found in raw list (Size 0).")

# 2. Compare Predictions (Step 8)
print("\n[Step 8] Prediction Shift (Raw PCC vs DS PCC):")
for target in target_items:
    target = str(target) 
    if target in df_case2_raw.columns and target in df_case2_final.columns:
        mask = (df_case2_raw[target] != 0) & (df_case2_final[target] != 0)
        if mask.sum() > 0:
            mae = (df_case2_raw.loc[mask, target] - df_case2_final.loc[mask, target]).abs().mean()
            print(f"Target {target} MAE: {mae:.4f}")
        else:
            print(f"Target {target}: No overlapping predictions to compare.")

print("\n=== FINAL TASK: Cross-Case Comparison ===")
print("Comparing Case 1 (Cosine DS) vs Case 2 (PCC DS)")

for target in target_items:
    target = str(target)
    if target in df_case1_final.columns and target in df_case2_final.columns:
        pred_c1 = df_case1_final[target]
        pred_c2 = df_case2_final[target]
        
        mask = (pred_c1 != 0) & (pred_c2 != 0)
        
        if mask.sum() > 0:
            diff = (pred_c1.loc[mask] - pred_c2.loc[mask]).abs().mean()
            avg_c1 = pred_c1.loc[mask].mean()
            avg_c2 = pred_c2.loc[mask].mean()
            
            print(f"Target {target}:")
            print(f"  - Mean Absolute Difference: {diff:.4f}")
            print(f"  - Avg Rating (Case 1): {avg_c1:.2f}")
            print(f"  - Avg Rating (Case 2): {avg_c2:.2f}")
        else:
             print(f"Target {target}: No overlapping predictions between Case 1 and Case 2.")

=== Case Study 2 Analysis ===

[Step 7] Overlap between Raw PCC and DS-PCC Lists:
Target 116181: No similar items found in raw list (Size 0).
Target 106503: 1/187 items overlap (0.53%)
Target 8860: 6/2420 items overlap (0.25%)

[Step 8] Prediction Shift (Raw PCC vs DS PCC):
Target 106503 MAE: 0.6526
Target 8860 MAE: 0.7469

=== FINAL TASK: Cross-Case Comparison ===
Comparing Case 1 (Cosine DS) vs Case 2 (PCC DS)
Target 106503:
  - Mean Absolute Difference: 0.1839
  - Avg Rating (Case 1): 3.91
  - Avg Rating (Case 2): 3.93
Target 8860:
  - Mean Absolute Difference: 0.6669
  - Avg Rating (Case 1): 2.93
  - Avg Rating (Case 2): 3.56


#### Discounted Similarity (Beta Thresholds)

#### markdown

In [26]:
#1- Use PCC to compute similarity between target items. 
# 2- Identify the top 20% most similar items. 
# 3- Predict the missing ratings. 
# 4- Compute DF and DS using threshold Î². 
# 5- Select the top 20% items based on discounted similarity. 
# 6- Predict ratings again with this new selection. 
# 7- Compare item lists from steps 2 and 5. Provide analysis.
# 8- Compare predictions from steps 3 and 6. Share insights. 
# 9- Give your comments in a separate section in your report.

In [27]:
#Compare and reflect on the outcomes across Case Studies 1, 2, and 3, considering the impact of similarity metrics and bias adjustment.

In [28]:
input_csv_path = os.path.join(base_path, 'manual_similarity.csv')
output_discounted_path = os.path.join(base_path, 'discounted_similarity_results.csv')
df_calc = pd.read_csv(input_csv_path)

# Define Thresholds
B1 = 11554
B2 = 16226
B3 = 22691
def calculate_df_factor(n_common, b1, b2, b3):
    if n_common >= b3:
        return 1.0
    elif n_common >= b2:
        return n_common / b3
    elif n_common >= b1:
        return n_common / b2
    else:
        return n_common / b1

def calculate_ds_similarity(raw_similarity, df_factor):
    return raw_similarity * df_factor

print("Computing Discount Factors (DF) and Discounted Similarities (DS)...")

df_calc['df_discount_factor'] = df_calc['N Common Users'].apply(
    lambda x: calculate_df_factor(x, B1, B2, B3)
)

df_calc['ds_discounted_similarity'] = df_calc.apply(
    lambda row: calculate_ds_similarity(row['Similarity Score'], row['df_discount_factor']), 
    axis=1
)

df_calc = df_calc.sort_values(by=['Target Item', 'ds_discounted_similarity'], ascending=[True, False])

df_calc.to_csv(output_discounted_path, index=False)

print(f"\nCalculation Complete. Saved to: {output_discounted_path}")
print("Preview of results:")
print(df_calc[['Target Item', 'Similar Item', 'N Common Users', 'Similarity Score', 'df_discount_factor', 'ds_discounted_similarity']].head(10))

Computing Discount Factors (DF) and Discounted Similarities (DS)...

Calculation Complete. Saved to: Output\discounted_similarity_results.csv
Preview of results:
      Target Item  Similar Item  N Common Users  Similarity Score  \
4820         8860          3977             808          0.329394   
3795         8860          1917             880          0.280880   
3131         8860           367             864          0.279246   
6626         8860          8361             749          0.319723   
3037         8860           185             624          0.377027   
3019         8860           153             734          0.312016   
4227         8860          2701             598          0.362676   
5919         8860          6157             633          0.330628   
4846         8860          4025             647          0.321765   
3349         8860           786             546          0.380336   

      df_discount_factor  ds_discounted_similarity  
4820            0.069932 