section2_neighborhood_cf

Arwa Ahmed Mostafa Shazly (221100209)

Raw Cosine Similarity

In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.preprocessing import LabelEncoder
import time

Loading the needed data from Setion1

In [2]:
ratings = pd.read_csv(r'D:\is\ratings.csv')
ratings_stats = pd.read_csv(r'D:\is\ratings_statistics.csv')
target_users = pd.read_csv(r'D:\is\target_users.csv')
ratings['rating'] = ratings['rating'].round().clip(1, 5)

Creating (userId-movieId) ratings matrix

In [3]:
# Encode userId and movieId (to solve the Non-Sequential IDs problem)
user_enc = LabelEncoder()
item_enc = LabelEncoder()
rows = user_enc.fit_transform(ratings['userId'])
cols = item_enc.fit_transform(ratings['movieId'])
vals = ratings['rating'].values
# Build sparse matrix (removing zeros and store only the actual ratings to save memory)
mtx = csr_matrix((vals, (rows, cols)), shape=(len(user_enc.classes_), len(item_enc.classes_)))
# shows the percentage of the unrated items
sparsity = (1 - mtx.nnz / (mtx.shape[0] * mtx.shape[1])) * 100
print(f"Matrix shape: {mtx.shape}, Sparsity: {sparsity:.2f}%")

Matrix shape: (138493, 26744), Sparsity: 99.46%


1.Calculating Raw cosine similairty 

In [4]:
# 1. Load target users
target_users_df = pd.read_csv(r'D:\is\target_users.csv')
target_user_ids = (
    target_users_df['userId'].tolist() if 'userId' in target_users_df.columns
    else target_users_df['user_id'].tolist() if 'user_id' in target_users_df.columns
    else target_users_df.iloc[:, 0].tolist()
)
target_user_ids = [uid for uid in target_user_ids if uid is not None and not pd.isna(uid)]

# 2. Preprocess ratings into a dictionary for fast lookup
user_ratings_dict = {
    uid: dict(zip(g['movieId'], g['rating']))
    for uid, g in ratings.groupby('userId')
}
all_users = list(user_ratings_dict.keys())

# 3. Compute raw cosine similarity
output_rows = []
start_time = time.time()

for target_user in target_user_ids:
    if target_user not in user_ratings_dict:
        continue
    target_ratings = user_ratings_dict[target_user]

    for other_user in all_users:
        if other_user == target_user:
            continue
        other_ratings = user_ratings_dict[other_user]

        # computing the Co-rated movies
        co_rated = set(target_ratings.keys()) & set(other_ratings.keys())
        if not co_rated:
            similarity = 0.0
        else:
            dot = sum(target_ratings[m] * other_ratings[m] for m in co_rated)
            mag_target = sum(target_ratings[m]**2 for m in co_rated) ** 0.5
            mag_other = sum(other_ratings[m]**2 for m in co_rated) ** 0.5
            similarity = round(dot / (mag_target * mag_other), 4) if mag_target * mag_other != 0 else 0.0

        output_rows.append([target_user, other_user, similarity])
total_elapsed = time.time() - start_time

# 4. Saving the results
user_similarity = pd.DataFrame(output_rows, columns=['target_user', 'other_user', 'similarity'])
output_path = "user_raw_cosine_similarity.csv"
user_similarity.to_csv(output_path, index=False)

print(f"Results saved to: {output_path}")
print("\nFirst 10 of similarity scores for each target user:")
for target_user in target_user_ids:
    target_sims = user_similarity[user_similarity['target_user'] == target_user]
    print(f"\nTarget User {target_user}:")
    print(target_sims.head(10).to_string(index=False))

Results saved to: user_raw_cosine_similarity.csv

First 10 of similarity scores for each target user:

Target User 69251:
 target_user  other_user  similarity
       69251           1      0.8934
       69251           2      0.9660
       69251           3      0.9113
       69251           4      0.9179
       69251           5      0.8779
       69251           6      0.8940
       69251           7      0.9560
       69251           8      0.9187
       69251           9      0.9924
       69251          10      0.9152

Target User 69481:
 target_user  other_user  similarity
       69481           1      0.9828
       69481           2      0.9596
       69481           3      0.9554
       69481           4      0.9334
       69481           5      0.9591
       69481           6      0.9429
       69481           7      0.9450
       69481           8      0.9465
       69481           9      0.9590
       69481          10      0.9924

Target User 67075:
 target_user  other_user

2.Calculate the top 20% most similar users to each target user

In [5]:
top20_results = {}
for uid in target_user_ids:
    sims = user_similarity[user_similarity['target_user'] == uid]
    if sims.empty:
        print(f"Warning: No similarity data for user {uid}")
        continue
    
    # Sort by similarity (high to low)
    sims = sims.sort_values('similarity', ascending=False)
    
    # Top 20% (at least 1)
    top_n = max(1, int(len(sims) * 0.20))
    top20 = sims.head(top_n)
    
    # Save results
    top20_results[uid] = {
        "top20_users": top20['other_user'].values,
        "top20_scores": top20['similarity'].values
    }
    

    print(f"\nTarget User {uid}")
    print(f"Top 20% count: {top_n}")
    print("Top 5 most similar users:")
    print(
        top20[['other_user', 'similarity']]
        .head(5)
        .to_string(index=False)
    )


Target User 69251
Top 20% count: 27698
Top 5 most similar users:
 other_user  similarity
      78155         1.0
      78172         1.0
      78177         1.0
      78136         1.0
     138445         1.0

Target User 69481
Top 20% count: 27698
Top 5 most similar users:
 other_user  similarity
      58541         1.0
      33762         1.0
      58662         1.0
      92296         1.0
      64346         1.0

Target User 67075
Top 20% count: 27698
Top 5 most similar users:
 other_user  similarity
        513         1.0
      70211         1.0
      70343         1.0
      86889         1.0
     133857         1.0


3.Predict ratings for unrated items using top 20% similar users

In [6]:
def predict_ratings_sim(uid, top20_data, ratings_df):
    # Movies rated by target user
    rated = set(ratings_df[ratings_df['userId'] == uid]['movieId'])
    
    # Similar users and similarity scores
    nbrs_df = pd.DataFrame({
        'userId': top20_data['top20_users'],
        'similarity': top20_data['top20_scores']
    })
    
    # Merge neighbor similarities with ratings
    merged = ratings_df.merge(nbrs_df, on='userId')
    
    # Exclude movies already rated by target user
    merged = merged[~merged['movieId'].isin(rated)]
    # Weighted rating
    merged['weighted'] = merged['rating'] * merged['similarity']
    
    # Aggregate
    preds = merged.groupby('movieId').agg({
        'weighted': 'sum',
        'similarity': 'sum'
    }).reset_index()
    
    preds['pred'] = (preds['weighted'] / preds['similarity']).clip(1, 5)
    
    # Sort by predicted rating
    preds = preds[['movieId', 'pred']].sort_values('pred', ascending=False)
    # Return full predictions, but print only top 5
    return preds

In [7]:
prediction_results = {}

for target_user in target_user_ids:
    if target_user not in top20_results:
        print(f"\nWarning: No top 20% data found for user {target_user}")
        continue
    
    print(f"\nTarget User {target_user}:")
    preds = predict_ratings_sim(target_user, top20_results[target_user], ratings)
    prediction_results[target_user] = preds
    
   
    print(preds.head(5).to_string(index=False))
    # Save CSV (optional)
    preds.to_csv(f'predictions_user_{target_user}.csv', index=False)


Target User 69251:
 movieId  pred
  129781   5.0
  130644   5.0
  129013   5.0
    8124   5.0
   83258   5.0

Target User 69481:
 movieId  pred
  119065   5.0
  129741   5.0
  130578   5.0
  130996   5.0
  119563   5.0

Target User 67075:
 movieId  pred
      53   5.0
    5977   5.0
  128600   5.0
    5976   5.0
    1364   5.0


4. Calculate DF and DS for each target user using threshold >= 30%

        4.1 Calculating DF

In [8]:
def calculate_discount_factor(uid, top20_data, ratings_df, beta=0.30):
    # Movies rated by target user
    rated = set(ratings_df[ratings_df['userId'] == uid]['movieId'])
    # Similar users and their similarity scores
    discount_df = pd.DataFrame({
        'userId': top20_data['top20_users'],
        'similarity': top20_data['top20_scores']
    })
    
    # Common items and total items per similar user
    user_rtgs = ratings_df[ratings_df['userId'].isin(discount_df['userId'])]
    common = user_rtgs[user_rtgs['movieId'].isin(rated)].groupby('userId').size()
    total = user_rtgs.groupby('userId').size()
    discount_df['discount_factor'] = discount_df['userId'].map(lambda x: common.get(x, 0) / total.get(x, 1))
    
    # Count passing/failing users
    passed = (discount_df['discount_factor'] >= beta).sum()
    failed = (discount_df['discount_factor'] < beta).sum()
    
    # Print summary
    print(f"Target User {uid}:")
    print(f"  Passed threshold (≥{beta}): {passed}")
    print(f"  Below threshold (<{beta}): {failed}")
    print(f"\n  Top 10 users by discount factor:")
    top10 = discount_df.nlargest(10, 'discount_factor')
    print(top10[['userId', 'similarity', 'discount_factor']].to_string(index=False))
    
    return discount_df


In [14]:
#pplying the discount_factor function to all your target users
discount_factor_results = {}
for target_user in target_user_ids:
    if target_user not in top20_results:
        print(f"No top 20% data found for user {target_user}")
        continue
    discount_factor_results[target_user] = calculate_discount_factor(target_user, top20_results[target_user], ratings)
    

Target User 69251:
  Passed threshold (≥0.3): 50
  Below threshold (<0.3): 27648

  Top 10 users by discount factor:
 userId  similarity  discount_factor
  25981      0.9749         0.450000
 104253      0.9757         0.409091
 109722      0.9754         0.392857
  20531      0.9755         0.384615
  94421      0.9818         0.363636
 134724      0.9814         0.363636
 132617      0.9810         0.357143
  30945      0.9834         0.350000
 111569      0.9825         0.350000
 128196      0.9809         0.350000
Target User 69481:
  Passed threshold (≥0.3): 20632
  Below threshold (<0.3): 7066

  Top 10 users by discount factor:
 userId  similarity  discount_factor
  20600      0.9819         1.000000
  77946      0.9680         0.967742
 103742      0.9750         0.965517
  49395      0.9725         0.958333
  53508      0.9845         0.956522
  48575      0.9689         0.956522
  45201      0.9751         0.950000
  81320      0.9729         0.950000
  49947      0.9694     

          4.2 Calculating DS = DF* cos similarity

In [10]:
import glob
ds_results = {}
# Find all discount factor CSV files I have saved before
csv_files = glob.glob("discount_factor_user_*.csv")

for file in csv_files:
    # Extract target user id from filename
    uid = int(file.split('_')[-1].replace('.csv',''))
    # Load discount factor data
    df = pd.read_csv(file)
    # Calculate DS
    df['DS'] = df['similarity'] * df['discount_factor']
    # Save DS to a new CSV (optional)
    df.to_csv(f"DS_user_{uid}.csv", index=False)
    # Store in results dictionary
    ds_results[uid] = df
    # Print top 5 DS for quick check
    print(f"\nTarget User {uid} - Top 5 DS:")
    print(df.nlargest(5, 'DS')[['userId', 'similarity', 'discount_factor', 'DS']].to_string(index=False))


5.get top 20% most similar users list by DS

In [None]:
def top20_DS(df):
    # Sort descending DS 
    df_sorted = df.sort_values(by='DS', ascending=False)
    # Compute top 20% count (at least 1)
    top_n = max(int(len(df_sorted) * 0.2), 1)
    # Return top 20% users
    return df_sorted.head(top_n).reset_index(drop=True)

In [None]:
top20_new_results = {}
for uid, df in ds_results.items():
    top20_df = top20_DS(df)
    top20_new_results[uid] = top20_df
    print(f"\nTarget User {uid} - Top 20% Most Similar Users (by DS):")
    print(top20_df[['userId', 'DS']].to_string(index=False))

6.Use the updatsed DS to predict missing ratings for each target user

In [None]:
def predict_ratings(target_user, ratings_df, top20_df):
    # Movies already rated by target user
    rated_movies = set(ratings_df[ratings_df['userId'] == target_user]['movieId'])
    # Movies to predict (all movies rated by top 20% neighbors but not target user)
    neighbor_ratings = ratings_df[ratings_df['userId'].isin(top20_df['userId'])]
    movies_to_predict = set(neighbor_ratings['movieId']) - rated_movies
    
    predictions = []
    
    for movie in movies_to_predict:
        # Ratings of top neighbors for this movie
        neighbors_who_rated = top20_df.merge(
            neighbor_ratings[neighbor_ratings['movieId'] == movie], 
            on='userId', how='inner'
        )
        
        if len(neighbors_who_rated) == 0:
            continue  # No neighbors rated this movie
        
        # Weighted average using DS
        weighted_sum = (neighbors_who_rated['DS'] * neighbors_who_rated['rating']).sum()
        sum_ds = neighbors_who_rated['DS'].sum()
        predicted_rating = weighted_sum / sum_ds
        
        predictions.append({'movieId': movie, 'predicted_rating': predicted_rating})
    return pd.DataFrame(predictions).sort_values(by='predicted_rating', ascending=False)

In [None]:
predicted_ratings_results = {}
for uid, top20_df in top20_new_results.items():
    predicted_ratings = predict_ratings(uid, ratings, top20_df)
    predicted_ratings_results[uid] = predicted_ratings
    print(f"\nTarget User {uid} - Top 5 Predicted Ratings:")
    print(predicted_ratings.head(5).to_string(index=False))

7.comparison between points 2 & 5 (the top 20% list) and check if the userID belongs to any of the 2 lists

In [None]:
def compare_top20_lists(uid, top20_cosine, top20_ds):
 # Convert cosine similaity results to DataFrame
    df_cos = pd.DataFrame({
        'userId': top20_cosine['top20_users'],
        'cosine_sim': top20_cosine['top20_scores']
    })
    # DS DataFrame 
    df_ds = top20_ds[['userId', 'DS']]
    # Merge both lists
    merged = df_cos.merge(df_ds, on='userId', how='outer')
    # Add flags
    merged['in_cosine_top20'] = merged['cosine_sim'].notna()
    merged['in_DS_top20'] = merged['DS'].notna()
    # Sort for readability 
    merged = merged.sort_values(
        by=['in_DS_top20','DS','cosine_sim'],
        ascending=[False, False, False]
    ).reset_index(drop=True)
    print(f"\nComparison of Top 20% Lists for Target User {uid}:")
    print(merged.to_string(index=False))
    return merged

In [None]:
uid = target_user_ids[0] 
comparison_table = compare_top20_lists(
    uid,
    top20_results[uid],
    top20_new_results[uid]
)

8. comparison shows a few random movies and how the predicted rating differs between predicting with cos similarity and DS

In [None]:
def compare_predictions(target_user, preds_cosine, preds_ds, sample_size=5):
    # Rename columns
    df1 = preds_cosine[['movieId', 'pred']].rename(columns={'pred': 'pred_cosine'})
    df2 = preds_ds[['movieId', 'predicted_rating']].rename(columns={'predicted_rating': 'pred_ds'})
    # Merge on movieId
    merged = df1.merge(df2, on='movieId', how='inner')
    # Sample N rows (random to saw the difference clearly as the first rows are quite similar)
    sample = merged.sample(n=min(sample_size, len(merged)), random_state=42)
    # Print clean table
    print(f"\nComparison for Target User {target_user}:")
    print(sample.to_string(index=False))
    return sample

In [None]:
user_to_compare = target_user_ids[0]   
# Run comparison
comparison_table = compare_predictions(
    target_user=user_to_compare,
    preds_cosine=prediction_results[user_to_compare],
    preds_ds=predicted_ratings_results[user_to_compare]
)

9. Find users with perfect cosine similrities= 1 with each Target User 

In [None]:
def find_perfect_similarity(target_user, sim_df):
    return sim_df[
        (sim_df['target_user'] == target_user) &
        (sim_df['similarity'] == 1.0)
    ]

In [None]:
all_perfect_results = []   # store results to save later
for target_user in target_user_ids:
    perfect = find_perfect_similarity(target_user, user_similarity)
    print(f"\nTarget User {target_user}:")
    if perfect.empty:
        print("  No perfect similarity users.")
        continue
    
    print(f"  Perfect similarity users: {len(perfect)}")
    # Take only first 10
    top10 = perfect.head(10)

    for _, row in top10.iterrows():
        uid = int(row['other_user'])
        rated_count = ratings[ratings['userId'] == uid].shape[0]
        
        print(f"    User {uid} → rated {rated_count} items")
        # Store in table
        all_perfect_results.append({
            "target_user": target_user,
            "perfect_user": uid,
            "rated_items": rated_count
        })

df_save = pd.DataFrame(all_perfect_results)
df_save.to_csv("perfect_similarity_summary.csv", index=False)

10.working on target users to find how many other users in the dataset have rated at least one of the same movies that the target user has rated.

In [None]:
# 1.Store results for all target users
common_users_results = []
for target_user in target_user_ids:
# 2. get the Movies rated by this target user
    target_movies = set(ratings[ratings['userId'] == target_user]['movieId'])
#3. get Users who rated any of these movies
    common_users = (
        ratings[ratings['movieId'].isin(target_movies)]['userId']
        .unique()
    )
    
# 4. Exclude the target user itself
    common_users = [u for u in common_users if u != target_user]
    
    common_users_results.append({
        "target_user": target_user,
        "num_target_movies": len(target_movies),
        "num_common_users": len(common_users)
    })

# Convert to DataFrame for display or CSV
df_common_users = pd.DataFrame(common_users_results)
print(df_common_users)


Menna salem elsayed
Id:221101277

In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
path = r"D:\is\ratings.csv"
df = pd.read_csv(path)

print(df.head())       

   userId  movieId  rating   timestamp
0       1        2     3.5  1112486027
1       1       29     3.5  1112484676
2       1       32     3.5  1112484819
3       1       47     3.5  1112484727
4       1       50     3.5  1112484580


In [2]:
targets = {
    "U1": 69251,
    "U2": 69481,
    "U3": 67075
}

**# CASE STUDY 2 – Mean-Centered Cosine**

In [18]:
#case_study2 Mean-Centered Cosine
user_mean = df.groupby('userId')['rating'].mean()

# 1. Mean-Centered Cosine
def mc_cosine_for_target(df, target_id, user_mean):

    ru = df[df.userId == target_id][['movieId', 'rating']]
    ru = ru.rename(columns={'rating': 'rating_u'})

    merged = df.merge(ru, on='movieId', how='inner')
    merged = merged[merged.userId != target_id]

    mu_u = user_mean.loc[target_id]

    def mc_group(g):
        n_uv = len(g)
        if n_uv < 2:
            return 0.0

        mu_v = user_mean.loc[g.name]

        x = g['rating']    - mu_v   
        y = g['rating_u']  - mu_u   

        num = np.dot(x, y)
        den = np.sqrt((x**2).sum()) * np.sqrt((y**2).sum())

        if den == 0:
            return 0.0

        return num / den

    mc_vals = merged.groupby('userId').apply(mc_group)

    out = mc_vals.reset_index()
    out.columns = ['OtherUserId', 'MC']
    out['TargetUserId'] = target_id
    return out[['TargetUserId', 'OtherUserId', 'MC']]

mc_list = []

for label, uid in targets.items():
    print(f"\nComputing Mean-Centered Cosine for {label} (userId={uid}) ...")
    temp = mc_cosine_for_target(df, uid, user_mean)
    temp['TargetUser'] = label
    mc_list.append(temp)

mc_df = pd.concat(mc_list, ignore_index=True)
mc_df = mc_df[['TargetUser', 'TargetUserId', 'OtherUserId', 'MC']]

mc_df['MC'] = mc_df['MC'].fillna(0.0)

print("\nSample Mean-Centered Cosine results:")
print(mc_df.head())

mc_path = r"D:\is\sectionTWO_case_study2_MC.csv"
mc_df.to_csv(mc_path, index=False)
print("\nMC file saved to:", mc_path)


Computing Mean-Centered Cosine for U1 (userId=69251) ...


  mc_vals = merged.groupby('userId').apply(mc_group)



Computing Mean-Centered Cosine for U2 (userId=69481) ...


  mc_vals = merged.groupby('userId').apply(mc_group)



Computing Mean-Centered Cosine for U3 (userId=67075) ...


  mc_vals = merged.groupby('userId').apply(mc_group)



Sample Mean-Centered Cosine results:
  TargetUser  TargetUserId  OtherUserId        MC
0         U1         69251            1 -0.300013
1         U1         69251            2  0.320597
2         U1         69251            3  0.108990
3         U1         69251            4  0.720669
4         U1         69251            5  0.302579

MC file saved to: D:\is\sectionTWO_case_study2_MC.csv


In [19]:
#2. SELECT TOP 20% NEIGHBORS FOR EACH TARGET USER (by MC)

neighbors_list = []

for label in mc_df['TargetUser'].unique():
    sub = mc_df[(mc_df['TargetUser'] == label) & (mc_df['MC'] > 0)]

    n = len(sub)
    if n == 0:
        print(f"Warning: no positive MC neighbors for {label}")
        continue

    k = max(1, int(np.ceil(0.2 * n)))  

    top_k = sub.sort_values('MC', ascending=False).head(k)
    neighbors_list.append(top_k)

neighbors_mc = pd.concat(neighbors_list, ignore_index=True)

print("\nSample of top 20% MC neighbors:")
print(neighbors_mc.head())

neighbors_path = r"D:\is\sectionTWO_case_study2_top20_neighbors_MC.csv"
neighbors_mc.to_csv(neighbors_path, index=False)
print("\nTop-20% MC neighbors file saved to:", neighbors_path)



Sample of top 20% MC neighbors:
  TargetUser  TargetUserId  OtherUserId   MC
0         U1         69251       138440  1.0
1         U1         69251        79335  1.0
2         U1         69251       107400  1.0
3         U1         69251        17405  1.0
4         U1         69251       115032  1.0

Top-20% MC neighbors file saved to: D:\is\sectionTWO_case_study2_top20_neighbors_MC.csv


In [20]:
neighbors_mc = pd.read_csv(
    r"D:\is\sectionTWO_case_study2_top20_neighbors_MC.csv"
)

In [21]:
#  3. PREDICT MISSING RATINGS USING MC

item_mean   = df.groupby('movieId')['rating'].mean()
global_mean = df['rating'].mean()

pred_list = []

for label, u in targets.items():

    print(f"Fast predicting (MC) for {label} (userId={u})")

    neigh = neighbors_mc[neighbors_mc.TargetUserId == u][
        ['OtherUserId','MC']
    ]
    if neigh.empty:
        continue

    rated_items = df.loc[df.userId == u, 'movieId']

    temp = df[
        df.userId.isin(neigh.OtherUserId) &
        (~df.movieId.isin(rated_items))
    ][['userId','movieId','rating']].copy()

    if temp.empty:
        continue

    temp['sim']  = temp['userId'].map(
        neigh.set_index('OtherUserId')['MC']
    )
    temp['mu_v'] = temp['userId'].map(user_mean)

    temp.dropna(inplace=True)

    temp['wd'] = temp['sim'] * (temp['rating'] - temp['mu_v'])

    g = temp.groupby('movieId').agg(
        num=('wd','sum'),
        den=('sim', lambda x: np.abs(x).sum())
    )
    g['PredRating'] = user_mean[u] + g['num'] / g['den']
    g['PredRating'].fillna(item_mean, inplace=True)
    g['PredRating'].fillna(user_mean[u], inplace=True)
    g['PredRating'].fillna(global_mean, inplace=True)

    g = g.reset_index()
    g['TargetUser']   = label
    g['TargetUserId'] = u

    pred_list.append(
        g[['TargetUser','TargetUserId','movieId','PredRating']]
    )

pred_df_mc = pd.concat(pred_list, ignore_index=True)

print("\nSample MC predictions:")
print(pred_df_mc.head())

pred_df_mc.to_csv(
    r"D:\is\sectionTWO_case_study2_predictions_MC.csv",
    index=False
)


Fast predicting (MC) for U1 (userId=69251)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  g['PredRating'].fillna(item_mean, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  g['PredRating'].fillna(user_mean[u], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting v

Fast predicting (MC) for U2 (userId=69481)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  g['PredRating'].fillna(item_mean, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  g['PredRating'].fillna(user_mean[u], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting v

Fast predicting (MC) for U3 (userId=67075)

Sample MC predictions:
  TargetUser  TargetUserId  movieId  PredRating
0         U1         69251        2    2.685367
1         U1         69251        3    2.453100
2         U1         69251        4    2.248016
3         U1         69251        5    2.372975
4         U1         69251        6    3.012407


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  g['PredRating'].fillna(item_mean, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  g['PredRating'].fillna(user_mean[u], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting v

In [22]:
#4. compute DF and DS
user_n   = df.groupby('userId')['movieId'].nunique()
user_mean = df.groupby('userId')['rating'].mean()
beta = 0.30  


def mc_df_for_target_with_DF_DS(df, target_id, beta, user_n, user_mean):

    ru = df[df.userId == target_id][['movieId', 'rating']]
    ru = ru.rename(columns={'rating': 'rating_u'})

    merged = df.merge(ru, on='movieId', how='inner')
    merged = merged[merged.userId != target_id]

    n_u = user_n.loc[target_id]
    threshold = beta * n_u          
    mu_u = user_mean.loc[target_id] 

    def mc_group(g):
        n_uv = len(g)

        if n_uv < 2:
            mc = 0.0
        else:
            mu_v = user_mean.loc[g.name]   

            x = g['rating'].to_numpy()   - mu_v 
            y = g['rating_u'].to_numpy() - mu_u  

            num = (x * y).sum()
            den = np.sqrt((x**2).sum()) * np.sqrt((y**2).sum())

            if den == 0:
                mc = 0.0
            else:
                mc = num / den

        if threshold > 0:
            df_val = min(1.0, n_uv / threshold)
        else:
            df_val = 0.0

        ds_val = mc * df_val

        return pd.Series({
            'Overlap': n_uv,
            'MC': mc,
            'DF': df_val,
            'DS': ds_val
        })

    stats = merged.groupby('userId').apply(mc_group).reset_index()
    stats = stats.rename(columns={'userId': 'OtherUserId'})
    stats['TargetUserId'] = target_id

    return stats[['TargetUserId', 'OtherUserId', 'Overlap', 'MC', 'DF', 'DS']]

results = []

for label, uid in targets.items():
    print(f"Computing MC + DF + DS for {label} (userId={uid}) ...")
    temp = mc_df_for_target_with_DF_DS(df, uid, beta, user_n, user_mean)
    temp['TargetUser'] = label
    results.append(temp)

df_ds_mc = pd.concat(results, ignore_index=True)

df_ds_mc = df_ds_mc[['TargetUser', 'TargetUserId', 'OtherUserId',
                     'Overlap', 'MC', 'DF', 'DS']]

print("\nSample (MC + DF + DS):")
print(df_ds_mc.head())

output_path = r"D:\is\sectionTWO_case_study2_MC_DF_DS.csv"
df_ds_mc.to_csv(output_path, index=False)

print(output_path)
print("Total rows:", len(df_ds_mc))

Computing MC + DF + DS for U1 (userId=69251) ...


  stats = merged.groupby('userId').apply(mc_group).reset_index()


Computing MC + DF + DS for U2 (userId=69481) ...


  stats = merged.groupby('userId').apply(mc_group).reset_index()


Computing MC + DF + DS for U3 (userId=67075) ...


  stats = merged.groupby('userId').apply(mc_group).reset_index()



Sample (MC + DF + DS):
  TargetUser  TargetUserId  OtherUserId  Overlap        MC       DF        DS
0         U1         69251            1      9.0 -0.300013  0.20979 -0.062940
1         U1         69251            2      4.0  0.320597  0.09324  0.029893
2         U1         69251            3     16.0  0.108990  0.37296  0.040649
3         U1         69251            4      4.0  0.720669  0.09324  0.067195
4         U1         69251            5      8.0  0.302579  0.18648  0.056425
D:\is\sectionTWO_case_study2_MC_DF_DS.csv
Total rows: 409181


In [23]:
#5.Select TOP 20% neighbors by DS
df_ds_mc = pd.read_csv(
    r"D:\is\sectionTWO_case_study2_MC_DF_DS.csv"
)

top_neighbors_ds_mc = []

for label in df_ds_mc['TargetUser'].unique():
    sub = df_ds_mc[
        (df_ds_mc['TargetUser'] == label) &
        (df_ds_mc['DS'] > 0)
    ]

    if sub.empty:
        print(f"No DS-MC neighbors for {label}")
        continue

    k = max(1, int(np.ceil(0.2 * len(sub))))

    top_k = sub.sort_values('DS', ascending=False).head(k)
    top_neighbors_ds_mc.append(top_k)

neighbors_ds_mc = pd.concat(top_neighbors_ds_mc, ignore_index=True)

print("\nSample TOP 20% DS-MC neighbors:")
print(neighbors_ds_mc.head())

output_path = r"D:\is\sectionTWO_case_study2_top20_neighbors_DS_MC.csv"
neighbors_ds_mc.to_csv(output_path, index=False)

print("\nSaved TOP 20% DS-MC neighbors to:")
print(output_path)
print("Total neighbors:", len(neighbors_ds_mc))



Sample TOP 20% DS-MC neighbors:
  TargetUser  TargetUserId  OtherUserId  Overlap        MC   DF        DS
0         U1         69251        69254     59.0  0.716112  1.0  0.716112
1         U1         69251        45650     64.0  0.643277  1.0  0.643277
2         U1         69251        91020     55.0  0.628524  1.0  0.628524
3         U1         69251        69662     60.0  0.625327  1.0  0.625327
4         U1         69251       131077     45.0  0.615498  1.0  0.615498

Saved TOP 20% DS-MC neighbors to:
D:\is\sectionTWO_case_study2_top20_neighbors_DS_MC.csv
Total neighbors: 63072


In [24]:
#6.Predict ratings 20% similar users.
neighbors_ds_mc = pd.read_csv(
    r"D:\is\sectionTWO_case_study2_top20_neighbors_DS_MC.csv"
)

user_mean   = df.groupby('userId')['rating'].mean()
item_mean   = df.groupby('movieId')['rating'].mean()
global_mean = df['rating'].mean()

pred_list = []

for label, u in targets.items():

    print(f"DS-MC predicting for {label} (userId={u})")

    neigh = neighbors_ds_mc[
        neighbors_ds_mc.TargetUserId == u
    ][['OtherUserId','DS']]

    if neigh.empty:
        continue

    rated_movies = df.loc[df.userId == u, 'movieId']

    temp = df[
        df.userId.isin(neigh.OtherUserId) &
        (~df.movieId.isin(rated_movies))
    ][['userId','movieId','rating']].copy()

    if temp.empty:
        continue

    temp['sim']  = temp['userId'].map(
        neigh.set_index('OtherUserId')['DS']
    )
    temp['mu_v'] = temp['userId'].map(user_mean)
    temp.dropna(inplace=True)

    temp['wd'] = temp['sim'] * (temp['rating'] - temp['mu_v'])

    g = temp.groupby('movieId').agg(
        num=('wd','sum'),
        den=('sim', lambda x: np.abs(x).sum())
    )

    g['PredRating'] = user_mean[u] + g['num'] / g['den']
    g['PredRating'].fillna(item_mean, inplace=True)
    g['PredRating'].fillna(user_mean[u], inplace=True)
    g['PredRating'].fillna(global_mean, inplace=True)

    g = g.reset_index()
    g['TargetUser']   = label
    g['TargetUserId'] = u

    pred_list.append(
        g[['TargetUser','TargetUserId','movieId','PredRating']]
    )

pred_df_ds_mc = pd.concat(pred_list, ignore_index=True)

print("\nSample DS-MC predictions:")
print(pred_df_ds_mc.head())

pred_df_ds_mc.to_csv(
    r"D:\is\sectionTWO_case_study2_predictions_DS_MC.csv",
    index=False
)

print("\nSaved DS-MC predictions to:",
      r"D:\is\sectionTWO_case_study2_predictions_DS_MC.csv")


DS-MC predicting for U1 (userId=69251)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  g['PredRating'].fillna(item_mean, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  g['PredRating'].fillna(user_mean[u], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting v

DS-MC predicting for U2 (userId=69481)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  g['PredRating'].fillna(item_mean, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  g['PredRating'].fillna(user_mean[u], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting v

DS-MC predicting for U3 (userId=67075)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  g['PredRating'].fillna(item_mean, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  g['PredRating'].fillna(user_mean[u], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting v


Sample DS-MC predictions:
  TargetUser  TargetUserId  movieId  PredRating
0         U1         69251        2    2.541863
1         U1         69251        3    2.482303
2         U1         69251        4    2.187732
3         U1         69251        5    2.398940
4         U1         69251        6    3.172348

Saved DS-MC predictions to: D:\is\sectionTWO_case_study2_predictions_DS_MC.csv


In [25]:
#7.Compare the top-k user lists from steps 2 and 5 and 
import pandas as pd

mc_neighbors = pd.read_csv(
    r"D:\is\sectionTWO_case_study2_top20_neighbors_MC.csv"
)

ds_neighbors_mc = pd.read_csv(
    r"D:\is\sectionTWO_case_study2_top20_neighbors_DS_MC.csv"
)

summary_rows = []

for label in sorted(mc_neighbors['TargetUser'].unique()):
    mc_sub = mc_neighbors[mc_neighbors['TargetUser'] == label]
    ds_sub = ds_neighbors_mc[ds_neighbors_mc['TargetUser'] == label]

    mc_set = set(mc_sub['OtherUserId'])
    ds_set = set(ds_sub['OtherUserId'])

    common   = mc_set & ds_set
    only_mc  = mc_set - ds_set
    only_ds  = ds_set - mc_set

    summary_rows.append({
        "TargetUser": label,
        "MC_count": len(mc_set),
        "DS_MC_count": len(ds_set),
        "Common_neighbors": len(common),
        "Only_in_MC": len(only_mc),
        "Only_in_DS_MC": len(only_ds)
    })

summary_df_mc = pd.DataFrame(summary_rows)

print("\nSummary comparison (MC vs DS-MC neighbors):")
print(summary_df_mc)



Summary comparison (MC vs DS-MC neighbors):
  TargetUser  MC_count  DS_MC_count  Common_neighbors  Only_in_MC  \
0         U1     16570        16570              2325       14245   
1         U2     23303        23304              5015       18288   
2         U3     23197        23198              4732       18465   

   Only_in_DS_MC  
0          14245  
1          18289  
2          18466  


In [26]:
#8.Compare predictions from steps 3 and 6
import pandas as pd

pred_mc = pd.read_csv(
    r"D:\is\sectionTWO_case_study2_predictions_MC.csv"
)

pred_mc = pred_mc.rename(columns={
    "PredRating": "PredRating_MC"
})

pred_ds_mc = pd.read_csv(
    r"D:\is\sectionTWO_case_study2_predictions_DS_MC.csv"
)

pred_ds_mc = pred_ds_mc.rename(columns={
    "PredRating": "PredRating_DS_MC"
})

pred_compare_mc = pred_mc.merge(
    pred_ds_mc,
    on=["TargetUser", "TargetUserId", "movieId"],
    how="inner"
)

pred_compare_mc["Diff_DS_MC_minus_MC"] = (
    pred_compare_mc["PredRating_DS_MC"]
    - pred_compare_mc["PredRating_MC"]
)

print("Comparison MC vs DS-MC:")
print(pred_compare_mc.head())

pred_compare_mc.to_csv(
    r"D:\is\sectionTWO_case_study2_prediction_comparison_MC.csv",
    index=False
)


Comparison MC vs DS-MC:
  TargetUser  TargetUserId  movieId  PredRating_MC  PredRating_DS_MC  \
0         U1         69251        2       2.685367          2.541863   
1         U1         69251        3       2.453100          2.482303   
2         U1         69251        4       2.248016          2.187732   
3         U1         69251        5       2.372975          2.398940   
4         U1         69251        6       3.012407          3.172348   

   Diff_DS_MC_minus_MC  
0            -0.143505  
1             0.029203  
2            -0.060284  
3             0.025966  
4             0.159941  


In [None]:
import pandas as pd

#9 flip from high raw Cosine to MC 

# 1) Raw cosine similarities
raw_cos = pd.read_csv(
    r"D:\is\section2_neighborhood_cf\user_raw_cosine_similarity.csv"
)
print("Raw Cosine columns before rename:", raw_cos.columns)

# Assume the first 3 columns are:

raw_cos = raw_cos.rename(columns={
    raw_cos.columns[0]: "TargetUserId",
    raw_cos.columns[1]: "OtherUserId",
    raw_cos.columns[2]: "COS"
})

print("Raw Cosine columns after rename:", raw_cos.columns)

# 2) Mean-centered cosine similarities (from your case study 2 code)
mc_df = pd.read_csv(
    r"D:\is\section2_neighborhood_cf\sectionTWO_case_study2_MC.csv"
)
print("MC columns:", mc_df.columns)
# Expected: ['TargetUser', 'TargetUserId', 'OtherUserId', 'MC']

# 3) Merge raw cosine and MC on user pair (TargetUserId, OtherUserId)
sim_compare = raw_cos.merge(
    mc_df[["TargetUserId", "OtherUserId", "MC"]],
    on=["TargetUserId", "OtherUserId"],
    how="inner"
)

# 4) Users with high raw cosine but strongly negative mean-centered cosine
flipped_users = sim_compare[
    (sim_compare["COS"] > 0.8) &      # highly +ve raw Cosine
    (sim_compare["MC"]  <= -0.99)     # almost -1 after mean-centering
]

print("\nUsers flipped from high Cosine to MC ≈ -1:")
print(flipped_users.head())

# 5) Users that stay highly +ve in BOTH measures (more reliable neighbors)
consistent_users = sim_compare[
    (sim_compare["COS"] > 0.8) &
    (sim_compare["MC"]  > 0.8)
]

print("\nUsers with high raw Cosine AND high MC (more reliable):")
print(consistent_users.head())

Raw Cosine columns before rename: Index(['target_user', 'other_user', 'similarity'], dtype='object')
Raw Cosine columns after rename: Index(['TargetUserId', 'OtherUserId', 'COS'], dtype='object')
MC columns: Index(['TargetUser', 'TargetUserId', 'OtherUserId', 'MC'], dtype='object')

Users flipped from high Cosine to MC ≈ -1:
     TargetUserId  OtherUserId     COS        MC
29          69251           31  1.0000 -1.000000
50          69251           52  1.0000 -1.000000
174         69251          182  0.9487 -0.999986
283         69251          301  1.0000 -1.000000
513         69251          551  0.9037 -0.995252

Users with high raw Cosine AND high MC (more reliable):
     TargetUserId  OtherUserId     COS        MC
8           69251            9  0.9924  0.826231
55          69251           57  0.9806  0.952341
62          69251           65  1.0000  0.917178
78          69251           81  0.9949  0.970077
153         69251          159  1.0000  0.876624


In [None]:
#

user_stats = df.groupby("userId")["rating"].agg(
    num_ratings="count",
    mean_rating="mean"
).reset_index()

# Favorites-only users
favorites_only = user_stats[
    (user_stats["num_ratings"] < 30) &
    (user_stats["mean_rating"] >= 4.0)
]

# Full-list users
full_list_users = user_stats[
    (user_stats["num_ratings"] >= 200)
]

print("Favorites-only users:")
print(favorites_only.head())

print("\nFull-list users:")
print(full_list_users.head())


# MEAN-CENTERED COSINE SIMILARITY

mc_df = pd.read_csv(
    r"D:\is\section2_neighborhood_cf\sectionTWO_case_study2_MC.csv"
)

fav_ids  = set(favorites_only["userId"])
full_ids = set(full_list_users["userId"])

avg_mc_fav  = mc_df[mc_df.TargetUserId.isin(fav_ids)]["MC"].mean()
avg_mc_full = mc_df[mc_df.TargetUserId.isin(full_ids)]["MC"].mean()

print("\nAverage MC similarity (favorites-only users):", avg_mc_fav)
print("Average MC similarity (full-list users):", avg_mc_full)


Favorites-only users:
     userId  num_ratings  mean_rating
16       17           26     4.038462
56       57           24     4.104167
61       62           27     4.388889
80       81           23     4.739130
112     113           28     4.392857

Full-list users:
    userId  num_ratings  mean_rating
6        7          276     3.289855
10      11          504     3.945437
13      14          243     3.751029
23      24          506     3.332016
30      31          246     3.376016

Average MC similarity (favorites-only users): nan
Average MC similarity (full-list users): 0.2004802713302753


# CASE STUDY 3 – Pearson Correlation 

In [36]:
#1.Use Pearson Correlation Coefficient (PCC)
def pcc_for_target(df, target_id):

    ru = df[df.userId == target_id][['movieId', 'rating']]
    ru = ru.rename(columns={'rating': 'rating_u'})

    merged = df.merge(ru, on='movieId', how='inner')
    merged = merged[merged.userId != target_id]

    def pcc_pair(g):
        n_uv = len(g)
        if n_uv < 2:
            return 0.0

        x = g['rating'].to_numpy()   
        y = g['rating_u'].to_numpy()  

        mu_x = x.mean()
        mu_y = y.mean()

        num = ((x - mu_x) * (y - mu_y)).sum()
        den = np.sqrt(((x - mu_x) ** 2).sum()) * \
              np.sqrt(((y - mu_y) ** 2).sum())

        if den == 0:
            return 0.0

        return num / den
    pcc_vals = merged.groupby('userId').apply(pcc_pair)

    out = pcc_vals.reset_index()
    out.columns = ['OtherUserId', 'PCC']
    out['TargetUserId'] = target_id
    return out[['TargetUserId', 'OtherUserId', 'PCC']]

pcc_list = []

for label, uid in targets.items():
    print(f"\nComputing PCC (slide-style) for {label} (userId={uid}) ...")
    temp = pcc_for_target(df, uid)
    temp['TargetUser'] = label
    pcc_list.append(temp)

pcc_df = pd.concat(pcc_list, ignore_index=True)
pcc_df = pcc_df[['TargetUser', 'TargetUserId', 'OtherUserId', 'PCC']]

pcc_df['PCC'] = pcc_df['PCC'].fillna(0.0)

print("\nSample PCC (slide-style) results:")
print(pcc_df.head())

pcc_path = r"D:\is\sectionTWO_case_study3_pcc.csv"
pcc_df.to_csv(pcc_path, index=False)
print("\nPCC file saved to:", pcc_path)



Computing PCC (slide-style) for U1 (userId=69251) ...


  pcc_vals = merged.groupby('userId').apply(pcc_pair)



Computing PCC (slide-style) for U2 (userId=69481) ...


  pcc_vals = merged.groupby('userId').apply(pcc_pair)



Computing PCC (slide-style) for U3 (userId=67075) ...


  pcc_vals = merged.groupby('userId').apply(pcc_pair)



Sample PCC (slide-style) results:
  TargetUser  TargetUserId  OtherUserId       PCC
0         U1         69251            1 -0.302416
1         U1         69251            2  0.471405
2         U1         69251            3  0.187205
3         U1         69251            4  0.730297
4         U1         69251            5  0.346569

PCC file saved to: D:\is\sectionTWO_case_study3_pcc.csv


In [37]:
#compute PCC for all target users 
pcc_list = []

for label, uid in targets.items():
    print(f"\nComputing PCC for {label} (userId={uid}) ...")
    temp = pcc_for_target(df, uid)
    temp['TargetUser'] = label
    pcc_list.append(temp)

pcc_df = pd.concat(pcc_list, ignore_index=True)
pcc_df = pcc_df[['TargetUser', 'TargetUserId', 'OtherUserId', 'PCC']]

pcc_df['PCC'] = pcc_df['PCC'].fillna(0.0)

print("\nSample PCC results ")
print(pcc_df.head())

pcc_path = r"D:\is\sectionTWO_case_study3_pcc.csv"
pcc_df.to_csv(pcc_path, index=False)
print("\nPCC file saved to:", pcc_path)


Computing PCC for U1 (userId=69251) ...


  pcc_vals = merged.groupby('userId').apply(pcc_pair)



Computing PCC for U2 (userId=69481) ...


  pcc_vals = merged.groupby('userId').apply(pcc_pair)



Computing PCC for U3 (userId=67075) ...


  pcc_vals = merged.groupby('userId').apply(pcc_pair)



Sample PCC results 
  TargetUser  TargetUserId  OtherUserId       PCC
0         U1         69251            1 -0.302416
1         U1         69251            2  0.471405
2         U1         69251            3  0.187205
3         U1         69251            4  0.730297
4         U1         69251            5  0.346569

PCC file saved to: D:\is\sectionTWO_case_study3_pcc.csv


In [38]:
#2. SELECT TOP 20% NEIGHBORS FOR EACH TARGET USER
neighbors_list = []

for label in pcc_df['TargetUser'].unique():
    sub = pcc_df[(pcc_df['TargetUser'] == label) & (pcc_df['PCC'] > 0)]

    n = len(sub)
    if n == 0:
        print(f"Warning: no positive neighbors for {label}")
        continue

    k = max(1, int(np.ceil(0.2 * n)))  

    top_k = sub.sort_values('PCC', ascending=False).head(k)
    neighbors_list.append(top_k)

neighbors_df = pd.concat(neighbors_list, ignore_index=True)

print("\nSample of top 20% neighbors:")
print(neighbors_df.head())

neighbors_path = r"D:\is\sectionTWO_case_study3_top20_neighbors.csv"
neighbors_df.to_csv(neighbors_path, index=False)
print("\nTop-20% neighbors file saved to:", neighbors_path)


Sample of top 20% neighbors:
  TargetUser  TargetUserId  OtherUserId  PCC
0         U1         69251        60782  1.0
1         U1         69251       114139  1.0
2         U1         69251        51257  1.0
3         U1         69251        74949  1.0
4         U1         69251         1818  1.0

Top-20% neighbors file saved to: D:\is\sectionTWO_case_study3_top20_neighbors.csv


In [39]:
#3.Predict the missing ratings 
# top-20% neighbors from Step 2
neighbors_pcc = pd.read_csv(
    r"D:\is\sectionTWO_case_study3_top20_neighbors.csv"
)


In [40]:
user_mean = df.groupby('userId')['rating'].mean()
item_mean = df.groupby('movieId')['rating'].mean()
global_mean = df['rating'].mean()

In [41]:

all_items = df['movieId'].unique()
pred_list = []

for label, u in targets.items():

    print(f"predicting for {label} (userId={u})")
    neigh = neighbors_pcc[neighbors_pcc.TargetUserId == u][
        ['OtherUserId','PCC']
    ]
    if neigh.empty:
        continue

    rated_items = df.loc[df.userId == u, 'movieId']

    temp = df[
        df.userId.isin(neigh.OtherUserId) &
        (~df.movieId.isin(rated_items))
    ][['userId','movieId','rating']].copy()

    if temp.empty:
        continue
    temp['sim']  = temp['userId'].map(neigh.set_index('OtherUserId')['PCC'])
    temp['mu_v'] = temp['userId'].map(user_mean)

    temp.dropna(inplace=True)

    temp['wd'] = temp['sim'] * (temp['rating'] - temp['mu_v'])

    g = temp.groupby('movieId').agg(
        num=('wd','sum'),
        den=('sim', lambda x: np.abs(x).sum())
    )

    g['PredRating'] = user_mean[u] + g['num'] / g['den']

    g['PredRating'].fillna(item_mean, inplace=True)
    g['PredRating'].fillna(user_mean[u], inplace=True)
    g['PredRating'].fillna(global_mean, inplace=True)

    g = g.reset_index()
    g['TargetUser'] = label
    g['TargetUserId'] = u

    pred_list.append(
        g[['TargetUser','TargetUserId','movieId','PredRating']]
    )


pred_df = pd.concat(pred_list, ignore_index=True)

print(pred_df.head())

pred_df.to_csv(
    r"D:\is\sectionTWO_case_study3_predictions_all_items_pcc.csv",
    index=False
)



predicting for U1 (userId=69251)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  g['PredRating'].fillna(item_mean, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  g['PredRating'].fillna(user_mean[u], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting v

predicting for U2 (userId=69481)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  g['PredRating'].fillna(item_mean, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  g['PredRating'].fillna(user_mean[u], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting v

predicting for U3 (userId=67075)
  TargetUser  TargetUserId  movieId  PredRating
0         U1         69251        2    2.604532
1         U1         69251        3    2.422639
2         U1         69251        4    2.187283
3         U1         69251        5    2.336086
4         U1         69251        6    2.985428


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  g['PredRating'].fillna(item_mean, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  g['PredRating'].fillna(user_mean[u], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting v

In [42]:
user_n = df.groupby('userId')['movieId'].nunique()
beta = 0.30   

# 4. compute DF and DS
def pcc_df_for_target_with_DF_DS(df, target_id, beta, user_n):

    ru = df[df.userId == target_id][['movieId', 'rating']]
    ru = ru.rename(columns={'rating': 'rating_u'})

    merged = df.merge(ru, on='movieId', how='inner')
    merged = merged[merged.userId != target_id]

    n_u = user_n.loc[target_id]
    threshold = beta * n_u  

    def pcc_group(g):
        n_uv = len(g) 

        if n_uv < 2:
            pcc = 0.0
        else:
            x = g['rating'].to_numpy()
            y = g['rating_u'].to_numpy()

            x_mean = x.mean()
            y_mean = y.mean()

            num = ((x - x_mean) * (y - y_mean)).sum()
            den = np.sqrt(((x - x_mean)**2).sum()) * \
                  np.sqrt(((y - y_mean)**2).sum())

            if den == 0:
                pcc = 0.0
            else:
                pcc = num / den
        if threshold > 0:
            df_val = min(1.0, n_uv / threshold)
        else:
            df_val = 0.0

        ds_val = pcc * df_val

        return pd.Series({
            'Overlap': n_uv,
            'PCC': pcc,
            'DF': df_val,
            'DS': ds_val
        })

    stats = merged.groupby('userId').apply(pcc_group).reset_index()
    stats = stats.rename(columns={'userId': 'OtherUserId'})
    stats['TargetUserId'] = target_id

    return stats[['TargetUserId', 'OtherUserId', 'Overlap', 'PCC', 'DF', 'DS']]


In [43]:
results = []

for label, uid in targets.items():
    print(f"Computing PCC + DF + DS for {label} (userId={uid}) ...")
    temp = pcc_df_for_target_with_DF_DS(df, uid, beta, user_n)
    temp['TargetUser'] = label
    results.append(temp)

df_ds = pd.concat(results, ignore_index=True)

df_ds = df_ds[['TargetUser', 'TargetUserId', 'OtherUserId',
               'Overlap', 'PCC', 'DF', 'DS']]

print("\nSample (PCC + DF + DS):")
print(df_ds.head())

output_path = r"D:\is\sectionTWO_case_study3_PCC_DF_DS.csv"
df_ds.to_csv(output_path, index=False)

print("\nSaved PCC + DF + DS to:")
print(output_path)
print("Total rows:", len(df_ds))

Computing PCC + DF + DS for U1 (userId=69251) ...


  stats = merged.groupby('userId').apply(pcc_group).reset_index()


Computing PCC + DF + DS for U2 (userId=69481) ...


  stats = merged.groupby('userId').apply(pcc_group).reset_index()


Computing PCC + DF + DS for U3 (userId=67075) ...


  stats = merged.groupby('userId').apply(pcc_group).reset_index()



Sample (PCC + DF + DS):
  TargetUser  TargetUserId  OtherUserId  Overlap       PCC       DF        DS
0         U1         69251            1      9.0 -0.302416  0.20979 -0.063444
1         U1         69251            2      4.0  0.471405  0.09324  0.043954
2         U1         69251            3     16.0  0.187205  0.37296  0.069820
3         U1         69251            4      4.0  0.730297  0.09324  0.068093
4         U1         69251            5      8.0  0.346569  0.18648  0.064628

Saved PCC + DF + DS to:
D:\is\sectionTWO_case_study3_PCC_DF_DS.csv
Total rows: 409181


In [44]:
df_ds = pd.read_csv(r"D:\is\sectionTWO_case_study3_PCC_DF_DS.csv")

In [45]:
# 5.Select TOP 20% neighbors by DS

top_neighbors_ds = []

for label in df_ds['TargetUser'].unique():
    sub = df_ds[
        (df_ds['TargetUser'] == label) &
        (df_ds['DS'] > 0)
    ]

    if sub.empty:
        print(f"No DS neighbors for {label}")
        continue
    k = max(1, int(np.ceil(0.2 * len(sub))))

    top_k = sub.sort_values('DS', ascending=False).head(k)
    top_neighbors_ds.append(top_k)

neighbors_ds = pd.concat(top_neighbors_ds, ignore_index=True)

print("\nSample TOP 20% DS neighbors:")
print(neighbors_ds.head())

output_path = r"D:\is\sectionTWO_case_study3_top20_neighbors_DS.csv"
neighbors_ds.to_csv(output_path, index=False)

print("\nSaved TOP 20% DS neighbors to:")
print(output_path)
print("Total neighbors:", len(neighbors_ds))



Sample TOP 20% DS neighbors:
  TargetUser  TargetUserId  OtherUserId  Overlap       PCC        DF        DS
0         U1         69251        69254     59.0  0.702030  1.000000  0.702030
1         U1         69251        45650     64.0  0.642619  1.000000  0.642619
2         U1         69251        29507     40.0  0.688991  0.932401  0.642415
3         U1         69251        69662     60.0  0.629643  1.000000  0.629643
4         U1         69251       105654     45.0  0.624470  1.000000  0.624470

Saved TOP 20% DS neighbors to:
D:\is\sectionTWO_case_study3_top20_neighbors_DS.csv
Total neighbors: 63202


In [46]:
# 6. prediction using Discounted Similarity (DS) 

import numpy as np
import pandas as pd

neighbors_ds = pd.read_csv(
    r"D:\is\sectionTWO_case_study3_top20_neighbors_DS.csv"
)

user_mean   = df.groupby('userId')['rating'].mean()
item_mean   = df.groupby('movieId')['rating'].mean()
global_mean = df['rating'].mean()

pred_list = []

for label, u in targets.items():

    print(f"DS predicting for {label} (userId={u})")

    neigh = neighbors_ds[
        neighbors_ds.TargetUserId == u
    ][['OtherUserId','DS']]

    if neigh.empty:
        continue

    rated_movies = df.loc[df.userId == u, 'movieId']

    temp = df[
        df.userId.isin(neigh.OtherUserId) &
        (~df.movieId.isin(rated_movies))
    ][['userId','movieId','rating']].copy()

    if temp.empty:
        continue

    temp['sim']  = temp['userId'].map(
        neigh.set_index('OtherUserId')['DS']
    )
    temp['mu_v'] = temp['userId'].map(user_mean)
    temp.dropna(inplace=True)

    temp['wd'] = temp['sim'] * (temp['rating'] - temp['mu_v'])

    g = temp.groupby('movieId').agg(
        num=('wd','sum'),
        den=('sim', lambda x: np.abs(x).sum())
    )

    g['PredRating'] = user_mean[u] + g['num'] / g['den']
    g['PredRating'].fillna(item_mean, inplace=True)
    g['PredRating'].fillna(user_mean[u], inplace=True)
    g['PredRating'].fillna(global_mean, inplace=True)

    g = g.reset_index()
    g['TargetUser']   = label
    g['TargetUserId'] = u

    pred_list.append(
        g[['TargetUser','TargetUserId','movieId','PredRating']]
    )

pred_df_ds = pd.concat(pred_list, ignore_index=True)

print(pred_df_ds.head())

pred_df_ds.to_csv(
    r"D:\is\sectionTWO_case_study3_predictions_DS.csv",
    index=False
)


DS predicting for U1 (userId=69251)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  g['PredRating'].fillna(item_mean, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  g['PredRating'].fillna(user_mean[u], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting v

DS predicting for U2 (userId=69481)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  g['PredRating'].fillna(item_mean, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  g['PredRating'].fillna(user_mean[u], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting v

DS predicting for U3 (userId=67075)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  g['PredRating'].fillna(item_mean, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  g['PredRating'].fillna(user_mean[u], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting v

  TargetUser  TargetUserId  movieId  PredRating
0         U1         69251        2    2.458276
1         U1         69251        3    2.415207
2         U1         69251        4    2.176109
3         U1         69251        5    2.336873
4         U1         69251        6    3.176577


In [47]:
neighbors_ds = pd.read_csv(
    r"D:\is\sectionTWO_case_study3_top20_neighbors_DS.csv"
)

In [None]:
user_mean = df.groupby('userId')['rating'].mean()
item_mean = df.groupby('movieId')['rating'].mean()
global_mean = df['rating'].mean()

In [48]:
#7- Compare similarity lists from steps 2 and 5. 
pcc_neighbors = pd.read_csv(
    r"D:\is\sectionTWO_case_study3_top20_neighbors.csv"
)

ds_neighbors = pd.read_csv(
    r"D:\is\sectionTWO_case_study3_top20_neighbors_DS.csv"
)

summary_rows = []

for label in sorted(pcc_neighbors['TargetUser'].unique()):
    pcc_sub = pcc_neighbors[pcc_neighbors['TargetUser'] == label]
    ds_sub  = ds_neighbors[ds_neighbors['TargetUser'] == label]

    pcc_set = set(pcc_sub['OtherUserId'])
    ds_set  = set(ds_sub['OtherUserId'])

    common = pcc_set & ds_set
    only_pcc = pcc_set - ds_set
    only_ds  = ds_set - pcc_set

    summary_rows.append({
        "TargetUser": label,
        "PCC_count": len(pcc_set),
        "DS_count": len(ds_set),
        "Common_neighbors": len(common),
        "Only_in_PCC": len(only_pcc),
        "Only_in_DS": len(only_ds)
    })

summary_df = pd.DataFrame(summary_rows)

print("\nSummary comparison (PCC vs DS neighbors):")
print(summary_df)



Summary comparison (PCC vs DS neighbors):
  TargetUser  PCC_count  DS_count  Common_neighbors  Only_in_PCC  Only_in_DS
0         U1      16478     16478              1291        15187       15187
1         U2      23826     23826              4217        19609       19609
2         U3      22898     22898              3603        19295       19295


In [49]:
import pandas as pd

pred_pcc = pd.read_csv(
    r"D:\is\sectionTWO_case_study3_predictions_all_items_pcc.csv"
)

pred_pcc = pred_pcc.rename(columns={
    "PredRating": "PredRating_PCC"
})

pred_ds = pd.read_csv(
    r"D:\is\sectionTWO_case_study3_predictions_DS.csv"
)

pred_ds = pred_ds.rename(columns={
    "PredRating": "PredRating_DS"
})

pred_compare = pred_pcc.merge(
    pred_ds,
    on=["TargetUser", "TargetUserId", "movieId"],
    how="inner"
)

pred_compare["Diff_DS_minus_PCC"] = (
    pred_compare["PredRating_DS"]
    - pred_compare["PredRating_PCC"]
)

print("Comparison PCC vs DS:")
print(pred_compare.head())

pred_compare.to_csv(
    r"D:\is\sectionTWO_case_study3_prediction_comparison.csv",
    index=False
)



Comparison PCC vs DS:
  TargetUser  TargetUserId  movieId  PredRating_PCC  PredRating_DS  \
0         U1         69251        2        2.604532       2.458276   
1         U1         69251        3        2.422639       2.415207   
2         U1         69251        4        2.187283       2.176109   
3         U1         69251        5        2.336086       2.336873   
4         U1         69251        6        2.985428       3.176577   

   Diff_DS_minus_PCC  
0          -0.146256  
1          -0.007432  
2          -0.011174  
3           0.000787  
4           0.191150  


In [None]:
#9.Find users whose PCC < 0 but Cosine > 0, and inspect them
raw_cos = pd.read_csv(
    r"D:\is\section2_neighborhood_cf\user_raw_cosine_similarity.csv"
)
print("Raw cosine cols:", raw_cos.columns)

raw_cos = raw_cos.rename(columns={
    raw_cos.columns[0]: "TargetUserId",
    raw_cos.columns[1]: "OtherUserId",
    raw_cos.columns[2]: "COS"
})

# load PCC (case study 3) 
pcc_df = pd.read_csv(
    r"D:\is\sectionTWO_case_study3_pcc.csv"
)
pcc_df = pcc_df[["TargetUserId", "OtherUserId", "PCC"]]

# merge cosine + PCC on user pair 
sim_compare = raw_cos.merge(
    pcc_df,
    on=["TargetUserId", "OtherUserId"],
    how="inner"
)

# pairs where cosine > 0 but PCC < 0 
neg_pcc_pos_cos = sim_compare[
    (sim_compare["COS"] > 0) &
    (sim_compare["PCC"] < 0)
]

print("\nPairs with positive Cosine but negative PCC:")
print(neg_pcc_pos_cos.head())

if not neg_pcc_pos_cos.empty:
    pair = neg_pcc_pos_cos.iloc[0]
    u = pair["TargetUserId"]
    v = pair["OtherUserId"]
    print(f"\nExample pair TargetUserId={u}, OtherUserId={v}")

    common = df[df.userId.isin([u, v])]
    pivot = common.pivot_table(
        index="movieId",
        columns="userId",
        values="rating"
    ).dropna()

    print("\nCommon-item ratings for this pair:")
    print(pivot.head())


Raw cosine cols: Index(['target_user', 'other_user', 'similarity'], dtype='object')

Pairs with positive Cosine but negative PCC:
    TargetUserId  OtherUserId     COS       PCC
0          69251            1  0.8934 -0.302416
9          69251           10  0.9152 -0.322749
18         69251           19  0.9121 -0.188982
30         69251           32  0.8716 -0.270031
31         69251           33  0.8108 -0.037037

Example pair TargetUserId=69251.0, OtherUserId=1.0

Common-item ratings for this pair:
userId   1      69251
movieId              
1079       4.0    1.0
2100       4.0    3.0
2174       4.0    3.0
2253       3.5    1.0
2628       4.0    1.0


In [None]:
#10.Does PCC make sense when overlap ≤ 20% of items?
# ratings per user
user_n = df.groupby("userId")["movieId"].nunique()

# PCC + Overlap from your DF/DS file
df_ds = pd.read_csv(
    r"D:\is\sectionTWO_case_study3_PCC_DF_DS.csv"
)  

# add how many items each user rated
df_ds["n_u"] = df_ds["TargetUserId"].map(user_n)
df_ds["n_v"] = df_ds["OtherUserId"].map(user_n)

# overlap ratios for each user
df_ds["overlap_ratio_u"] = df_ds["Overlap"] / df_ds["n_u"]
df_ds["overlap_ratio_v"] = df_ds["Overlap"] / df_ds["n_v"]

# pairs where each user shares <= 20% of their items with the other
low_overlap_pairs = df_ds[
    (df_ds["overlap_ratio_u"] <= 0.2) &
    (df_ds["overlap_ratio_v"] <= 0.2)
]

print("\nLow-overlap pairs (≤20% each):")
print(
    low_overlap_pairs[
        ["TargetUserId", "OtherUserId",
         "Overlap", "overlap_ratio_u", "overlap_ratio_v", "PCC"]
    ].head()
)

print("\nNumber of low-overlap pairs:", len(low_overlap_pairs))
print("Average |PCC| for low-overlap pairs:",
      low_overlap_pairs["PCC"].abs().mean())



Low-overlap pairs (≤20% each):
   TargetUserId  OtherUserId  Overlap  overlap_ratio_u  overlap_ratio_v  \
0         69251            1      9.0         0.062937         0.051429   
1         69251            2      4.0         0.027972         0.065574   
2         69251            3     16.0         0.111888         0.085561   
3         69251            4      4.0         0.027972         0.142857   
4         69251            5      8.0         0.055944         0.121212   

        PCC  
0 -0.302416  
1  0.471405  
2  0.187205  
3  0.730297  
4  0.346569  

Number of low-overlap pairs: 118724
Average |PCC| for low-overlap pairs: 0.3855910157032506
