# PCA with Mean Filling
Eyad Medhat 221100279/ Hady Aly 221101190 / Mohamed Mahfouz 221101743 / Omar Mady 221100745

Task 1: Calculate Average Rating for Target Items I1 and I2

In [2]:
from utils import *

In [3]:
# 1. Load the Target Items (I1, I2) from the previous results
target_items_path = os.path.join('..', 'results', 'tables', 'lowest_two_rateditems.csv')
print(f"Loading target items from: {target_items_path}")

if os.path.exists(target_items_path):
    target_items_df = pd.read_csv(target_items_path)
    print("Target Items Loaded:")
    print(target_items_df)
    
    # Access by position to ensure we get the first two rows regardless of index
    I1 = target_items_df.iloc[0]['movieId']
    I2 = target_items_df.iloc[1]['movieId']
    print(f"Target Item 1 (I1): {I1}")
    print(f"Target Item 2 (I2): {I2}")
else:
    print("ERROR: Target items file not found. Make sure previous steps were run.")

Loading target items from: ..\results\tables\lowest_two_rateditems.csv
Target Items Loaded:
   movieId  mean_rating_per_movie  rating_count_per_movie
0     1556               1.919431                     422
1     1499               2.059603                     453
Target Item 1 (I1): 1556.0
Target Item 2 (I2): 1499.0


In [4]:
# 2. Load the Ratings Data
ratings_df = load_data()

if ratings_df is None:
    print("Trying raw ratings...")
    ratings_df = utils.load_data(os.path.join('ml-20m', 'ratings.csv'))

if ratings_df is not None:
    print(f"Ratings loaded. Shape: {ratings_df.shape}")
else:
    print("FAILED to load ratings data.")

 Found cached sample at: ..\data\ml-20m\ratings_cleaned_sampled.csv
Ratings loaded. Shape: (1000000, 3)


In [6]:
# 3. Calculate Average Rating for I1 and I2
if ratings_df is not None and 'I1' in locals():
    # Filter for I1
    i1_ratings = ratings_df[ratings_df['movieId'] == I1]
    i1_mean = i1_ratings['rating'].mean()
    i1_count = len(i1_ratings)
    
    # Filter for I2
    i2_ratings = ratings_df[ratings_df['movieId'] == I2]
    i2_mean = i2_ratings['rating'].mean()
    i2_count = len(i2_ratings)
    
    print(f"\n--- Calculated Stats ---")
    print(f"Item I1 (ID: {I1}): Mean Rating = {i1_mean:.4f}, Count = {i1_count}")
    print(f"Item I2 (ID: {I2}): Mean Rating = {i2_mean:.4f}, Count = {i2_count}")
    
    # Verification against loaded values
    print(f"\n--- Verification ---")
    print(f"Stored I1 Mean: {target_items_df.iloc[0]['mean_rating_per_movie']}")
    print(f"Stored I2 Mean: {target_items_df.iloc[1]['mean_rating_per_movie']}")
    
    # --- SAVE RESULT ---
    task1_data = [
        {'movieId': I1, 'mean_rating': i1_mean, 'count': i1_count},
        {'movieId': I2, 'mean_rating': i2_mean, 'count': i2_count}
    ]
    task1_df = pd.DataFrame(task1_data)
    save_csv(task1_df, 'task3.2.1.csv')
    
else:
    print("Cannot calculate stats: Missing data.")


--- Calculated Stats ---
Item I1 (ID: 1556.0): Mean Rating = 1.9194, Count = 422
Item I2 (ID: 1499.0): Mean Rating = 2.0596, Count = 453

--- Verification ---
Stored I1 Mean: 1.919431279620853
Stored I2 Mean: 2.0596026490066226
    Saved CSV: tables/task3.2.1.csv


## Task 2: Mean Filling and Dataset Augmentation
Fill missing ratings for Target Items I1 and I2 with their mean value (1.0) and save the augmented dataset.

In [7]:
augmented_df = None # Initialize
if ratings_df is not None and 'I1' in locals():
    print("Augmenting dataset with filled ratings for I1 and I2...")
    
    all_users = ratings_df['userId'].unique()
    print(f"Total unique users: {len(all_users)}")
    
    new_rows = []
    
    # --- Process I1 ---
    # Find users who rated I1
    users_rated_i1 = set(ratings_df[ratings_df['movieId'] == I1]['userId'].unique())
    # Find missing
    users_missing_i1 = [u for u in all_users if u not in users_rated_i1]
    print(f"Users missing rating for I1: {len(users_missing_i1)}")
    
    # Create rows for missing I1
    for u in users_missing_i1:
        new_rows.append({'userId': u, 'movieId': I1, 'rating': i1_mean})
        
    # --- Process I2 ---
    # Find users who rated I2
    users_rated_i2 = set(ratings_df[ratings_df['movieId'] == I2]['userId'].unique())
    # Find missing
    users_missing_i2 = [u for u in all_users if u not in users_rated_i2]
    print(f"Users missing rating for I2: {len(users_missing_i2)}")
    
    # Create rows for missing I2
    for u in users_missing_i2:
        new_rows.append({'userId': u, 'movieId': I2, 'rating': i2_mean})
    
    print(f"Creating dataframe for {len(new_rows)} new ratings...")
    new_ratings_df = pd.DataFrame(new_rows)
    
    # Concatenate
    print("Appending to original dataset...")
    augmented_df = pd.concat([ratings_df, new_ratings_df], ignore_index=True)
    
    # Sort for tidiness (optional)
    augmented_df.sort_values(by=['userId', 'movieId'], inplace=True)
    
    print(f"Original Shape: {ratings_df.shape}")
    print(f"Augmented Shape: {augmented_df.shape}")
    
    # Verify counts
    final_i1_count = len(augmented_df[augmented_df['movieId'] == I1])
    final_i2_count = len(augmented_df[augmented_df['movieId'] == I2])
    print(f"Final Count I1: {final_i1_count} (Should be {len(all_users)}) ")
    print(f"Final Count I2: {final_i2_count} (Should be {len(all_users)}) ")
    
    # --- SAVE RESULT ---
    # Saving to Results folder as requested: 'ratings_filled_targets.csv'
    output_filename = 'ratings_filled_targets.csv'
    output_path = os.path.join('..', 'results', 'tables', output_filename)
    
    # Ensure dir exists
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    
    print(f"Saving augmented dataset to {output_path}...")
    augmented_df.to_csv(output_path, index=False)
    print("Save Complete.")
    
else:
    print("Skipping augmentation due to missing data.")

Augmenting dataset with filled ratings for I1 and I2...
Total unique users: 96345
Users missing rating for I1: 95923
Users missing rating for I2: 95892
Creating dataframe for 191815 new ratings...
Appending to original dataset...
Original Shape: (1000000, 3)
Augmented Shape: (1191815, 3)
Final Count I1: 96345 (Should be 96345) 
Final Count I2: 96345 (Should be 96345) 
Saving augmented dataset to ..\results\tables\ratings_filled_targets.csv...
Save Complete.


## Task 3: Calculate Average Ratings from Augmented Data
Calculate the mean rating for each item in the new augmented dataset.

In [9]:
final_stats = None # Initialize
if augmented_df is not None:
    print("Calculating stats for augmented dataset...")
    
    # Calculate stats
    final_stats = augmented_df.groupby('movieId')['rating'].agg(['mean', 'count']).reset_index()
    final_stats.rename(columns={'mean': 'mean_rating', 'count': 'rating_count'}, inplace=True)
    
    print("Stats calculated.")
    print(final_stats.head())
    
    # Verification for I1 and I2
    if 'I1' in locals():
        i1_final = final_stats[final_stats['movieId'] == I1]
        print(f"\nFinal Stats I1:\n{i1_final}")
        
        i2_final = final_stats[final_stats['movieId'] == I2]
        print(f"Final Stats I2:\n{i2_final}")
        
    # --- SAVE RESULT ---
    print("Saving final item stats...")
    save_csv(final_stats, 'task3.2.3_final_item_stats.csv')
    print("Task 3 Complete.")
    
else:
    print("Augmented dataframe not available.")

Calculating stats for augmented dataset...
Stats calculated.
   movieId  mean_rating  rating_count
0      1.0     3.915355          3875
1      2.0     3.202874          1740
2      3.0     3.209073          1014
3      5.0     3.061538           975
4      6.0     3.770856          1846

Final Stats I1:
     movieId  mean_rating  rating_count
429   1556.0     1.919431         96345
Final Stats I2:
     movieId  mean_rating  rating_count
422   1499.0     2.059603         96345
Saving final item stats...
    Saved CSV: tables/task3.2.3_final_item_stats.csv
Task 3 Complete.


## Task 4: Calculate Diffrence between Ratings and Mean Rating
Load the augmented ratings and final item stats from CSVs, then calculate the difference.

In [11]:
# Load data from results folder
augmented_csv_path = os.path.join('..', 'results', 'tables', 'ratings_filled_targets.csv')
stats_csv_path = os.path.join('..', 'results', 'tables', 'task3.2.3_final_item_stats.csv')

print("Loading files for Task 4...")

has_data_task4 = True
if os.path.exists(augmented_csv_path):
    # Use float32 for ratings to save memory if dataset is large
    # Reading only necessary cols if we were pipelining, but here we read all
    task4_ratings = pd.read_csv(augmented_csv_path)
    print(f"Loaded Augmented Ratings. Shape: {task4_ratings.shape}")
else:
    print(f"Error: {augmented_csv_path} not found.")
    has_data_task4 = False
    
if os.path.exists(stats_csv_path):
    task4_stats = pd.read_csv(stats_csv_path)
    print(f"Loaded Item Stats. Shape: {task4_stats.shape}")
else:
    print(f"Error: {stats_csv_path} not found.")
    has_data_task4 = False

if has_data_task4:
    print("Calculating rating differences (Centering)...")
    
    # Merge
    # We want merged_df to contain 'mean_rating' from task4_stats matching 'movieId'
    merged_df = task4_ratings.merge(task4_stats[['movieId', 'mean_rating']], on='movieId', how='left')
    
    # Calculate difference
    merged_df['rating_diff'] = merged_df['rating'] - merged_df['mean_rating']
    
    print("Difference calculated.")
    print(merged_df[['userId', 'movieId', 'rating', 'mean_rating', 'rating_diff']].head())
    
    # Verify centering (Mean difference should be close to 0)
    avg_diff = merged_df['rating_diff'].mean()
    print(f"\nAverage Global Difference (should be ~0): {avg_diff:.6f}")
    
    # --- SAVE RESULT ---
    print("Saving centered ratings...")
    save_csv(merged_df, 'task3.2.4_centered_ratings.csv')
    print("Task 4 Complete.")
    
else:
    print("Cannot proceed with Task 4 due to missing files.")

Loading files for Task 4...
Loaded Augmented Ratings. Shape: (1191815, 3)
Loaded Item Stats. Shape: (1000, 3)
Calculating rating differences (Centering)...
Difference calculated.
   userId  movieId  rating  mean_rating  rating_diff
0       1     32.0     4.0     3.881264     0.118736
1       1    337.0     4.0     3.755682     0.244318
2       1   1193.0     4.0     4.191716    -0.191716
3       1   1261.0     4.0     3.848649     0.151351
4       1   1370.0     3.0     3.454844    -0.454844

Average Global Difference (should be ~0): -0.000000
Saving centered ratings...
    Saved CSV: tables/task3.2.4_centered_ratings.csv
Task 4 Complete.


## Task 5: Calculate Partial Covariance (Targets vs All)
Compute the covariance where we calculate Cov(Target, Item_j) for all Items j.
We use the helper function from utils to do this efficiently.

In [14]:
centered_ratings_path = os.path.join('..', 'results', 'tables', 'task3.2.4_centered_ratings.csv')
target_items_path = os.path.join('..', 'results', 'tables', 'lowest_two_rateditems.csv')

if os.path.exists(centered_ratings_path) and os.path.exists(target_items_path):
    # 1. Load Target Items to identify I1 and I2
    target_items_df = pd.read_csv(target_items_path)
    I1 = target_items_df.iloc[0]['movieId']
    I2 = target_items_df.iloc[1]['movieId']
    print(f"Target Items IDs: {I1}, {I2}")

    # 2. Load Centered Ratings
    print("Loading dataset...")
    # Need all data to calculate correlation against everything
    df = pd.read_csv(centered_ratings_path, usecols=['userId', 'movieId', 'rating_diff'])
    
    # 3. Calculate Target Covariances using Utils
    print("Calculating partial covariance matrix (2 x All Items)...")
    
    targets = [I1, I2]
    partial_cov_df = calculate_target_covariance(df, targets)
    
    print("Calculation Complete.")
    print(f"Result Shape: {partial_cov_df.shape}")
    print("Sample (first 5 cols):")
    print(partial_cov_df.iloc[:, :5])
    
    # Task 5 & 6 effectively merge here because Task 6 was just "Generate Matrix"
    # But to separate them as requested:
    
    # Task 5 Save: We can save this intermediate result
    output_filename = '3.2.5_target_only_covariances.csv'
    output_path = os.path.join('..', 'results', 'tables', output_filename)
    print(f"Saving intermediate partial covariance to {output_path}...")
    partial_cov_df.to_csv(output_path)
    print("Task 5 Complete.")
    
else:
    print("Required files not found.")

Target Items IDs: 1556.0, 1499.0
Loading dataset...
Calculating partial covariance matrix (2 x All Items)...
Starting MANUAL covariance calculation...
Total N (Users): 96345
Total Items: 1000
Users who rated targets: 96345
Building efficient lookup dictionary...
Lookup built. Calculating sums...
Processing Target Item: 1556.0...
Processing Target Item: 1499.0...
Formatting results...
Calculation Complete.
Result Shape: (2, 1000)
Sample (first 5 cols):
             1.0      2.0       3.0       5.0       6.0
1556.0  0.000013  0.00007 -0.000016  0.000042  0.000027
1499.0  0.000058  0.00010  0.000024  0.000046  0.000014
Saving intermediate partial covariance to ..\results\tables\3.2.5_target_only_covariances.csv...
Task 5 Complete.


## Task 6: Reference Full Covariance Matrix Calculation
Calculates the FULL N x N covariance matrix for all items using sparse algebra and saves it as a compressed .npz file.

In [None]:
centered_path = os.path.join('..', 'results', 'tables', 'task3.2.4_centered_ratings.csv')

if os.path.exists(centered_path):
    print("Loading centered ratings for Full Matrix calculation...")
    df_full = pd.read_csv(centered_path, usecols=['userId', 'movieId', 'rating_diff'])
    
    print("Calculating Full Sparse Matrix (N x N)...")
    sparse_cov, movie_ids = utils.calculate_full_covariance_sparse(df_full)
    
    print(f"Matrix Shape: {sparse_cov.shape}")
    
    # Save Matrix (Sparse)
    out_path = os.path.join('..', 'results', 'tables', '3.2.6_full_covariance.npz')
    print(f"Saving Sparse Matrix to {out_path}...")
    sparse.save_npz(out_path, sparse_cov)
    
    # Save ID Mapping
    id_map_path = os.path.join('..', 'results', 'tables', '3.2.6_full_covariance_ids.csv')
    pd.DataFrame(movie_ids, columns=['movieId']).to_csv(id_map_path, index=False)
    
    # --- MASSIVE CSV WRITE WITH CHUNKING (PARTIAL 60%) ---
    full_csv_path = os.path.join('..', 'results', 'tables', '3.2.6_full_covariance_partial.csv')
    print(f"Writing PARTIAL (60%) Full Matrix to CSV: {full_csv_path}")
    print("Writing in chunks...")
    
    chunk_size = 1000
    num_items = sparse_cov.shape[0]
    limit_items = int(num_items * 0.60) # 60% Limit
    
    print(f"Total items: {num_items}. Writing first {limit_items} items.")
    
    start_time = time.time()
    
    for i in range(0, limit_items, chunk_size):
        end_i = min(i + chunk_size, limit_items)
        
        # Extract chunk and make dense
        chunk_dense = sparse_cov[i:end_i].toarray()
        
        # Create DF
        # Index is subset of IDs, Columns is ALL IDs
        chunk_index = movie_ids[i:end_i]
        chunk_df = pd.DataFrame(chunk_dense, index=chunk_index, columns=movie_ids)
        
        # Write mode: 'w' for first chunk, 'a' for others
        # Header: True for first chunk, False for others
        if i == 0:
            chunk_df.to_csv(full_csv_path, mode='w', header=True)
        else:
            chunk_df.to_csv(full_csv_path, mode='a', header=False)
            
        # Progress log
        if i % 3000 == 0:
            elapsed = time.time() - start_time
            print(f"  Written rows {i} to {end_i} / {limit_items} ({elapsed:.1f}s)")
    
    print(f"CSV Write Complete. Time: {time.time() - start_time:.1f}s")
    print("Task 6 Complete.")
    
else:
    print("Input file not found.")

Loading centered ratings for Full Matrix calculation...
Calculating Full Sparse Matrix (N x N)...
Preparing for Full Sparse Covariance Calculation...
Dimensions: 96345 Users x 1000 Items
Constructing User-Item Sparse Matrix...
Computing X.T @ X ... (This may take a moment)
Dividing by N-1...
Matrix Shape: (1000, 1000)
Saving Sparse Matrix to ..\results\tables\3.2.6_full_covariance.npz...


NameError: name 'sparse' is not defined