In [8]:
import os
import re
from collections import defaultdict


import numpy as np
import pandas as pd


from sklearn.model_selection import train_test_split
from sklearn.metrics import (
roc_auc_score,
roc_curve,
confusion_matrix,
accuracy_score,
precision_score,
recall_score,
f1_score,
classification_report,
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline


import matplotlib.pyplot as plt
import seaborn as sns


sns.set(style="whitegrid")

In [9]:
# ----------------------
# Configuration
# ----------------------
MOVIES_FILE = '/kaggle/input/datatrw/data2.csv'
RATINGS_FILE = '/kaggle/input/datatrw/data3.csv'
USERS_FILE = '/kaggle/input/datatrw/data4.csv'
OUTPUT_DIR = '/kaggle/working/'

RATING_THRESHOLD = 4 # rating >= 4 => liked (positive class)
TEST_SIZE = 0.2
RANDOM_STATE = 42


os.makedirs(OUTPUT_DIR, exist_ok=True)

In [10]:
# ----------------------
# Utility functions
# ----------------------

def load_movies(path):
    """Load movies file; expects MovieID::Title::Genres"""
    df = pd.read_csv(path, sep="::", engine="python", header=None, names=["movieId", "title", "genres"])
    # extract year from title if available
    def extract_year(t):
        m = re.search(r"\((\d{4})\)", str(t))
        return int(m.group(1)) if m else np.nan

    df["year"] = df["title"].apply(extract_year)
    return df


def load_ratings(path):
    """Load ratings file; expects UserID::MovieID::Rating::Timestamp"""
    df = pd.read_csv(path, sep="::", engine="python", header=None, names=["userId", "movieId", "rating", "timestamp"])
    return df


def load_users(path):
    """Load users file; expects UserID::Gender::Age::Occupation::Zip-code"""
    df = pd.read_csv(path, sep="::", engine="python", header=None, names=["userId", "gender", "age", "occupation", "zip"])
    return df

In [11]:
# ----------------------
# Load data
# ----------------------
print("Loading data...")
movies = load_movies(MOVIES_FILE)
ratings = load_ratings(RATINGS_FILE)
users = load_users(USERS_FILE)

print(f"movies: {movies.shape}, ratings: {ratings.shape}, users: {users.shape}")

Loading data...
movies: (3883, 4), ratings: (1000209, 4), users: (6040, 5)


In [35]:
#import pandas as pd
#import matplotlib.pyplot as plt
#import os 

# ---------- Basic EDA & Preprocessing ----------
def basic_eda(movies_path, ratings_path, OUTPUT_DIR):
    # Load datasets correctly (double-colon separated)
    movies = pd.read_csv(
        movies_path,
        sep="::",
        names=["movieId", "title", "genres"],
        engine="python"
    )
    ratings = pd.read_csv(
        ratings_path,
        sep="::",
        names=["userId", "movieId", "rating", "timestamp"],
        engine="python"
    )

    # Ensure movieId types match for merging
    movies["movieId"] = movies["movieId"].astype(int)
    ratings["movieId"] = ratings["movieId"].astype(int)

    # Create output directory if not exists
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # --------- Basic statistics ----------
    stats = {
        "num_movies": movies["movieId"].nunique(),
        "num_users": ratings["userId"].nunique(),
        "num_ratings": len(ratings),
        "rating_min": ratings["rating"].min(),
        "rating_max": ratings["rating"].max(),
        "rating_mean": ratings["rating"].mean(),
        "rating_median": ratings["rating"].median()
    }

    pd.Series(stats).to_frame("value").to_csv(
        os.path.join(OUTPUT_DIR, "basic_stats.csv")
    )

    # --------- Rating distribution ----------
    plt.figure(figsize=(6, 4))
    ratings["rating"].value_counts().sort_index().plot(kind="bar", color="skyblue")
    plt.title("Rating Distribution")
    plt.xlabel("Rating")
    plt.ylabel("Count")
    plt.tight_layout()
    plt.savefig(os.path.join(OUTPUT_DIR, "rating_distribution.png"))
    plt.close()

    # --------- Top movies by rating count ----------
    top_movies_by_count = (
        ratings.groupby("movieId").size().reset_index(name="count").sort_values(by="count", ascending=False).head(20)
    )

    # Merge with movie names -------------------
    
    top_movies = movies.merge(top_movies_by_count, on="movieId", how="inner")
    top_movies.to_csv(os.path.join(OUTPUT_DIR, "top_movies_by_count.csv"), index=False)

    # Plot-------------------------------------
    
    plt.figure(figsize=(10, 5))
    plt.barh(top_movies["title"], top_movies["count"], color="lightcoral")
    plt.title("Top 20 Movies by Rating Count")
    plt.xlabel("Number of Ratings")
    plt.ylabel("Movie Title")
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.savefig(os.path.join(OUTPUT_DIR, "top_movies_by_count.png"))
    plt.close()

    # --------- Ratings per user ----------
    user_rating_count = (
        ratings.groupby("userId").size().reset_index(name="count")
    )
    plt.figure(figsize=(6, 4))
    plt.hist(user_rating_count["count"], bins=50, color="lightgreen")
    plt.title("Ratings per User Distribution")
    plt.xlabel("Number of Ratings")
    plt.ylabel("Users Count")
    plt.tight_layout()
    plt.savefig(os.path.join(OUTPUT_DIR, "ratings_per_user.png"))
    plt.close()

    print(f"âœ… EDA output saved to '{OUTPUT_DIR}' successfully!")


# ---------- Example usage ----------
# basic_eda("path_to/movies.dat", "path_to/ratings.dat", "OUTPUT")

In [15]:
# ---------- Merge and Feature Engineering (Clean + Optimized) ----------
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

print("ðŸ”§ Cleaning, merging, and engineering features...")

# --- Clean unwanted quotes or spaces in all object columns ---
for df_ in [users, movies, ratings]:
    for col in df_.select_dtypes(include="object").columns:
        df_[col] = df_[col].astype(str).str.strip().str.replace('"', '', regex=False)

# --- Ensure consistent numeric column types before merging ---
ratings["movieId"] = ratings["movieId"].astype(int)
movies["movieId"] = movies["movieId"].astype(int)
ratings["userId"] = ratings["userId"].astype(int)
users["userId"] = users["userId"].astype(int)
if "age" in users.columns:
    users["age"] = users["age"].fillna(0).astype(int)

# --- Handle missing genres and create one-hot encoded genre features ---
movies["genres"] = movies["genres"].fillna("")
genre_dummies = movies["genres"].str.get_dummies(sep="|").add_prefix("genre_")
movies = pd.concat([movies, genre_dummies], axis=1)

# --- Merge ratings + movies + users ---
df = (
    ratings
    .merge(movies, on="movieId", how="left")
    .merge(users, on="userId", how="left")
)

# --- Create binary target label (liked = 1 if rating >= threshold) ---
RATING_THRESHOLD = 3.5
df["liked"] = (df["rating"] >= RATING_THRESHOLD).astype(int)

# --- Encode gender ---
df["gender"] = df["gender"].fillna("M")
le_gender = LabelEncoder()
df["gender_enc"] = le_gender.fit_transform(df["gender"])

# --- Encode occupation (if categorical) ---
if df["occupation"].dtype == "object":
    df["occupation"] = df["occupation"].fillna("Unknown")
    le_occ = LabelEncoder()
    df["occupation_enc"] = le_occ.fit_transform(df["occupation"])
    occ_col = "occupation_enc"
else:
    df["occupation"] = df["occupation"].fillna(0).astype(int)
    occ_col = "occupation"

# --- Compute movie and user rating counts ---
movie_rating_count = df.groupby("movieId")["rating"].count().reset_index(name="movie_rating_count")
user_rating_count = df.groupby("userId")["rating"].count().reset_index(name="user_rating_count")

df = (
    df.merge(movie_rating_count, on="movieId", how="left")
      .merge(user_rating_count, on="userId", how="left")
)

# --- Define feature columns ---
genre_cols = [c for c in df.columns if c.startswith("genre_")]
feature_cols = ["gender_enc", "age", occ_col, "movie_rating_count", "user_rating_count"] + genre_cols

# --- Fill missing numeric values and scale features ---
for col in feature_cols:
    if df[col].dtype == "object":
        df[col] = df[col].fillna("Unknown")
    else:
        df[col] = df[col].fillna(0)

scaler = StandardScaler()
df[["movie_rating_count", "user_rating_count", "age"]] = scaler.fit_transform(
    df[["movie_rating_count", "user_rating_count", "age"]]
)

# --- Prepare final X and y ---
X = df[feature_cols]
y = df["liked"]

print("âœ… Feature engineering completed successfully!")
print(f"Final dataset shape: {df.shape}")
print(f"Feature columns ({len(feature_cols)}): {feature_cols[:10]} ...")
print(f"Target distribution:\n{df['liked'].value_counts(normalize=True)}")


ðŸ”§ Cleaning, merging, and engineering features...
âœ… Feature engineering completed successfully!
Final dataset shape: (1000209, 189)
Feature columns (179): ['gender_enc', 'age', 'occupation', 'movie_rating_count', 'user_rating_count', 'genre_Action', 'genre_Action,,,,,,', 'genre_Action,,,,,,,', 'genre_Action,,,,,,,,', 'genre_Action,,,,,,,,,'] ...
Target distribution:
liked
1    0.575161
0    0.424839
Name: proportion, dtype: float64


In [16]:
# ----------------------
# Train/test split
# ----------------------
print("Splitting into train/test...")
train_df, test_df = train_test_split(df, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=df["liked"])

print("Train:", train_df.shape, "Test:", test_df.shape)

# ----------------------
# Baseline: Popularity predictor
# ----------------------
print("Training popularity baseline...")
most_common_label = train_df["liked"].mode()[0]

def predict_popularity(df_):
    return np.full(len(df_), most_common_label)

# evaluate function

def evaluate_binary(y_true, y_pred, y_score=None, model_name="model"):
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    auc = roc_auc_score(y_true, y_score) if y_score is not None else None
    cm = confusion_matrix(y_true, y_pred)
    return {"model": model_name, "accuracy": acc, "precision": prec, "recall": rec, "f1": f1, "auc": auc, "confusion_matrix": cm}

results = []

pop_pred = predict_popularity(test_df)
# for popularity we don't have a score; we'll use proportion of positives in train as score
pop_score = np.full(len(test_df), train_df["liked"].mean())
res_pop = evaluate_binary(test_df["liked"], pop_pred, y_score=pop_score, model_name="popularity")
results.append(res_pop)

# save confusion matrix image
cm = res_pop["confusion_matrix"]
plt.figure()
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.title("Popularity baseline Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.savefig(os.path.join(OUTPUT_DIR, "cm_popularity.png"))
plt.close()


Splitting into train/test...
Train: (800167, 189) Test: (200042, 189)
Training popularity baseline...


In [25]:
#import numpy as np
#import pandas as pd
#from sklearn.neighbors import NearestNeighbors

# --------------------------------------------------------------------------
                        #collaborative filtering
# --------------------------------------------------------------------------

print("Building user-item matrix for CF...")
user_item = train_df.pivot_table(index="userId", columns="movieId", values="rating")
user_item_filled = user_item.fillna(0)

print("Training user-based kNN model (cosine similarity)...")
user_nn = NearestNeighbors(metric="cosine", algorithm="brute", n_neighbors=30, n_jobs=-1)
user_nn.fit(user_item_filled.values)

# Precompute neighbors for all users
print("Precomputing neighbors...")
distances, neighbor_indices = user_nn.kneighbors(user_item_filled.values, n_neighbors=30)

# Map userId to row index
user_index_map = {uid: idx for idx, uid in enumerate(user_item_filled.index)}

# Precompute movie average ratings for fallback
movie_stats = train_df.groupby('movieId')['rating'].agg(movie_avg_rating='mean').reset_index()
movie_avg_map = movie_stats.set_index('movieId')['movie_avg_rating'].to_dict()
global_mean = train_df['rating'].mean()

print("Predicting test set with fast user-based CF...")

def fast_predict_user_cf(test_df, k=20):
    preds = np.zeros(len(test_df))
    
    for i, (uid, mid) in enumerate(zip(test_df['userId'], test_df['movieId'])):
        if uid not in user_index_map:
            # Cold-start user
            preds[i] = movie_avg_map.get(mid, global_mean)
            continue
        
        uidx = user_index_map[uid]
        neighbors = neighbor_indices[uidx][:k]
        neighbor_user_ids = user_item_filled.index[neighbors]

        # Collect neighbor ratings for this movie
        ratings = []
        for nu in neighbor_user_ids:
            if mid in user_item.columns and not pd.isna(user_item.loc[nu, mid]):
                ratings.append(user_item.loc[nu, mid])
        
        if ratings:
            preds[i] = np.mean(ratings)
        else:
            # Cold-start movie
            preds[i] = movie_avg_map.get(mid, global_mean)
    
    return preds

# Run fast prediction
user_cf_pred_ratings = fast_predict_user_cf(test_df)
user_cf_pred_labels = (user_cf_pred_ratings >= RATING_THRESHOLD).astype(int)

# Evaluate as before
res_user_cf = evaluate_binary(test_df['liked'], user_cf_pred_labels, 
                              y_score=user_cf_pred_ratings, model_name='user_cf')
results.append(res_user_cf)


Building user-item matrix for CF...
Training user-based kNN model (cosine similarity)...
Precomputing neighbors...
Predicting test set with fast user-based CF...


In [34]:
# ----------------------------
# ROC Curve & Confusion Matrix for all models
# ----------------------------
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_curve, auc, confusion_matrix

# List of models and predictions
models_info = [
    {
        "name": "Popularity Baseline",
        "y_true": test_df['liked'],
        "y_pred": np.full(len(test_df), most_common_label),
        "y_score": np.full(len(test_df), train_df['liked'].mean())  # probability estimate
    },
    {
        "name": "User-CF",
        "y_true": test_df['liked'],
        "y_pred": user_cf_pred_labels,
        "y_score": user_cf_pred_ratings
    },
    # Add other models here if available, e.g., RandomForest, SVD
    # Example for RandomForest:
    # {
    #     "name": "RandomForest",
    #     "y_true": y_test,
    #     "y_pred": rf_preds,
    #     "y_score": rf_probs
    # },
]

for model in models_info:
    name = model['name']
    y_true = model['y_true']
    y_pred = model['y_pred']
    y_score = model['y_score']

    # --- ROC Curve ---
    fpr, tpr, _ = roc_curve(y_true, y_score)
    roc_auc = auc(fpr, tpr)
    plt.figure()
    plt.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (AUC = {roc_auc:.3f})')
    plt.plot([0, 1], [0, 1], color='gray', lw=1, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'{name} ROC Curve')
    plt.legend(loc="lower right")
    plt.savefig(os.path.join(OUTPUT_DIR, f'roc_{name.replace(" ", "_").lower()}.png'))
    plt.close()

    # --- Confusion Matrix ---
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(5, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(f'{name} Confusion Matrix')
    plt.savefig(os.path.join(OUTPUT_DIR, f'cm_{name.replace(" ", "_").lower()}.png'))
    plt.close()

print("âœ… ROC curves and confusion matrices saved for all models.")


âœ… ROC curves and confusion matrices saved for all models.


In [30]:
# ----------------------
# Comparison and summary
# ----------------------
print("Compiling results and saving summary...")

summary_rows = []
for r in results:
    summary_rows.append({
        'model': r['model'],
        'accuracy': r['accuracy'],
        'precision': r['precision'],
        'recall': r['recall'],
        'f1': r['f1'],
        'auc': r['auc'] if r['auc'] is not None else np.nan,
    })
summary_df = pd.DataFrame(summary_rows).sort_values(by='f1', ascending=False)
summary_df.to_csv(os.path.join(OUTPUT_DIR, 'model_comparison.csv'), index=False)

plt.figure(figsize=(8, 4))
summary_df.set_index('model')[['accuracy', 'precision', 'recall', 'f1', 'auc']].plot(kind='bar')
plt.title('Model comparison')
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, 'model_comparison.png'))
plt.close()

print("All outputs saved to the 'outputs' folder.")
print(summary_df)

# End of script
print('Done')


Compiling results and saving summary...
All outputs saved to the 'outputs' folder.
        model  accuracy  precision    recall        f1       auc
0  popularity  0.575159   0.575159  1.000000  0.730287  0.500000
1     user_cf  0.671964   0.698628  0.755615  0.726005  0.717798
2     user_cf  0.671964   0.698628  0.755615  0.726005  0.717798
Done


<Figure size 800x400 with 0 Axes>