<a href="https://colab.research.google.com/github/Gireesha07/data-science-project/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# complete_recommender_autodetect_safe.py
import os
import glob
import pandas as pd
import numpy as np
from scipy.sparse.linalg import svds
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from math import sqrt
import warnings
# Silence library warnings and pin NumPy's global RNG so every random draw
# further down is repeatable between runs.
warnings.filterwarnings("ignore")
np.random.seed(42)

# ---------------------------
# Search for dataset files in common locations
# ---------------------------
search_dirs = [".", "/mnt/data"]
candidates = []
for d in search_dirs:
    if os.path.exists(d):
        # Within each folder: CSV files first, then .xlsx, then .xls.
        candidates += [
            path
            for pattern in ("*.csv", "*.xlsx", "*.xls")
            for path in glob.glob(os.path.join(d, pattern))
        ]

print("Searched folders:", search_dirs)
print("Found files:")
for f in candidates:
    print(" ", f)

# ---------------------------
# Heuristic detection of file role by sampling column names
# ---------------------------
def detect_role(path):
    """Guess which dataset role a file plays from its column names.

    Only the header and up to three data rows are read. Column labels are
    compared case-insensitively with spaces removed.

    Args:
        path: Path to a .csv, .xls or .xlsx file.

    Returns:
        "ratings", "movies" or "users" for the first rule that matches
        (checked in that order), or None when nothing matches or the file
        cannot be read.
    """
    try:
        if path.lower().endswith((".xls", ".xlsx")):
            df = pd.read_excel(path, nrows=3)
        else:
            df = pd.read_csv(path, nrows=3)
    except Exception:
        # Best effort: an unreadable candidate is simply left unclassified.
        return None

    # Spreadsheet headers can be numbers or dates rather than text, so coerce
    # every label to str before normalising instead of crashing on .lower().
    cols = {str(c).lower().replace(" ", "") for c in df.columns}

    def has_any(*names):
        return not cols.isdisjoint(names)

    has_user = has_any("userid", "user_id", "user")
    if has_user and has_any("movieid", "movie_id", "movie") and has_any("rating", "score"):
        return "ratings"
    if has_any("movieid", "movie_id", "id") and has_any("title", "name"):
        return "movies"
    if has_user and has_any("name", "location", "city", "age"):
        return "users"
    return None

# One slot per role; the first candidate whose columns match a role claims it.
detected = dict.fromkeys(("movies", "ratings", "users"))
for f in candidates:
    role = detect_role(f)
    if not role or detected.get(role) is not None:
        continue
    detected[role] = f

# fallback name-based heuristics
if detected["ratings"] is None:
    for f in candidates:
        name = os.path.basename(f).lower()
        # "rating" as a substring already covers names that start with
        # "rating" or "ratings", so only the other prefixes need startswith.
        if "rating" in name or name.startswith(("interactions", "data")):
            detected["ratings"] = f
            break
if detected["movies"] is None:
    for f in candidates:
        name = os.path.basename(f).lower()
        # Likewise "movie" as a substring covers the "movie"/"movies" prefixes.
        if "movie" in name or name.startswith("titles"):
            detected["movies"] = f
            break

print("\nAuto-detected files:")
print(" movies:", detected["movies"])
print(" ratings:", detected["ratings"])
print(" users:", detected["users"])

# Movies and ratings are both mandatory; the users file is optional.
if None in (detected["movies"], detected["ratings"]):
    raise SystemExit("Could not automatically find both movies & ratings files. Place them in current dir or /mnt/data and name them clearly (movies.csv / ratings.csv).")

# ---------------------------
# Safe load function
# ---------------------------
def safe_load(path):
    """Read the whole file at *path* into a DataFrame.

    Paths ending in .xls/.xlsx (case-insensitive) go through
    ``pd.read_excel``; everything else is read as CSV.
    """
    is_excel = path.lower().endswith((".xls", ".xlsx"))
    reader = pd.read_excel if is_excel else pd.read_csv
    return reader(path)

# Movies and ratings are guaranteed to be set at this point; users is optional.
movies = safe_load(detected["movies"])
ratings = safe_load(detected["ratings"])
users = None
if detected["users"]:
    users = safe_load(detected["users"])

# getattr keeps the report working when users is None.
print(
    "\nLoaded shapes -> movies:", getattr(movies, "shape", None),
    "ratings:", getattr(ratings, "shape", None),
    "users:", getattr(users, "shape", None),
)

# ---------------------------
# Normalize column names
# ---------------------------
def normalize_cols(df):
    """Return a copy of *df* with recognised column aliases renamed.

    String labels are stripped of surrounding whitespace, then matched
    case-insensitively (spaces removed) against known aliases and renamed to
    one of: userId, movieId, rating, timestamp, title, genres, location.

    Non-string labels (e.g. numeric spreadsheet headers) are left untouched.
    When several columns map to the same canonical name, a column already
    carrying that exact name wins, otherwise the first alias wins; the others
    keep their original labels so the result never gains duplicate labels.

    Args:
        df: Input DataFrame; it is not modified.

    Returns:
        A new DataFrame with renamed columns.
    """
    aliases = {
        "userid": "userId", "user_id": "userId", "user": "userId",
        "movieid": "movieId", "movie_id": "movieId", "movie": "movieId",
        "rating": "rating", "ratings": "rating", "score": "rating",
        "timestamp": "timestamp", "time": "timestamp", "ts": "timestamp", "date": "timestamp",
        "title": "title", "name": "title",
        "genres": "genres", "genre": "genres",
        "location": "location", "city": "location",
    }
    canonical = set(aliases.values())

    df = df.copy()
    # Only text labels have .strip(); anything else passes through unchanged.
    df.columns = [c.strip() if isinstance(c, str) else c for c in df.columns]

    # Canonical names that are already present must not be claimed by an alias.
    taken = {c for c in df.columns if isinstance(c, str) and c in canonical}
    colmap = {}
    for c in df.columns:
        if not isinstance(c, str):
            continue
        target = aliases.get(c.lower().replace(" ", ""))
        if target is None or target == c or target in taken:
            continue
        taken.add(target)
        colmap[c] = target
    return df.rename(columns=colmap)

movies = normalize_cols(movies)
ratings = normalize_cols(ratings)
# The users table is optional and stays None when no file was detected.
users = normalize_cols(users) if users is not None else users

# Keep only relevant ratings columns
ratings = ratings.loc[
    :, [c for c in ("userId", "movieId", "rating", "timestamp") if c in ratings.columns]
].copy()

# ---------------------------
# Safe timestamp handling
# ---------------------------
def _to_unix_seconds(values):
    """Parse *values* as datetimes and return whole seconds since 1970-01-01 UTC (NaN where unparseable)."""
    # utc=True accepts timezone-aware input and treats naive input as UTC, so
    # the subtraction below never mixes naive and aware timestamps.
    parsed = pd.to_datetime(values, errors="coerce", utc=True)
    return (parsed - pd.Timestamp("1970-01-01", tz="UTC")) // pd.Timedelta("1s")


def ensure_timestamp(df, col="timestamp"):
    """Make sure ``df[col]`` exists and holds numeric Unix-second timestamps.

    * Column missing: it is created with random integers drawn from
      [1577836800, 1704067200) via NumPy's global RNG.
    * Integer or float column: returned unchanged.
    * Datetime column: converted to seconds since the epoch.
    * Anything else (typically text): values that parse as numbers are kept
      as numbers, the rest are parsed as dates; unparseable values become
      NaN for the caller to handle.

    Note that number-like text is always treated as seconds, matching how a
    natively numeric column is handled.

    Args:
        df: DataFrame to update; it is modified in place.
        col: Name of the timestamp column.

    Returns:
        The same DataFrame object, for chaining.
    """
    if col not in df.columns:
        df[col] = np.random.randint(1577836800, 1704067200, size=len(df))
        return df

    values = df[col]
    # if numeric already, keep
    if pd.api.types.is_integer_dtype(values) or pd.api.types.is_float_dtype(values):
        return df

    # Real datetimes must not go through to_numeric, which would not yield seconds.
    if pd.api.types.is_datetime64_any_dtype(values):
        df[col] = _to_unix_seconds(values)
        return df

    # Numbers stored as text (e.g. a column forced to object dtype by one bad
    # cell) are kept as-is; only the remaining values are parsed as dates.
    numeric = pd.to_numeric(values, errors="coerce")
    unparsed = numeric.isna()
    if unparsed.any():
        numeric = numeric.astype(float)
        numeric.loc[unparsed] = _to_unix_seconds(values[unparsed])
    df[col] = numeric
    return df

ratings = ensure_timestamp(ratings, "timestamp")
# Anything still non-numeric becomes NaN here and is counted as missing.
ratings["timestamp"] = pd.to_numeric(ratings["timestamp"], errors="coerce")
missing_ts = ratings["timestamp"].isna().sum()
if missing_ts:
    print(f"Filling {missing_ts} missing/invalid timestamps with random values.")
    ratings.loc[ratings["timestamp"].isna(), "timestamp"] = np.random.randint(
        1577836800, 1704067200, size=missing_ts
    )
ratings["timestamp"] = ratings["timestamp"].astype(int)

print("\nRatings preview:", ratings.head(), sep="\n")

# ---------------------------
# Ensure movies mapping exists
# ---------------------------
# Without both an id and a title column, synthesise a "Movie <id>" title for
# every movie id that appears in the ratings.
if not {"movieId", "title"}.issubset(movies.columns):
    movies = pd.DataFrame({"movieId": sorted(ratings["movieId"].unique())})
    movies["title"] = movies["movieId"].map(lambda x: f"Movie {int(x)}")

# movieId -> title lookup used when printing recommendations.
movie_titles = movies.set_index("movieId")["title"].to_dict()

# ---------------------------
# Train/test split (time-based)
# ---------------------------
# Oldest 80% of the ratings train the models; the newest 20% are held out.
ratings = ratings.sort_values("timestamp", ignore_index=True)
split_idx = int(len(ratings) * 0.8)
train, test = ratings.iloc[:split_idx].copy(), ratings.iloc[split_idx:].copy()
print(f"\nTrain size: {train.shape}, Test size: {test.shape}")

# ---------------------------
# Popularity baseline
# ---------------------------
# Movie ids ordered from most to least rated in the training split.
popularity_rank = (
    train.groupby("movieId")
    .size()
    .sort_values(ascending=False)
    .index.tolist()
)


def recommend_popular(user_id, train_df=train, N=10):
    """Return the N most-rated movies that *user_id* has not rated in *train_df*.

    The ranking comes from the module-level ``popularity_rank`` list.
    """
    already_rated = set(train_df.loc[train_df.userId == user_id, "movieId"].tolist())
    unseen = [movie for movie in popularity_rank if movie not in already_rated]
    return unseen[:N]

# ---------------------------
# Build item-user and item similarity (full or reduced) safely
# ---------------------------
print("\nBuilding item-user matrix...")
# Rows are movies, columns are users; pairs with no rating are filled with 0.
item_user = train.pivot_table(index="movieId", columns="userId", values="rating").fillna(0)
item_ids = item_user.index.tolist()
item_index = dict(zip(item_ids, range(len(item_ids))))

max_items_for_dense = 5000
reduced_items = None
reduced_index = None
# Above the threshold, similarity is computed only among the most popular movies.
use_reduced = item_user.shape[0] > max_items_for_dense

if use_reduced:
    # reduced top-k similarity
    topk = 2000
    reduced_items = [m for m in popularity_rank[:topk] if m in item_index]
    print(f"Computing REDUCED item-item similarity on top-{len(reduced_items)} items.")
    reduced_matrix = item_user.loc[reduced_items].values
    item_sim = cosine_similarity(reduced_matrix)
    reduced_index = dict(zip(reduced_items, range(len(reduced_items))))
else:
    print("Computing FULL item-item cosine similarity (dense).")
    item_sim = cosine_similarity(item_user.values)

# ---------------------------
# Safe item-CF recommender
# ---------------------------
def recommend_item_cf(user_id, train_df=train, N=10):
    """Recommend up to N unseen movies with item-based collaborative filtering.

    Each movie the user rated adds ``similarity * (rating - 3.0)`` to the
    score of every other movie, so ratings above 3 pull similar movies up
    and ratings below 3 push them down. Candidates are ranked by total
    score, highest first; ties keep the order of the similarity matrix rows.

    The similarity data comes from module-level state: ``item_sim`` indexed
    through ``item_index``/``item_ids``, or through
    ``reduced_index``/``reduced_items`` when ``use_reduced`` is set. In
    reduced mode, rated movies outside the reduced set contribute nothing.

    Args:
        user_id: User to recommend for.
        train_df: Ratings used to find what the user has already rated.
        N: Number of recommendations wanted.

    Returns:
        A list of movie ids. Users with no ratings get the popularity
        baseline; short lists are topped up from ``popularity_rank``.
    """
    user_ratings = train_df[train_df.userId == user_id][['movieId', 'rating']]
    if user_ratings.empty:
        return recommend_popular(user_id, train_df, N)

    seen = set(user_ratings.movieId.tolist())

    if use_reduced:
        sim_row_of, target_ids = reduced_index, reduced_items
    else:
        sim_row_of, target_ids = item_index, item_ids

    # Accumulate whole similarity rows with NumPy instead of visiting every
    # (rated movie, candidate) pair in Python. The additions happen in the
    # same order as a per-element loop, so the totals are identical.
    totals = np.zeros(len(target_ids), dtype=float)
    contributed = False
    for mid, r in zip(user_ratings.movieId.tolist(), user_ratings.rating.tolist()):
        sim_row = sim_row_of.get(mid)
        if sim_row is None:
            # Rated movie has no row in the similarity matrix in use.
            continue
        totals += item_sim[sim_row] * (float(r) - 3.0)
        contributed = True

    recs = []
    if contributed:
        unseen_pos = np.array(
            [j for j, m in enumerate(target_ids) if m not in seen], dtype=int
        )
        # A stable sort on the negated totals ranks high scores first while
        # leaving tied candidates in matrix order.
        order = np.argsort(-totals[unseen_pos], kind="stable")
        recs = [target_ids[j] for j in unseen_pos[order][:N]]

    if len(recs) < N:
        # Top up with popular movies the user has not rated.
        for m in popularity_rank:
            if m not in seen and m not in recs:
                recs.append(m)
            if len(recs) >= N:
                break
    return recs

# ---------------------------
# SVD Matrix Factorization
# ---------------------------
print("\nBuilding user-item matrix for SVD...")
# Rows are users, columns are movies; pairs with no training rating are filled with 0.
user_item = train.pivot_table(index='userId', columns='movieId', values='rating').fillna(0)
R = user_item.values
print("user-item shape:", R.shape)

# Centre each user's row on its own mean. The mean is taken across every
# column, so the zero-filled (unrated) cells are part of it.
R_mean = np.mean(R, axis=1)
R_demeaned = R - R_mean.reshape(-1,1)
# Number of latent factors: at most 50, and always below the smaller matrix dimension.
k = min(50, min(R_demeaned.shape)-1)
if k <= 0:
    raise SystemExit("Not enough data for SVD. Need more users/movies.")
print("Using k =", k)
U, sigma, Vt = svds(R_demeaned, k=k)
# np.diag turns the singular values into a diagonal matrix for the product below.
sigma = np.diag(sigma)
# Rank-k reconstruction of the centred matrix, with each user's mean added back.
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + R_mean.reshape(-1,1)
# Same row/column labels as user_item, so predictions can be looked up by userId and movieId.
preds_df = pd.DataFrame(all_user_predicted_ratings, index=user_item.index, columns=user_item.columns)
print("Predictions shape:", preds_df.shape)

def recommend_svd(user_id, N=10, preds=preds_df, train_df=train):
    """Recommend up to N unseen movies ranked by predicted rating.

    Users with no row in *preds* get the popularity baseline. When fewer
    than N candidates remain after dropping movies the user rated in
    *train_df*, the list is topped up from ``popularity_rank``.
    """
    if user_id not in preds.index:
        return recommend_popular(user_id, train_df, N)

    seen = set(train_df[train_df.userId == user_id].movieId.tolist())
    by_prediction = preds.loc[user_id].sort_values(ascending=False).index
    recs = [movie for movie in by_prediction if movie not in seen][:N]

    # Popularity fallback; the length check up front makes this a no-op when
    # the list is already long enough.
    for movie in popularity_rank:
        if len(recs) >= N:
            break
        if movie not in seen and movie not in recs:
            recs.append(movie)
    return recs

# ---------------------------
# Evaluation functions
# ---------------------------
def precision_recall_at_k(recommended, relevant, k=10):
    """Compute precision@k and recall@k for one recommendation list.

    Args:
        recommended: Ranked item ids, best first.
        relevant: Item ids counted as hits; duplicates are ignored.
        k: Cut-off rank.

    Returns:
        ``(precision, recall)``. Precision is hits divided by k, even when
        fewer than k items were recommended. Recall is hits divided by the
        number of distinct relevant items. Both are 0.0 when nothing was
        recommended or k is not positive; recall is 0.0 when nothing is
        relevant.
    """
    # A non-positive cut-off would otherwise divide by zero (or go negative).
    if k <= 0 or len(recommended) == 0:
        return 0.0, 0.0
    relevant_set = set(relevant)
    hits = set(recommended[:k]) & relevant_set
    precision = len(hits) / k
    # Hits are distinct, so the denominator counts distinct relevant items too.
    recall = len(hits) / len(relevant_set) if relevant_set else 0.0
    return precision, recall

# ---------------------------
# Quick evaluation on sample users
# ---------------------------
print("\nEvaluating on sample test users...")
test_users_all = test.userId.unique()
n_sample = min(500, len(test_users_all))
test_users = np.random.choice(test_users_all, size=n_sample, replace=False)

# Per-user precision/recall for each recommender.
prec_pop, rec_pop = [], []
prec_item, rec_item = [], []
prec_svd, rec_svd = [], []

# Paired predicted/true ratings for the SVD RMSE.
rmse_preds, rmse_truth = [], []

for uid in test_users:
    user_test = test[test.userId == uid]
    # Held-out movies the user rated 4.0 or higher count as relevant.
    relevant = user_test[user_test.rating >= 4.0].movieId.tolist()

    recs_pop = recommend_popular(uid)
    recs_item = recommend_item_cf(uid)
    recs_svd = recommend_svd(uid)

    p, r = precision_recall_at_k(recs_pop, relevant)
    prec_pop.append(p)
    rec_pop.append(r)
    p, r = precision_recall_at_k(recs_item, relevant)
    prec_item.append(p)
    rec_item.append(r)
    p, r = precision_recall_at_k(recs_svd, relevant)
    prec_svd.append(p)
    rec_svd.append(r)

    # RMSE
    if uid not in preds_df.index:
        # No prediction row for this user, so nothing to compare.
        continue
    for mid, true_r in zip(user_test.movieId, user_test.rating):
        if mid in preds_df.columns:
            rmse_preds.append(preds_df.loc[uid, mid])
            rmse_truth.append(true_r)

def avg(a):
    """Return the mean of *a*, or 0.0 when *a* is empty."""
    if len(a) > 0:
        return np.mean(a)
    return 0.0
# Averages over the sampled users; svd_rmse is None when no test rating had a
# matching SVD prediction.
metrics = {
    "pop_prec@10": avg(prec_pop), "pop_rec@10": avg(rec_pop),
    "item_prec@10": avg(prec_item), "item_rec@10": avg(rec_item),
    "svd_prec@10": avg(prec_svd), "svd_rec@10": avg(rec_svd),
    "svd_rmse": sqrt(mean_squared_error(rmse_truth, rmse_preds)) if len(rmse_truth)>0 else None
}

print("\nMetrics summary:")
# Descriptive loop names: a bare `k` here would overwrite the module-level
# SVD rank `k`.
for metric_name, metric_value in metrics.items():
    print(f"{metric_name}: {metric_value}")

# ---------------------------
# Print sample recommendations
# ---------------------------
def title_for(m):
    """Return the title stored for movie id *m*, or ``str(m)`` when none is known."""
    try:
        return movie_titles[m]
    except KeyError:
        return str(m)

print("\nSample SVD recommendations for 5 random test users:")
_sample_pool = test.userId.unique()
sample_u = np.random.choice(_sample_pool, size=min(5, len(_sample_pool)), replace=False)
for uid in sample_u:
    recs = recommend_svd(uid, N=10)
    print(f"User {uid} ->", [title_for(m) for m in recs])

# Save preds
try:
    preds_df.to_csv("svd_predicted_ratings_autodetect.csv")
except Exception as e:
    # Saving is best effort; report the problem and carry on.
    print("Could not save predictions:", e)
else:
    print("\nSaved preds to svd_predicted_ratings_autodetect.csv")


Searched folders: ['.', '/mnt/data']
Found files:
  ./ratings.csv
  ./movies.csv

Auto-detected files:
 movies: ./movies.csv
 ratings: ./ratings.csv
 users: None

Loaded shapes -> movies: (62423, 3) ratings: (1743832, 4) users: None
Filling 1 missing/invalid timestamps with random values.

Ratings preview:
   userId  movieId  rating   timestamp
0       1      296     5.0  1147880044
1       1      306     3.5  1147868817
2       1      307     5.0  1147868828
3       1      665     5.0  1147878820
4       1      899     3.5  1147868510

Train size: (1395065, 4), Test size: (348767, 4)

Building item-user matrix...
Computing REDUCED item-item similarity on top-2000 items.

Building user-item matrix for SVD...
user-item shape: (9845, 18315)
Using k = 50
Predictions shape: (9845, 18315)

Evaluating on sample test users...
