In [14]:
# In[1]: Setup
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Display options for comfort
pd.set_option("display.max_rows", 50)
pd.set_option("display.max_columns", 50)
pd.set_option("display.width", 120)

DATA_PATH = "../data/cleaned_data.csv"  # assumes your cleaned file is here

# In[2]: Load cleaned data
df = pd.read_csv(DATA_PATH)

# Defensive checks — your cleaner should already guarantee these, but just in case:
required_cols = {"InvoiceNo", "InvoiceDate", "CustomerID", "StockCode", "Description", "Quantity", "UnitPrice"}
missing = required_cols - set(df.columns)
if missing:
    raise ValueError(f"Dataset missing required columns: {missing}")

# Ensure types
df["CustomerID"] = df["CustomerID"].astype(str).str.replace(r"\.0$", "", regex=True)
df["StockCode"] = df["StockCode"].astype(str)
df["InvoiceDate"] = pd.to_datetime(df["InvoiceDate"], errors="coerce")
df = df.dropna(subset=["InvoiceDate"])

# If TotalPrice is missing, compute it
if "TotalPrice" not in df.columns:
    df["TotalPrice"] = df["Quantity"] * df["UnitPrice"]

print(df.head(3))
print(f"\nRows: {len(df):,}  |  Customers: {df['CustomerID'].nunique():,}  |  Items: {df['StockCode'].nunique():,}")

# In[3]: Build user–item interaction matrix
# Signal choice: total quantity purchased per item
user_item_matrix = df.pivot_table(
    index="CustomerID",
    columns="StockCode",
    values="Quantity",
    aggfunc="sum",
    fill_value=0,
).astype(float)

user_item_matrix.shape

# In[4]: Compute user–user cosine similarity
# Note: cosine_similarity expects a 2D array; we pass the dense values of the matrix.
similarity = cosine_similarity(user_item_matrix.values)
similarity_df = pd.DataFrame(
    similarity, index=user_item_matrix.index, columns=user_item_matrix.index
)

# Sanity check: similarity is symmetric, diagonal ~ 1.0
(similarity_df.values.diagonal()[:5], similarity_df.shape)

# In[5]: Helper — get top similar users for a given user
def top_similar_users(user_id: str, k: int = 10) -> pd.Series:
    """
    Return top-k most similar users (excluding the user).
    """
    uid = str(user_id).replace(".0", "")
    if uid not in similarity_df.index:
        return pd.Series(dtype=float, name="similarity")

    sims = similarity_df.loc[uid].drop(uid, errors="ignore")
    return sims.sort_values(ascending=False).head(k)

# In[6]: Recommender — user-based CF with weighted scores
def recommend_for(user_id: str, top_n: int = 10, min_sim_users: int = 1) -> pd.DataFrame:
    uid = str(user_id).replace(".0", "")
    if uid not in user_item_matrix.index:
        return pd.DataFrame({"Message": [f"Customer {uid} not found."]})

    # 1) similarity vector aligned to all UIM users
    sim_vec = similarity_df.loc[uid].reindex(user_item_matrix.index).fillna(0.0)

    # 2) drop self and keep positives
    sim_vec = sim_vec.drop(uid, errors="ignore")
    positive_sims = sim_vec[sim_vec > 0]

    if len(positive_sims) < min_sim_users or positive_sims.sum() == 0:
        return pd.DataFrame({"Message": [f"No similar users found for {uid}."]})

    # 3) *** CRUCIAL: reindex to EXACTLY the UIM index ***
    aligned = positive_sims.reindex(user_item_matrix.index).fillna(0.0)

    # 4) score (either Pandas or NumPy version)
    weighted_scores = user_item_matrix.T.dot(aligned) / aligned.sum()
    # NumPy alternative:
    # scores_vals = user_item_matrix.T.values.dot(aligned.values) / aligned.sum()
    # weighted_scores = pd.Series(scores_vals, index=user_item_matrix.columns)

    # 5) remove items already bought
    purchased = user_item_matrix.loc[uid]
    weighted_scores = weighted_scores.drop(index=purchased[purchased > 0].index, errors="ignore")

    if weighted_scores.empty:
        return pd.DataFrame({"Message": [f"No unseen items to recommend for {uid}."]})

    meta = (df[["StockCode","Description"]].drop_duplicates("StockCode").set_index("StockCode"))
    recs = (pd.DataFrame({"StockCode": weighted_scores.index, "Estimated Score": weighted_scores.values})
              .join(meta, on="StockCode")
              .sort_values("Estimated Score", ascending=False)
              .head(top_n)
              .reset_index(drop=True))
    if "Description" not in recs.columns:
        recs["Description"] = "N/A"
    return recs[["StockCode","Description","Estimated Score"]]

# In[7]: (Optional) Explainer — show which users drove the recommendation
def explain_user_influence(user_id: str, k_users: int = 10) -> pd.DataFrame:
    """
    Show top-k similar users and their similarity scores.
    Useful to debug / explain why recommendations were proposed.
    """
    uid = str(user_id).replace(".0", "")
    sims = top_similar_users(uid, k=k_users)
    return sims.reset_index().rename(columns={"index": "SimilarUserID", uid: "Similarity"}).rename(columns={0: "Similarity"})

# In[8]: Try it out — change '17850' to any ID in your dataset
TEST_USER = "17850"

print("Top similar users:")
display(explain_user_influence(TEST_USER, k_users=10))

print("\nTop recommendations:")
display(recommend_for(TEST_USER, top_n=10))

# In[9]: (Optional) Save artifacts for inspection or downstream use
# These can be big — use compressed formats where possible.

# Save user–item matrix
user_item_matrix.to_csv("artifacts/user_item_matrix.csv.gz", compression="gzip")

# Save similarity matrix (can be large; Parquet is more compact than CSV)
try:
    similarity_df.to_parquet("artifacts/user_similarity.parquet")
except Exception:
    # Fall back to CSV if parquet engine isn't available
    similarity_df.to_csv("artifacts/user_similarity.csv.gz", compression="gzip")

print("✅ Saved artifacts in artifacts/")

# In[10]: (Optional) Quick health checks
assert user_item_matrix.index.equals(similarity_df.index)
assert user_item_matrix.index.equals(similarity_df.columns)
print("✅ Indices/columns aligned for dot products")
print("UIM shape:", user_item_matrix.shape, " | Sim shape:", similarity_df.shape)

   InvoiceNo StockCode                         Description  Quantity         InvoiceDate  UnitPrice CustomerID  \
0     536365    85123A  WHITE HANGING HEART T-LIGHT HOLDER         6 2010-12-01 08:26:00       2.55      17850   
1     536365     71053                 WHITE METAL LANTERN         6 2010-12-01 08:26:00       3.39      17850   
2     536365    84406B      CREAM CUPID HEARTS COAT HANGER         8 2010-12-01 08:26:00       2.75      17850   

          Country  TotalPrice  
0  United Kingdom       15.30  
1  United Kingdom       20.34  
2  United Kingdom       22.00  

Rows: 397,924  |  Customers: 4,339  |  Items: 3,665
Top similar users:


Unnamed: 0,CustomerID,Similarity
0,13161,0.409775
1,13831,0.360096
2,14440,0.332434
3,14209,0.314041
4,15636,0.312265
5,15021,0.307447
6,18105,0.305329
7,17616,0.30133
8,18106,0.293862
9,15044,0.291849



Top recommendations:


Unnamed: 0,StockCode,Description,Estimated Score
0,85099B,JUMBO BAG RED RETROSPOT,15.818855
1,22197,SMALL POPCORN HOLDER,13.472471
2,84879,ASSORTED COLOUR BIRD ORNAMENT,12.829928
3,22469,HEART OF WICKER SMALL,10.777115
4,21175,GIN + TONIC DIET METAL SIGN,10.074876
5,21108,FAIRY CAKE FLANNEL ASSORTED COLOUR,9.311005
6,22178,VICTORIAN GLASS HANGING T-LIGHT,8.355608
7,22470,HEART OF WICKER LARGE,7.856725
8,21733,RED HANGING HEART T-LIGHT HOLDER,7.460722
9,71477,COLOUR GLASS. STAR T-LIGHT HOLDER,6.681703


OSError: Cannot save file into a non-existent directory: 'artifacts'

Computing the msd similarity matrix...
Done computing similarity matrix.
KNN Model RMSE:
RMSE: 312.2391


312.23911795840945

Predicted TotalPrice for user 17850.0 on item 85123A: 168469.60
