In [None]:
!pip install implicit

Collecting implicit
  Using cached implicit-0.7.2.tar.gz (70 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: implicit
  Building wheel for implicit (pyproject.toml) ... [?25l[?25hdone
  Created wheel for implicit: filename=implicit-0.7.2-cp312-cp312-linux_x86_64.whl size=10797946 sha256=e5bc1496bf153619df9fa683a262e1a014e317a2cd9731b177bfb67ed9a7ff9c
  Stored in directory: /root/.cache/pip/wheels/b2/00/4f/9ff8af07a0a53ac6007ea5d739da19cfe147a2df542b6899f8
Successfully built implicit
Installing collected packages: implicit
Successfully installed implicit-0.7.2


In [None]:
# Imports
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from implicit.als import AlternatingLeastSquares
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import logging

import boto3
from sagemaker import get_execution_role

ModuleNotFoundError: No module named 'implicit'

# DATA

In [None]:
# Root logger: INFO level, every message prefixed with a timestamp.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')

# Bucket that holds the raw CSV files, plus a client used to list its objects.
# NOTE(review): `contnet` (sic) is not read by any cell visible in this
# notebook; the name is kept as-is in case something else refers to it.
bucket_name = 'ecomrecdata'
conn = boto3.client('s3')
contnet = conn.list_objects(Bucket=bucket_name)

# Raw inputs, each read through an s3:// URL under the bucket's data/ prefix.
category_tree = pd.read_csv(f's3://{bucket_name}/data/category_tree.csv')
events = pd.read_csv(f's3://{bucket_name}/data/events.csv')
item_props1 = pd.read_csv(f's3://{bucket_name}/data/item_properties_part1.csv')
item_props2 = pd.read_csv(f's3://{bucket_name}/data/item_properties_part2.csv')

# Work on a reproducible 10% random sample of the events; the shape is logged
# before and after so the reduction is visible in the output.
logging.info(f"Original events shape: {events.shape}")
events = events.sample(random_state=42, frac=0.1)
logging.info(f"Sampled events shape: {events.shape}")


In [None]:
# 2. Combine & Clean
# The item properties arrive in two files; stack them into one frame.
item_properties = pd.concat([item_props1, item_props2], ignore_index=True)

# Drop an event only when a column this notebook actually reads is missing.
# A bare `dropna()` discards a row if *any* column is NaN, including columns
# the pipeline never touches (e.g. a transaction-id column that is presumably
# only populated for purchases -- TODO confirm against events.csv), which
# would silently remove most non-purchase interactions.
required_event_cols = ['timestamp', 'visitorid', 'event', 'itemid']
events = events.dropna(subset=required_event_cols)

# Parse timestamps; values that cannot be parsed become NaT and are dropped.
# NOTE(review): if `timestamp` holds epoch milliseconds, pass unit='ms' here --
# integers are interpreted as nanoseconds by default. Confirm the unit.
events = (
    events
    .assign(timestamp=pd.to_datetime(events['timestamp'], errors='coerce'))
    .dropna(subset=['timestamp'])
)

In [None]:
# 3. Keep only visitors and items that have enough interactions.
min_user_events = 5
min_item_events = 5

# Both tallies are taken up front, before any row is removed, so the item
# threshold is evaluated against the unfiltered event counts.
user_counts = events['visitorid'].value_counts()
item_counts = events['itemid'].value_counts()

active_users = user_counts.loc[user_counts.ge(min_user_events)].index
popular_items = item_counts.loc[item_counts.ge(min_item_events)].index

# A row survives only if its visitor AND its item both clear their threshold.
keep_rows = events['visitorid'].isin(active_users) & events['itemid'].isin(popular_items)
events = events[keep_rows]
logging.info(f"Filtered events shape: {events.shape}")

In [None]:
# 4. Add weight column based on event type
# Stronger purchase intent gets a larger implicit-feedback weight.
event_weights = {'view': 1, 'addtocart': 3, 'transaction': 5}

# `assign` returns a new frame instead of writing a column into `events`,
# which at this point is the result of boolean filtering; writing into such a
# frame in place is what triggers pandas' SettingWithCopyWarning.
events = events.assign(weight=events['event'].map(event_weights))

# Event types missing from `event_weights` map to NaN; surface that instead
# of letting those rows pass through unnoticed.
unmapped_events = int(events['weight'].isna().sum())
if unmapped_events:
    logging.warning(f"{unmapped_events} events have a type with no weight and were mapped to NaN")

In [None]:
# 5. User-Item Matrix
# One row per visitor, one column per item; each cell is the summed event
# weight for that pair, with 0 where the pair never occurs.
user_item_matrix = pd.pivot_table(
    events,
    values='weight',
    index='visitorid',
    columns='itemid',
    aggfunc='sum',
    fill_value=0,
)

# Sparse (CSR) copy of the same table.
user_item_sparse = csr_matrix(user_item_matrix.to_numpy())
logging.info(f"User-item matrix shape: {user_item_matrix.shape}")

# Machine Learning model

In [None]:
# 6. ALS Training
# Random 80/20 split over interaction rows (seeded for reproducibility).
train_data, test_data = train_test_split(events, test_size=0.2, random_state=42)

# Visitors x items table of summed weights for the training rows only.
train_matrix = train_data.pivot_table(
    index='visitorid',
    columns='itemid',
    values='weight',
    aggfunc='sum',
    fill_value=0,
)
train_sparse = csr_matrix(train_matrix.values)
logging.info(f"Train matrix shape: {train_matrix.shape}")

# Build user and item maps
# *_map: matrix position -> original id; *_inv_map: original id -> position.
user_map = dict(enumerate(train_matrix.index))
item_map = dict(enumerate(train_matrix.columns))
user_inv_map = {v: k for k, v in user_map.items()}
item_inv_map = {v: k for k, v in item_map.items()}

# Fit ALS
# implicit >= 0.5 expects a user x item CSR matrix in `fit`. Passing the
# transpose (the pre-0.5 convention) trains on items-as-users, so the learned
# user/item factors no longer line up with `user_inv_map` / `item_map`.
# `random_state` pins the factor initialisation so re-runs are repeatable.
model = AlternatingLeastSquares(factors=32, regularization=0.1, iterations=15, random_state=42)
model.fit(train_sparse)
logging.info("ALS model trained")

In [None]:
# 7. Create item_metadata
# Keep the most recent value of every (item, property) pair, then spread the
# properties into columns: one row per item.
item_properties = (
    item_properties
    .sort_values("timestamp")
    .drop_duplicates(subset=["itemid", "property"], keep="last")
)
item_metadata = item_properties.pivot(index='itemid', columns='property', values='value').reset_index()

# Filter item_metadata to items in train_matrix
# `.copy()` makes the filtered frame independent, so the column assignments
# below do not write into a slice of the pivoted frame.
item_metadata = item_metadata[item_metadata['itemid'].isin(train_matrix.columns)].copy()

# Guarantee the text columns exist even when a property never occurs.
text_columns = ['brand', 'categoryid', 'color']
for col in text_columns:
    if col not in item_metadata.columns:
        item_metadata[col] = ''

# `astype(str)` guards the join: `' '.join` raises TypeError as soon as one
# property value is not a string (e.g. a numeric category id).
item_metadata['text'] = (
    item_metadata[text_columns]
    .fillna('')
    .astype(str)
    .agg(' '.join, axis=1)
)

# TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
tfidf_matrix = tfidf_vectorizer.fit_transform(item_metadata['text'])

# Row i of `tfidf_matrix` describes `item_ids[i]`; the dict is the reverse lookup.
item_ids = item_metadata['itemid'].values
itemid_to_index = {itemid: idx for idx, itemid in enumerate(item_ids)}

# Saving artifacts

In [None]:
import joblib

import os

bucket_name = 'ecomrecdata'
s3_prefix = 'data'
# Local staging directory for the serialized artifacts.
local_dir = 'temp_model_files'
os.makedirs(local_dir, exist_ok=True)

# Id maps between original ids and matrix positions.
joblib.dump(user_inv_map, os.path.join(local_dir, 'user_inv_map.joblib'))
joblib.dump(item_map, os.path.join(local_dir, 'item_map.joblib'))

# Training interactions: the sparse matrix itself and the rows it was built from.
joblib.dump(train_sparse, os.path.join(local_dir, 'train_sparse.joblib'))
train_data.to_csv(os.path.join(local_dir, 'train_data.csv'), index=False)

# Content-based artifacts. The item-id -> TF-IDF-row lookup is held in
# `itemid_to_index`; the output file name is left unchanged.
joblib.dump(itemid_to_index, os.path.join(local_dir, 'item_id_index.joblib'))
joblib.dump(item_ids, os.path.join(local_dir, 'item_ids.joblib'))
joblib.dump(tfidf_matrix, os.path.join(local_dir, 'tfidf_matrix.joblib'))

In [None]:
import boto3
import os
import joblib
import pandas as pd

# Upload settings, passed to upload_local_directory_to_s3 below.
# `bucket_name` and `s3_prefix` repeat the values assigned in earlier cells.
bucket_name = 'ecomrecdata'
s3_prefix = 'data'
# Local directory whose files are uploaded.
local_dir = 'temp_model_files'

def upload_local_directory_to_s3(local_directory, bucket, s3_prefix):
    """
    Upload every file found under a local directory to an S3 bucket.

    The directory is walked recursively, but each object key is built from
    `s3_prefix` and the bare file name only, so files from sub-directories
    end up side by side under the same prefix (same-named files overwrite
    one another).

    Uploads are best-effort: a failure is printed and the remaining files are
    still attempted. Nothing is returned and no exception is propagated for
    an individual failed upload.

    Args:
        local_directory: Path of the directory to upload.
        bucket: Name of the destination S3 bucket.
        s3_prefix: Key prefix the file names are appended to.
    """
    # S3 keys always use '/' as separator; os.path.join would insert the
    # platform separator (a backslash on Windows) into the key.
    import posixpath

    s3 = boto3.client('s3')

    for root, _dirs, files in os.walk(local_directory):
        for filename in files:
            local_path = os.path.join(root, filename)
            s3_key = posixpath.join(s3_prefix, filename)

            print(f"Uploading {local_path} to s3://{bucket}/{s3_key}...")

            try:
                s3.upload_file(local_path, bucket, s3_key)
                print(f"Successfully uploaded: {filename}")
            except Exception as e:
                # Deliberately broad: report the failure and keep going.
                print(f"Failed to upload {filename}. Error: {e}")

# Push everything in `local_dir` to s3://<bucket_name>/<s3_prefix>/.
upload_local_directory_to_s3(local_dir, bucket_name, s3_prefix)
