In [None]:
# %% [markdown]
# # Building a Recommendation System Using CNN 
# This notebook builds a CNN-based Fashion Embedding model and integrates full MLOps monitoring:
# - MLflow for experiment tracking  
# - Prometheus & Grafana for system metrics  
# - Evidently AI for data drift reporting

# %% [markdown]
# ## Step 1: Install Dependencies
!pip install tensorflow==2.12 keras mlflow evidently prometheus_client psutil matplotlib seaborn scikit-learn pandas numpy opencv-python --quiet

# %% [markdown]
# ## Step 2: Imports and Monitoring Setup
import os
import time
import threading
import psutil
import mlflow
import mlflow.keras
from prometheus_client import Gauge, start_http_server
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import cv2
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.manifold import TSNE
from keras.applications.resnet50 import ResNet50, preprocess_input
from keras.preprocessing import image
from keras.models import Sequential
from keras.layers import GlobalMaxPooling2D

# %% [markdown]
# ## Step 3: MLflow Configuration
# MLFLOW_TRACKING_URI = "http://<YOUR-EC2-IP>:5000"  # <-- Replace with your EC2 public IP
# Tracking server used by this notebook (a local MLflow instance).
MLFLOW_TRACKING_URI = "http://127.0.0.1:5000"  # <-- local

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment("CNN_Fashion_Recommender")

# Re-running this cell (e.g. after a later cell failed before mlflow.end_run())
# would otherwise raise because the previous run is still active in this kernel.
if mlflow.active_run() is not None:
    mlflow.end_run()
mlflow.start_run(run_name="CNN_Embedding_Model")

mlflow.log_param("base_model", "ResNet50")

# %% [markdown]
# ## Step 4: Prometheus System Metrics Setup
# Start Prometheus metrics exporter on port 8000
# Binding the same port twice raises OSError, so a re-run of this cell (or a
# second cell doing the same thing in this kernel) must not abort the notebook.
try:
    start_http_server(8000)
except OSError as exc:
    print(f"ℹ️ Prometheus exporter not started on :8000 (port already in use?): {exc}")
else:
    print("✅ Prometheus metrics available at :8000")

# Gauges register themselves in the global Prometheus registry, and registering
# the same metric name twice raises ValueError; reuse the objects on a re-run.
if "cpu_usage" not in globals():
    cpu_usage = Gauge('cpu_usage_percent', 'CPU usage percentage')
if "ram_usage" not in globals():
    ram_usage = Gauge('ram_usage_percent', 'RAM usage percentage')

def monitor_system():
    """Publish CPU and RAM utilisation to the Prometheus gauges every 5 seconds.

    Loops forever; intended to be run in a background thread.
    """
    while True:
        cpu_pct = psutil.cpu_percent()
        cpu_usage.set(cpu_pct)
        ram_pct = psutil.virtual_memory().percent
        ram_usage.set(ram_pct)
        time.sleep(5)

# Start the sampler only if one is not already alive, so re-running this cell
# does not pile up duplicate background threads writing to the same gauges.
if not any(t.name == "system-monitor" for t in threading.enumerate()):
    threading.Thread(target=monitor_system, name="system-monitor", daemon=True).start()

# %% [markdown]
# ## Step 5: Data Preparation
DATASET_PATH = "/kaggle/input/fashion-product-images-dataset/fashion-dataset/fashion-dataset/"
print(os.listdir(DATASET_PATH))

# Only the first 5000 rows are read; malformed CSV lines are skipped rather than aborting the load.
df = pd.read_csv(os.path.join(DATASET_PATH, "styles.csv"), nrows=5000, on_bad_lines='skip')
# The image file name is the product id plus ".jpg" (vectorised instead of a row-wise apply).
df['image'] = df['id'].astype(str) + ".jpg"
df = df.reset_index(drop=True)
mlflow.log_param("dataset_size", len(df))
df.head()

# %% [markdown]
# ### Image Utilities
def img_path(img):
    """Return the path of image file ``img`` inside the dataset's ``images`` folder.

    Args:
        img: Image file name (e.g. ``"<id>.jpg"``).

    Returns:
        ``DATASET_PATH``/images/``img`` joined with the platform separator.
    """
    # os.path.join avoids the doubled separator that plain concatenation
    # produces when DATASET_PATH already ends with "/".
    return os.path.join(DATASET_PATH, "images", img)

def load_image(img, resized_fac=0.1):
    """Read a dataset image with OpenCV and return a rescaled copy.

    Args:
        img: Image file name, resolved through ``img_path``.
        resized_fac: Scale factor applied to both width and height.

    Returns:
        The resized image array, or an all-zero 224x224x3 ``uint8`` array when
        OpenCV cannot read the file (the placeholder is not rescaled).
    """
    pixels = cv2.imread(img_path(img))
    if pixels is None:
        # cv2.imread reports a missing/unreadable file by returning None.
        return np.zeros((224,224,3), dtype=np.uint8)
    # The array shape is (rows, cols, channels): height comes first.
    height, width = pixels.shape[:2]
    # cv2.resize takes dsize as (width, height). Clamp to at least one pixel so
    # very small images do not produce an invalid zero-sized target.
    target_size = (max(1, int(width*resized_fac)), max(1, int(height*resized_fac)))
    return cv2.resize(pixels, target_size, interpolation=cv2.INTER_AREA)

def plot_figures(figures, nrows=1, ncols=1, figsize=(8,8)):
    """Show a set of BGR images on a grid of subplots, one titled panel per image.

    Args:
        figures: Mapping of title -> image array in OpenCV BGR channel order.
        nrows: Number of subplot rows.
        ncols: Number of subplot columns.
        figsize: Figure size passed to ``plt.subplots``.
    """
    # squeeze=False always returns a 2-D array of Axes. Without it the default
    # 1x1 grid yields a bare Axes object, which has no .ravel() and would raise.
    _, axeslist = plt.subplots(ncols=ncols, nrows=nrows, figsize=figsize, squeeze=False)
    axes = axeslist.ravel()
    for ind, title in enumerate(figures):
        ax = axes[ind]
        # Matplotlib expects RGB, OpenCV images are BGR.
        ax.imshow(cv2.cvtColor(figures[title], cv2.COLOR_BGR2RGB))
        ax.set_title(title)
        ax.set_axis_off()
    plt.tight_layout()

# %% [markdown]
# ## Step 6: Model Creation
# Input resolution expected by the network; the channel count is added below.
img_width, img_height = 224, 224

# ImageNet-pretrained ResNet50 without its classification head, kept frozen.
base_model = ResNet50(
    weights='imagenet',
    include_top=False,
    input_shape=(img_width, img_height, 3),
)
base_model.trainable = False

# Stack the backbone and a global max-pooling layer to form the embedding model.
model = Sequential()
model.add(base_model)
model.add(GlobalMaxPooling2D())

model.summary()
mlflow.log_param("trainable_layers", 0)

# %% [markdown]
# ## Step 7: Embedding Extraction
def get_embedding(model, img_name):
    """Return the flattened output of ``model`` for a single dataset image.

    Args:
        model: Object whose ``predict`` accepts an image batch and a ``verbose``
            keyword (a Keras model in this notebook).
        img_name: Image file name, resolved through ``img_path``.

    Returns:
        1-D array obtained by flattening the model's prediction for the image.
    """
    img = image.load_img(img_path(img_name), target_size=(img_width, img_height))
    batch = image.img_to_array(img)
    batch = np.expand_dims(batch, axis=0)  # add the leading batch axis
    batch = preprocess_input(batch)
    # verbose=0: this function is called once per image, and the default
    # per-call progress bar would flood the notebook output.
    return model.predict(batch, verbose=0).reshape(-1)

# Smoke-test the extractor on the first catalogue row and record the vector length.
sample_img = df["image"].iloc[0]
emb = get_embedding(model, sample_img)
mlflow.log_param("embedding_dim", len(emb))
print("Embedding shape:", emb.shape)

# %% [markdown]
# ## Step 8: Compute Embeddings for Dataset
# Embed a fixed random subset of 1000 rows to keep the run time manageable.
df_sample = df.sample(n=1000, random_state=42)
df_embs = (
    df_sample['image']
    .apply(lambda img: get_embedding(model, img))
    .apply(pd.Series)  # expand each vector into one column per dimension
)

mlflow.log_metric("processed_images", df_embs.shape[0])
df_embs.to_csv("embeddings.csv", index=False)
mlflow.log_artifact("embeddings.csv")

# %% [markdown]
# ## Step 9: Compute Similarity Matrix
# Cosine similarity is one minus the pairwise cosine distance between embeddings.
cosine_sim = 1.0 - pairwise_distances(df_embs, metric='cosine')
print("Cosine similarity computed.")
mlflow.log_metric("mean_cosine_similarity", float(cosine_sim.mean()))

# %% [markdown]
# ## Step 10: Data Drift Detection with Evidently AI
import subprocess
import sys

# Split the sampled rows 70/30 into a reference set and a "current" set.
df_ref = df_sample.sample(frac=0.7, random_state=42)
df_test = df_sample.drop(df_ref.index)

report = Report(metrics=[DataDriftPreset()])
report.run(reference_data=df_ref, current_data=df_test)
report.save_html("evidently_drift_report.html")

mlflow.log_artifact("evidently_drift_report.html")

# Serve the working directory from a background process. A foreground
# `!python3 -m http.server` never returns, so nothing after it would run.
# The handle is kept so a re-run reuses the live server instead of spawning a
# second one; stop it with report_server.terminate().
if "report_server" not in globals() or report_server.poll() is not None:
    report_server = subprocess.Popen(
        [sys.executable, "-m", "http.server", "7000", "--directory", "."]
    )

# print("✅ Evidently Dashboard: http://<YOUR-EC2-IP>:7000/evidently_drift_report.html")
print("✅ Evidently Dashboard: http://127.0.0.1:7000/evidently_drift_report.html")


# %% [markdown]
# ## Step 11: Visualization of Embeddings
# A fixed seed keeps the 2-D layout (and the logged figure) reproducible between runs.
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter` - confirm
# against the installed version.
tsne = TSNE(n_components=2, verbose=0, perplexity=40, n_iter=300, random_state=42)
tsne_results = tsne.fit_transform(df_embs)
df_sample['tsne-2d-one'] = tsne_results[:,0]
df_sample['tsne-2d-two'] = tsne_results[:,1]

# Explicit figure/axes so the plot that gets saved is unambiguous.
fig, ax = plt.subplots(figsize=(12,8))
sns.scatterplot(
    x="tsne-2d-one",
    y="tsne-2d-two",
    hue="masterCategory",
    data=df_sample,
    legend="full",
    alpha=0.8,
    ax=ax,
)
ax.set_title("t-SNE projection of image embeddings by masterCategory")
fig.savefig("tsne_clusters.png")
mlflow.log_artifact("tsne_clusters.png")

# %% [markdown]
# ## Step 12: Wrap Up
# Record how many distinct master categories the loaded metadata contains, then close the run.
mlflow.log_metric("total_categories", df['masterCategory'].nunique())
mlflow.end_run()

# Local service endpoints; on EC2 replace 127.0.0.1 with the instance's public IP.
print("\n".join([
    "✅ MLflow Tracking URL: http://127.0.0.1:5000",
    "✅ Prometheus running on port 8000",
    "✅ Grafana: http://127.0.0.1:3000",
    "✅ Evidently Dashboard: http://127.0.0.1:7000/evidently_drift_report.html",
]))


In [None]:
# %% [markdown]
# # 🧥 Building a CNN-Based Fashion Recommendation System (Local MLOps)
# This notebook builds a CNN-based Fashion Embedding model with:
# - MLflow for experiment tracking (localhost:5000)
# - Prometheus & Grafana for live system metrics
# - Evidently AI for drift monitoring (localhost:7000)

# %% [markdown]
# ## Step 1: Install Dependencies
!pip install  keras mlflow evidently prometheus_client psutil matplotlib seaborn scikit-learn pandas numpy opencv-python --quiet

# %% [markdown]
# ## Step 2: Imports and Monitoring Setup
import os
import time
import threading
import psutil
import mlflow
import mlflow.keras
from prometheus_client import Gauge, start_http_server
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import cv2
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.manifold import TSNE
from keras.applications.resnet50 import ResNet50, preprocess_input
from keras.preprocessing import image
from keras.models import Sequential
from keras.layers import GlobalMaxPooling2D

# %% [markdown]
# ## Step 3: MLflow Configuration (Localhost)
MLFLOW_TRACKING_URI = "http://127.0.0.1:5000"  # Local MLflow

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment("CNN_Fashion_Recommender")

# A run left open by an earlier (failed or interrupted) execution in this kernel
# makes start_run raise; close it before opening a new one.
if mlflow.active_run() is not None:
    mlflow.end_run()
mlflow.start_run(run_name="CNN_Embedding_Model")

mlflow.log_param("base_model", "ResNet50")

# %% [markdown]
# ## Step 4: Prometheus System Metrics Setup (localhost:8000)
# The exporter may already be bound to :8000 from an earlier cell in this kernel;
# a second bind raises OSError, which must not abort the notebook.
try:
    start_http_server(8000)
except OSError as exc:
    print(f"ℹ️ Prometheus exporter not started on :8000 (port already in use?): {exc}")
else:
    print("✅ Prometheus metrics available at :8000")

# Registering an already-registered metric name raises ValueError, so reuse the
# gauge objects when they already exist in this kernel.
if "cpu_usage" not in globals():
    cpu_usage = Gauge('cpu_usage_percent', 'CPU usage percentage')
if "ram_usage" not in globals():
    ram_usage = Gauge('ram_usage_percent', 'RAM usage percentage')

def monitor_system():
    """Sample CPU and RAM usage and push both to the Prometheus gauges.

    Repeats every 5 seconds without terminating; meant for a background thread.
    """
    while True:
        cpu_pct = psutil.cpu_percent()
        cpu_usage.set(cpu_pct)
        ram_pct = psutil.virtual_memory().percent
        ram_usage.set(ram_pct)
        time.sleep(5)

# Skip the start when a sampler thread is already alive in this kernel, so
# repeated runs do not accumulate duplicate background threads.
if not any(t.name == "system-monitor" for t in threading.enumerate()):
    threading.Thread(target=monitor_system, name="system-monitor", daemon=True).start()

# %% [markdown]
# ## Step 5: Data Preparation
DATASET_PATH = "path_to_your_local_dataset/fashion-dataset/"  # e.g., C:/Users/USER/Downloads/fashion-dataset/
# Only the first 5000 rows are read; malformed CSV lines are skipped rather than aborting the load.
df = pd.read_csv(os.path.join(DATASET_PATH, "styles.csv"), nrows=5000, on_bad_lines='skip')
# The image file name is the product id plus ".jpg" (vectorised instead of a row-wise apply).
df['image'] = df['id'].astype(str) + ".jpg"
df = df.reset_index(drop=True)
mlflow.log_param("dataset_size", len(df))
df.head()

def img_path(img):
    """Build the path of image file ``img`` under the dataset's ``images`` folder."""
    images_dir = os.path.join(DATASET_PATH, "images")
    return os.path.join(images_dir, img)

# %% [markdown]
# ## Step 6: Model Creation
# Input resolution expected by the network; the channel count is added below.
img_width, img_height = 224, 224
# ImageNet-pretrained ResNet50 without its classification head, kept frozen.
base_model = ResNet50(
    weights='imagenet',
    include_top=False,
    input_shape=(img_width, img_height, 3),
)
base_model.trainable = False

# Stack the backbone and a global max-pooling layer to form the embedding model.
model = Sequential()
model.add(base_model)
model.add(GlobalMaxPooling2D())

mlflow.log_param("trainable_layers", 0)

# %% [markdown]
# ## Step 7: Embedding Extraction
def get_embedding(model, img_name):
    """Return the flattened output of ``model`` for a single dataset image.

    Args:
        model: Object whose ``predict`` accepts an image batch and a ``verbose``
            keyword (a Keras model in this notebook).
        img_name: Image file name, resolved through ``img_path``.

    Returns:
        1-D array obtained by flattening the model's prediction for the image.
    """
    img = image.load_img(img_path(img_name), target_size=(img_width, img_height))
    batch = image.img_to_array(img)
    batch = np.expand_dims(batch, axis=0)  # add the leading batch axis
    batch = preprocess_input(batch)
    # verbose=0: called once per image, so the default per-call progress bar
    # would otherwise flood the notebook output.
    return model.predict(batch, verbose=0).reshape(-1)

# Run the extractor once on the first catalogue row to record the vector length.
sample_img = df["image"].iloc[0]
emb = get_embedding(model, sample_img)
mlflow.log_param("embedding_dim", len(emb))

# %% [markdown]
# ## Step 8: Compute Embeddings and Similarity
# Embed a fixed random subset of 1000 rows, one column per embedding dimension.
df_sample = df.sample(n=1000, random_state=42)
df_embs = (
    df_sample['image']
    .apply(lambda img: get_embedding(model, img))
    .apply(pd.Series)
)
mlflow.log_metric("processed_images", df_embs.shape[0])

# Cosine similarity is one minus the pairwise cosine distance between embeddings.
cosine_sim = 1.0 - pairwise_distances(df_embs, metric='cosine')
mlflow.log_metric("mean_cosine_similarity", float(cosine_sim.mean()))

# %% [markdown]
# ## Step 9: Evidently Data Drift Dashboard (localhost:7000)
import subprocess
import sys

# Split the sampled rows 70/30 into a reference set and a "current" set.
df_ref = df_sample.sample(frac=0.7, random_state=42)
df_test = df_sample.drop(df_ref.index)

report = Report(metrics=[DataDriftPreset()])
report.run(reference_data=df_ref, current_data=df_test)
report.save_html("evidently_drift_report.html")
mlflow.log_artifact("evidently_drift_report.html")

# Serve the report from a background process. A foreground
# `!python3 -m http.server` never returns, so the rest of the notebook would
# not run. The handle is kept so a re-run reuses the live server; stop it with
# report_server.terminate().
if "report_server" not in globals() or report_server.poll() is not None:
    report_server = subprocess.Popen(
        [sys.executable, "-m", "http.server", "7000", "--directory", "."]
    )

print("✅ Evidently Dashboard: http://127.0.0.1:7000/evidently_drift_report.html")

# %% [markdown]
# ## Step 10: Wrap Up
# Record how many distinct master categories the loaded metadata contains, then close the run.
mlflow.log_metric("total_categories", df['masterCategory'].nunique())
mlflow.end_run()

# Summary of the local service endpoints.
print("\n".join([
    "✅ MLflow Tracking URL: http://127.0.0.1:5000",
    "✅ Prometheus running on port 8000",
    "✅ Grafana: http://127.0.0.1:3000",
    "✅ Evidently Dashboard: http://127.0.0.1:7000/evidently_drift_report.html",
]))
