In [3]:
# -----------------------------
# Standard Library Imports
# -----------------------------
import os
import re
import json
import logging
import warnings
import random
import csv
import ast
import base64
import multiprocessing
import traceback
from collections import defaultdict
from concurrent.futures import ProcessPoolExecutor, as_completed
from typing import List, Dict, Any

# -----------------------------
# Third-Party Library Imports
# -----------------------------
import numpy as np
import pandas as pd
import torch
import faiss
import redis
import jsonpickle
import nltk
from tqdm import tqdm
from transformers import BertTokenizer, BertModel
from faker import Faker
from textblob import TextBlob
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
from bs4 import BeautifulSoup

# -----------------------------
# Plotting and Visualization Imports
# -----------------------------
import plotly.graph_objects as go
import plotly.express as px

# -----------------------------
# IPython and Widget Imports
# -----------------------------
import ipywidgets as widgets
from IPython.display import display, HTML

# -----------------------------
# NLTK and Natural Language Processing Imports
# -----------------------------
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('omw-1.4', quiet=True)

from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# -----------------------------
# Warnings and Logging Setup
# -----------------------------
warnings.filterwarnings('ignore')
logging.getLogger('transformers').setLevel(logging.ERROR)

# -----------------------------
# Other Setup
# -----------------------------
# Additional initialization or configuration code (if any)


In [1]:

# Load the terpene and cannabinoid info from the provided JSON files in Jupyter format
with open('backend/data/terpene_info.json', encoding='utf-8') as terpene_file:
    terpene_info = json.load(terpene_file)

with open('/data/cannabinoid_info.json', encoding='utf-8') as cannabinoid_file:
    cannabinoid_info = json.load(cannabinoid_file)

# Assign positions, colors, and icons to cannabinoids
cannabinoids = []
cannabinoid_positions = np.linspace(1.5, 2.0, len(cannabinoid_info))
for i, (name, info) in enumerate(cannabinoid_info.items()):
    angle = 360 * i / len(cannabinoid_info)
    x = np.cos(np.radians(angle)) * cannabinoid_positions[i]
    y = np.sin(np.radians(angle)) * cannabinoid_positions[i]
    cannabinoids.append({
        'name': name,
        'color': f'hsl({(i * 40) % 360}, 70%, 60%)',
        'x': x,
        'y': y,
        'icon': '🟢' if not info['psychoactive'] else '🔴'
    })

# Assign positions, colors, and icons to terpenes
terpenes = []
terpene_positions = np.linspace(1.2, 1.5, len(terpene_info))
for i, (name, info) in enumerate(terpene_info.items()):
    angle = 360 * i / len(terpene_info) + (180 / len(terpene_info))
    x = np.cos(np.radians(angle)) * terpene_positions[i]
    y = np.sin(np.radians(angle)) * terpene_positions[i]
    terpenes.append({
        'name': name,
        'color': f'hsl({(i * 30) % 360}, 70%, 70%)',
        'x': x,
        'y': y,
        'icon': info.get('icon', '🍃')
    })

# Create the main Plotly figure
fig = go.Figure()

# Function to add labels and hover text for terpenes and cannabinoids with enhanced layout
def add_label(item, info_dict, label_type='terpene'):
    name = item['name']
    color = item['color']
    x = item['x']
    y = item['y']
    icon = item['icon']
    info = info_dict.get(name, {})

    # Define hover text layout using HTML-like formatting
    if label_type == 'terpene':
        hovertext = (
            f"<b>{name}</b><br>"
            f"<i>Aroma:</i> {', '.join(info.get('aroma_profile', []))}<br>"
            f"<i>Effects:</i> {', '.join(info.get('effects', []))}<br>"
            f"<i>Potential Benefits:</i><br>• {'<br>• '.join(info.get('potential_benefits', []))}<br>"
            f"<i>Entourage Effect:</i><br>{info.get('entourage_effect', 'N/A')}"
        )
    elif label_type == 'cannabinoid':
        psychoactive = info.get('psychoactive', False)
        psychoactive = 'Yes' if psychoactive else 'No'

        # Enhanced hovertext layout for cannabinoids with bullet points and spacing
        hovertext = (
            f"<b>{name} ({info.get('full_name', '')})</b><br>"
            f"<i>Effects:</i><br>• {'<br>• '.join(info.get('effects', [])[:5])}..."  # Limit to first 5 effects
            f"<br><i>Medical Applications:</i><br>• {'<br>• '.join(info.get('medical_applications', [])[:3])}..."  # Limit to 3 applications
            f"<br><i>Psychoactive:</i> {psychoactive}<br>"
            f"<i>Entourage Effect:</i><br>{info.get('entourage_effect', 'N/A')}"
        )

    # Add the label to the Plotly figure
    fig.add_trace(go.Scatter(
        x=[x],
        y=[y],
        mode='markers+text',
        marker=dict(
            size=20,
            color=color,
            symbol='circle',
            line=dict(width=2, color='white')
        ),
        text=f"{icon}\n{name}",
        textposition='top center',
        textfont=dict(size=10, color='white', family='Arial'),
        hoverinfo='text',
        hovertext=hovertext,
        name=name,
        showlegend=False,
        opacity=0.9
    ))

# Add terpenes and cannabinoids to the figure
for terpene in terpenes:
    add_label(terpene, terpene_info, label_type='terpene')

for cannabinoid in cannabinoids:
    add_label(cannabinoid, cannabinoid_info, label_type='cannabinoid')

# Full-screen and layout improvements
fig.update_layout(
    title={
        'text': "Understanding the Entourage Effect:<br>Greater Than The Sum of Its Parts",
        'y': 0.95,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
        'font': dict(size=24, color='white', family='Arial')
    },
    xaxis=dict(
        showgrid=False,
        zeroline=False,
        showticklabels=False,
        range=[-2.5, 2.5]
    ),
    yaxis=dict(
        showgrid=False,
        zeroline=False,
        showticklabels=False,
        range=[-2.5, 2.5],
        scaleanchor="x",
        scaleratio=1
    ),
    paper_bgcolor='rgba(30, 30, 30, 1)',
    plot_bgcolor='rgba(30, 30, 30, 1)',
    width=900,
    height=900,
    margin=dict(l=50, r=50, t=150, b=50),
    font=dict(color='white', family='Arial'),
    updatemenus=[
        {
            "buttons": [
                {
                    "args": [{"fullscreen": True}],
                    "label": "Full Screen",
                    "method": "relayout",
                }
            ],
            "direction": "left",
            "pad": {"r": 10, "t": 10},
            "showactive": False,
            "type": "buttons",
            "x": 0.0,
            "xanchor": "left",
            "y": 1.2,
            "yanchor": "top"
        }
    ]
)

# Add a descriptive annotation about the entourage effect
fig.add_annotation(
    x=0,
    y=-2.2,
    text=(
        "<b>The Entourage Effect</b><br>"
        "The Entourage Effect refers to the synergistic interaction between various cannabinoids and terpenes, "
        "suggesting that these compounds work together to produce a more pronounced therapeutic effect "
        "than any single compound alone.<br><i>Hover over the compounds to learn more about each one.</i>"
    ),
    showarrow=False,
    font=dict(size=12, color='white', family='Arial'),
    align='center',
    bordercolor='white',
    borderwidth=1,
    bgcolor='rgba(42, 42, 42, 0.8)',
    xanchor='center',
    yanchor='top'
)

# Add the static background image (ensure image path is correct)
with open(".venv/Weedimage.png", "rb") as image_file:
    encoded_image = base64.b64encode(image_file.read()).decode()

fig.update_layout(
    images=[
        dict(
            source='data:image/png;base64,{}'.format(encoded_image),
            xref="paper",
            yref="paper",
            x=0,
            y=1,
            sizex=1,
            sizey=1,
            sizing="stretch",
            opacity=0.5,
            layer="below"
        )
    ]
)

# Display the figure
fig.show()


FileNotFoundError: [Errno 2] No such file or directory: 'backend/data/terpene_info.json'

In [2]:
from functools import lru_cache

# Suppress warnings from PyTorch and Transformers
warnings.filterwarnings("ignore", category=UserWarning)
logging.getLogger("transformers").setLevel(logging.ERROR)
logging.getLogger("torch").setLevel(logging.ERROR)

# Ensure that progress_apply is properly registered
tqdm.pandas()

# -----------------------------
# Step 1: Load terpene and cannabinoid info from files
# -----------------------------
def load_json(filepath: str):
    if os.path.exists(filepath):
        with open(filepath, 'r') as f:
            return json.load(f)
    else:
        logging.warning(f"{filepath} not found, loading empty dictionary.")
        return {}

terpene_info = load_json('.venv/terpene_info.json')
cannabinoid_info = load_json('.venv/cannabinoid_info.json')

# -----------------------------
# Step 2: Define regular expressions and synonym cache 
# -----------------------------
THC_REGEX = re.compile(r'thc[a-z]*:\s*([\d.]+)%', re.IGNORECASE)
CBD_REGEX = re.compile(r'cbd[a-z]*:\s*([\d.]+)%', re.IGNORECASE)
HTML_TAGS_REGEX = re.compile(r'<br>|<[^>]+>')
NON_ASCII_REGEX = re.compile(r'[^\x00-\x7F]+')
NON_ALPHABETIC_REGEX = re.compile(r'[^a-zA-Z\s]')
PIPE_COMMA_REGEX = re.compile(r'\||,')
UNWANTED_ENTRIES = {'unknown', 'not yet listed', 'not available'}
EXTRA_SPACES_REGEX = re.compile(r'\s+')

@lru_cache(maxsize=None)
def get_synonyms(word: str) -> set:
    synonyms = set()
    for syn in wordnet.synsets(word):
        synonyms.update(lemma.name().replace('_', ' ').lower() for lemma in syn.lemmas())
    synonyms.discard(word.lower())
    return synonyms

# -----------------------------
# Step 3: Define data cleaning functions
# -----------------------------
def clean_description_series(text_series: pd.Series) -> pd.Series:
    return (
        text_series.astype(str)
        .str.replace(HTML_TAGS_REGEX, '', regex=True)
        .str.encode('ascii', 'ignore').str.decode('ascii')
        .str.replace(NON_ALPHABETIC_REGEX, ' ', regex=True)
        .str.lower().str.strip()
        .str.replace(EXTRA_SPACES_REGEX, ' ', regex=True)
    )

def clean_pipe_columns_series(text_series: pd.Series) -> pd.Series:
    return (
        text_series.astype(str)
        .str.split(PIPE_COMMA_REGEX)
        .apply(lambda parts: ', '.join(sorted(
            set(part.strip() for part in parts if part.strip().lower() not in UNWANTED_ENTRIES and part.strip())
        )) if parts else 'Unknown')
    )

def standardize_type_series(type_series: pd.Series) -> pd.Series:
    type_series = type_series.astype(str).str.lower().str.strip()
    return np.select(
        [
            type_series.str.contains('hybrid'),
            type_series.str.contains('indica'),
            type_series.str.contains('sativa')
        ],
        ['Hybrid', 'Indica', 'Sativa'],
        default='Unknown'
    )

# -----------------------------
# Step 4: Define mappings for cannabinoids and terpenes
# -----------------------------
flavor_aroma_to_terpene = defaultdict(set)
cannabinoid_synonyms = {}

# Process terpenes
for terpene, info in tqdm(terpene_info.items(), desc='Processing Terpenes'):
    for flavor in info.get('taste_profile', []):
        flavor_lower = flavor.lower()
        flavor_aroma_to_terpene[flavor_lower].add(terpene)
        for syn in get_synonyms(flavor_lower):
            flavor_aroma_to_terpene[syn].add(terpene)

# Process cannabinoids
for cannabinoid, info in tqdm(cannabinoid_info.items(), desc='Processing Cannabinoids'):
    cannabinoid_lower = cannabinoid.lower()
    synonyms = get_synonyms(cannabinoid_lower)
    synonyms.add(cannabinoid_lower)
    cannabinoid_synonyms[cannabinoid_lower] = synonyms

# -----------------------------
# Step 5: Define mapping functions
# -----------------------------
def map_cannabinoids(description: str) -> str:
    all_cannabinoids = set(cannabinoid_info.keys())
    if pd.isnull(description) or not description.strip():
        return ', '.join(sorted(all_cannabinoids))
    tokens = set(description.lower().split())
    cannabinoids_found = {cannabinoid.title() for cannabinoid, synonyms in cannabinoid_synonyms.items() if tokens & synonyms}
    return ', '.join(sorted(cannabinoids_found)) if cannabinoids_found else ', '.join(sorted(all_cannabinoids))

def map_terpenes(description: str) -> str:
    if pd.isnull(description) or not description.strip():
        return 'Unknown'
    tokens = set(description.lower().split())
    terpenes_found = set()
    for word in tokens:
        terpenes_found.update(flavor_aroma_to_terpene.get(word, set()))
    return ', '.join(sorted(terpenes_found)) if terpenes_found else 'Unknown'

# -----------------------------
# Step 6: Load strain data and apply cleaning functions
# -----------------------------
def clean_strain_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df['Strain Description'] = clean_description_series(df['Strain Description'])
    df.drop_duplicates(subset=['Strain_Name', 'Strain Description'], inplace=True)
    columns_to_clean = ['Flavors', 'Aromas', 'May Relieve', 'Effects']
    for column in columns_to_clean:
        if column in df.columns:
            df[column] = clean_pipe_columns_series(df[column])
        else:
            df[column] = 'Unknown'
    if 'Type' in df.columns:
        df['Type'] = standardize_type_series(df['Type'])
    else:
        df['Type'] = 'Unknown'
    df.reset_index(drop=True, inplace=True)
    df['strain_id'] = df.index  # Ensure strain_id is an integer
    return df

# -----------------------------
# Step 7: Process mappings and create additional fields
# -----------------------------
file_path = '.venv/cleaned_and_reordered_data.csv'
strain_dataframe = pd.read_csv(file_path, encoding='utf-8', on_bad_lines='skip')
strain_dataframe_cleaned = clean_strain_dataframe(strain_dataframe)

tqdm.pandas(desc="Applying Mapping Functions")
strain_dataframe_cleaned['Cannabinoid Profile'] = strain_dataframe_cleaned['Strain Description'].progress_apply(map_cannabinoids)
strain_dataframe_cleaned['Terpene Profile'] = strain_dataframe_cleaned['Strain Description'].progress_apply(map_terpenes)

# -----------------------------
# Step 8: MultiLabelBinarizer for Categorical Data
# -----------------------------
def apply_multilabel_binarizer(df: pd.DataFrame, column: str) -> pd.DataFrame:
    mlb = MultiLabelBinarizer()
    df[column] = df[column].astype(str).apply(
        lambda x: [item.strip().lower() for item in x.split(", ") if item.strip().lower() != 'unknown']
    )
    transformed = mlb.fit_transform(df[column])
    return pd.DataFrame(transformed, columns=[f"{column}_{cls}" for cls in mlb.classes_], index=df.index)

categorical_columns = ['Flavors', 'Aromas', 'Effects', 'May Relieve', 'Type', 'Cannabinoid Profile', 'Terpene Profile']

binarized_dfs = {}
for column in tqdm(categorical_columns, desc="Applying MultiLabelBinarizer"):
    binarized_dfs[column] = apply_multilabel_binarizer(strain_dataframe_cleaned, column)

strain_dataframe_encoded = pd.concat([strain_dataframe_cleaned] + list(binarized_dfs.values()), axis=1)

# -----------------------------
# Step 9: Drop Redundant Columns and Final Clean-up
# -----------------------------
columns_to_drop = categorical_columns + ['Strain Description']
strain_dataframe_final = strain_dataframe_encoded.drop(columns=columns_to_drop, errors='ignore')

# Pre-emptively handle inf and NaN values
strain_dataframe_final.replace([np.inf, -np.inf], np.nan, inplace=True)
strain_dataframe_final.fillna(0, inplace=True)

# Verify that strain_id remains an integer
strain_dataframe_final['strain_id'] = strain_dataframe_final['strain_id'].astype(int)

# -----------------------------
# Step 10: Generate BERT Embeddings (Optimized)
# -----------------------------
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
bert_model = BertModel.from_pretrained(model_name).to(device)

def get_bert_embeddings(texts, tokenizer, model, device, max_length=128, batch_size=32):
    all_embeddings = []
    progress_bar = tqdm(total=len(texts), desc="Generating BERT Embeddings", unit="text", leave=True)
    
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]
        
        # Ensure all texts are strings
        batch_texts = [str(text) for text in batch_texts]
        
        # Batch tokenize the input texts
        encoded_input = tokenizer(
            batch_texts, 
            padding=True, 
            truncation=True, 
            max_length=max_length, 
            return_tensors='pt'
        ).to(device)
        
        with torch.no_grad():
            outputs = model(**encoded_input)
            embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()  # CLS token embedding

        all_embeddings.extend(embeddings)
        progress_bar.update(len(batch_texts))
    
    progress_bar.close()
    return np.array(all_embeddings, dtype=np.float32)

# Ensure that the texts are correctly formatted
texts = strain_dataframe_cleaned['Strain Description'].fillna('').astype(str).tolist()

# Generate BERT embeddings
bert_embeddings = get_bert_embeddings(
    texts,
    tokenizer, 
    bert_model, 
    device, 
    max_length=128,  
    batch_size=64    
)

# -----------------------------
# Step 11: Cosine Similarity Matrix and Save Output
# -----------------------------
cosine_sim_matrix = cosine_similarity(bert_embeddings)
output_file_path = '.venv/cleaned_strain_data_final_with_embeddings.csv'

# Save the embeddings into the dataframe
embedding_columns = [f"embedding_{i}" for i in range(bert_embeddings.shape[1])]
embeddings_df = pd.DataFrame(bert_embeddings, columns=embedding_columns)
strain_dataframe_final_with_embeddings = pd.concat([strain_dataframe_final.reset_index(drop=True), embeddings_df], axis=1)

strain_dataframe_final_with_embeddings.to_csv(output_file_path, index=False)

cosine_similarity_output_path = '.venv/strain_cosine_similarity_matrix.csv'
cosine_sim_df = pd.DataFrame(cosine_sim_matrix, index=strain_dataframe_cleaned['strain_id'], columns=strain_dataframe_cleaned['strain_id'])
cosine_sim_df.to_csv(cosine_similarity_output_path)

# -----------------------------
# Step 12: FAISS for Top-N Similarities
# -----------------------------
# Normalize embeddings for cosine similarity
faiss.normalize_L2(bert_embeddings)
index = faiss.IndexFlatIP(bert_embeddings.shape[1])
index.add(bert_embeddings)

TOP_N = 10
distances, indices = index.search(bert_embeddings, TOP_N + 1)
similarity_data = []

for i in tqdm(range(len(strain_dataframe_cleaned)), desc="Finding Top-N Similarities"):
    strain_id = strain_dataframe_cleaned['strain_id'].iloc[i]
    for j in range(1, TOP_N + 1):
        similar_strain_id = strain_dataframe_cleaned['strain_id'].iloc[indices[i][j]]
        similarity_score = distances[i][j]
        similarity_data.append({
            'strain_id': strain_id,
            'similar_strain_id': similar_strain_id,
            'similarity_score': similarity_score
        })

similarity_output_path = '.venv/strain_cosine_similarity_matrix_top10.csv'
pd.DataFrame(similarity_data).to_csv(similarity_output_path, index=False)

# -----------------------------
# Final confirmation of results
# -----------------------------
print(strain_dataframe_final_with_embeddings.tail())

# Ensure any remaining tensor resources are cleaned up
torch.cuda.empty_cache()

NameError: name 'warnings' is not defined

In [3]:
import faker

CONFIG = {
    'num_profiles': 150000,
    'min_ratings_per_user': 50,
    'max_ratings_per_user': 300,
    'total_ratings': 20000000,
    'personas_filepath': '.venv/user_profiles_template.csv',
    'reviews_filepath': '.venv/review_templates.csv',
    'output_filepath': '.venv/synthetic_profiles_with_reviews.csv',
    'strain_data_filepath': '.venv/cleaned_strain_data_final.csv',
    'chemical_data': {
        'terpene_info': '.venv/terpene_info.json',
        'cannabinoid_info': '.venv/cannabinoid_info.json'
    },
    'redis': {
        'host': 'localhost',
        'port': 6379,
        'db': 0,
        'retries': 5
    },
    'logging': {
        'level': 'INFO',
        'file': '.venv/synthetic_data_generation.log'
    },
    'parallel': {
        'processes': multiprocessing.cpu_count(),
        'chunk_size': 1000
    }
}

def configure_logging(log_level: str, log_file: str):
    numeric_level = getattr(logging, log_level.upper(), None)
    if not isinstance(numeric_level, int):
        numeric_level = logging.WARNING

    logging.basicConfig(
        level=numeric_level,
        format='%(asctime)s [%(levelname)s] %(message)s',
        handlers=[
            logging.FileHandler(log_file),
            logging.StreamHandler()
        ]
    )

configure_logging(CONFIG['logging']['level'], CONFIG['logging']['file'])
logger = logging.getLogger(__name__)

fake = Faker()
fake_unique = Faker()
fake_unique.unique.clear()

def download_nltk_data():
    try:
        nltk.download('wordnet', quiet=True)
        nltk.download('omw-1.4', quiet=True)
    except Exception as e:
        raise

download_nltk_data()

def load_strain_data(filepath: str) -> pd.DataFrame:
    try:
        if not os.path.exists(filepath):
            raise FileNotFoundError(f"Strain data file not found: {filepath}")

        strain_dataframe = pd.read_csv(filepath)

        if 'strain_id' not in strain_dataframe.columns:
            strain_dataframe.reset_index(drop=True, inplace=True)
            strain_dataframe['strain_id'] = strain_dataframe.index + 1

        strain_dataframe.drop_duplicates(subset='strain_id', inplace=True)

        return strain_dataframe

    except FileNotFoundError as e:
        raise
    except pd.errors.ParserError as e:
        raise

strain_data = load_strain_data(CONFIG['strain_data_filepath'])
strain_data_list = strain_data[['strain_id', 'Strain_Name']].to_dict('records')
strain_name_to_id = {row['Strain_Name']: row['strain_id'] for row in strain_data_list}

def initialize_redis(host: str, port: int, db: int, retries: int):
    retry_count = 0
    while retry_count < retries:
        try:
            pool = redis.ConnectionPool(host=host, port=port, db=db, decode_responses=True)
            cache = redis.StrictRedis(connection_pool=pool)
            cache.ping()
            return cache
        except redis.ConnectionError:
            retry_count += 1
    return None

redis_client = initialize_redis(
    host=CONFIG['redis']['host'],
    port=CONFIG['redis']['port'],
    db=CONFIG['redis']['db'],
    retries=CONFIG['redis']['retries']
)

def load_json_file(filepath: str) -> Dict:
    try:
        if not os.path.exists(filepath):
            raise FileNotFoundError(f"File not found: {filepath}")

        with open(filepath, 'r', encoding='utf-8') as file:
            data = json.load(file)
        return data
    except FileNotFoundError as e:
        raise
    except json.JSONDecodeError:
        raise

def validate_terpene_info(data: Dict):
    required_keys = {'taste_profile', 'effects'}
    for terp, details in data.items():
        if not required_keys.issubset(details.keys()):
            missing = required_keys - set(details.keys())
            raise ValueError(f"Terpene '{terp}' is missing required keys: {missing}")

def load_chemical_data(terpene_filepath: str, cannabinoid_filepath: str) -> Dict:
    terpene_info = load_json_file(terpene_filepath)
    validate_terpene_info(terpene_info)
    cannabinoid_info = load_json_file(cannabinoid_filepath)
    return {'terpenes': terpene_info, 'cannabinoids': cannabinoid_info}

chemical_data = load_chemical_data(
    CONFIG['chemical_data']['terpene_info'],
    CONFIG['chemical_data']['cannabinoid_info']
)

def extract_attributes(chemical_data: Dict) -> Dict:
    all_terpenes = list(chemical_data['terpenes'].keys())
    all_flavors = list(set(
        flavor.lower()
        for terpene in chemical_data['terpenes'].values()
        for flavor in terpene.get('taste_profile', [])
    ))
    all_effects = list(set(
        effect.lower()
        for terpene in chemical_data['terpenes'].values()
        for effect in terpene.get('effects', [])
    ))
    return {
        'terpenes': all_terpenes,
        'flavors': all_flavors,
        'effects': all_effects
    }

extracted_attributes = extract_attributes(chemical_data)

def encode_categorical_features(df: pd.DataFrame, categorical_columns: List[str]) -> pd.DataFrame:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    encoded_features = encoder.fit_transform(df[categorical_columns])
    encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(categorical_columns))
    encoded_df.index = df.index
    df = df.drop(columns=categorical_columns)
    df = pd.concat([df, encoded_df], axis=1)
    return df

defined_attributes = {
    'available_states': ['California', 'Colorado', 'Oregon', 'Washington', 'Nevada', 'Arizona'],
    'usage_frequencies': ['Daily', 'Weekly', 'Monthly', 'Occasionally'],
    'strain_types': ['Indica', 'Sativa', 'Hybrid'],
    'consumption_methods': ['Smoking', 'Vaporizing', 'Edibles', 'Tinctures', 'Topicals'],
    'product_type_preferences': ['Edibles', 'Flower', 'Tinctures', 'Concentrates', 'Capsules'],
    'consumption_times': ['Morning', 'Afternoon', 'Evening', 'Night'],
    'social_contexts': ['Alone', 'With Friends', 'Social Gatherings'],
    'genders': ['male', 'female', 'non-binary', 'other']
}

def load_and_expand_personas(filepath: str, attributes: Dict, min_ratings: int, max_ratings: int, total_ratings: int) -> pd.DataFrame:
    try:
        if os.path.exists(filepath):
            personas = pd.read_csv(filepath)
        else:
            personas = pd.DataFrame()

        fake_unique = Faker()
        fake_unique.unique.clear()

        required_columns = ['user_id', 'user_name', 'age', 'gender', 'location', 'usage_frequency',
                            'consumption_time', 'social_context', 'preferred_strain_type',
                            'product_type_preference', 'num_ratings']
        for col in required_columns:
            if col not in personas.columns:
                if col == 'user_id':
                    personas['user_id'] = range(1, len(personas) + 1)
                elif col == 'user_name':
                    personas['user_name'] = [fake_unique.unique.name() for _ in range(len(personas))]
                elif col == 'num_ratings':
                    personas['num_ratings'] = [random.randint(min_ratings, max_ratings) for _ in range(len(personas))]
                else:
                    key = col + 's' if col + 's' in attributes else col
                    if key in attributes:
                        personas[col] = [random.choice(attributes[key]) for _ in range(len(personas))]
                    else:
                        personas[col] = 'Unknown'

        if 'num_ratings' in personas.columns:
            personas['num_ratings'] = personas['num_ratings'].apply(
                lambda x: random.randint(min_ratings, max_ratings) if pd.isnull(x) else int(x)
            ).astype(int)

        additional_personas = []
        current_count = len(personas)

        generated_ratings = personas['num_ratings'].sum()
        user_id_counter = personas['user_id'].max() + 1 if current_count > 0 else 1

        while generated_ratings < total_ratings:
            try:
                user_name = fake_unique.unique.name()
            except faker.exceptions.UniquenessException:
                fake_unique.unique.clear()
                user_name = fake_unique.unique.name()

            num_ratings = random.randint(min_ratings, max_ratings)

            if generated_ratings + num_ratings > total_ratings:
                num_ratings = total_ratings - generated_ratings

            generated_ratings += num_ratings

            additional_personas.append({
                'user_id': user_id_counter,
                'user_name': user_name,
                'age': random.randint(18, 65),
                'gender': random.choice(attributes['genders']),
                'location': random.choice(attributes['available_states']),
                'usage_frequency': random.choice(attributes['usage_frequencies']),
                'consumption_time': random.choice(attributes['consumption_times']),
                'social_context': random.choice(attributes['social_contexts']),
                'preferred_strain_type': random.choice(attributes['strain_types']),
                'product_type_preference': random.choice(attributes['product_type_preferences']),
                'num_ratings': num_ratings
            })

            user_id_counter += 1

        additional_personas_df = pd.DataFrame(additional_personas)
        personas = pd.concat([personas, additional_personas_df], ignore_index=True)

        categorical_columns = ['preferred_strain_type', 'consumption_time', 'usage_frequency', 'product_type_preference', 'social_context', 'gender']
        personas = encode_categorical_features(personas, categorical_columns)

        return personas
    except FileNotFoundError as e:
        raise
    except pd.errors.ParserError as e:
        raise
    except Exception as e:
        raise

personas = load_and_expand_personas(
    filepath=CONFIG['personas_filepath'],
    attributes=defined_attributes,
    min_ratings=CONFIG['min_ratings_per_user'],
    max_ratings=CONFIG['max_ratings_per_user'],
    total_ratings=CONFIG['total_ratings']
)

def load_review_templates(filepath: str) -> pd.DataFrame:
    try:
        if not os.path.exists(filepath):
            raise FileNotFoundError(f"Review templates file not found: {filepath}")

        reviews_df = pd.read_csv(filepath)
        return reviews_df
    except FileNotFoundError as e:
        raise
    except pd.errors.ParserError as e:
        raise

reviews_df = load_review_templates(CONFIG['reviews_filepath'])

def get_synonym(word: str) -> str:
    synonyms = wordnet.synsets(word)
    if synonyms:
        synonym = synonyms[0].lemmas()[0].name()
        if synonym.lower() != word.lower():
            return synonym.replace('_', ' ')
    return word

def augment_review_text(review_text: str) -> str:
    words = review_text.split()
    augmented = []
    for word in words:
        if random.random() < 0.2:
            synonym = get_synonym(word)
            augmented.append(synonym if synonym else word)
        else:
            augmented.append(word)
    return ' '.join(augmented)

intro_templates = [
    "I recently tried {strain_name} using {consumption_method}.",
    "My experience with {strain_name} through {consumption_method} was quite {adjective}.",
    "Using {consumption_method}, I found {strain_name} to be {adjective}.",
    "After using {consumption_method} with {strain_name}, I felt {feeling}."
]

body_templates = [
    "{rating_comment}",
    "The effects were {terpene_effects} and {entourage_effect}.",
    "I felt {terpene_effects}, which made the experience {experience_description}.",
    "Overall, the strain provided {entourage_effect} effects."
]

conclusion_templates = [
    "It had a distinct {taste_profile} flavor, which I found very enjoyable.",
    "The {taste_profile} taste complemented the effects well.",
    "The flavor was {taste_profile}, enhancing the overall experience.",
    "I particularly liked the {taste_profile} flavor."
]

adjectives = ['pleasant', 'intense', 'mellow', 'strong', 'subtle']
feelings = ['relaxed', 'energized', 'uplifted', 'calm']
experience_descriptions = ['unforgettable', 'soothing', 'refreshing', 'invigorating']

def generate_combined_review(strain_name: str, consumption_method: str, rating_comment: str,
                             terpene_effects: str, entourage_effect: str, taste_profile: str) -> str:
    intro = random.choice(intro_templates).format(
        strain_name=strain_name,
        consumption_method=consumption_method,
        adjective=random.choice(adjectives),
        feeling=random.choice(feelings)
    )
    body = random.choice(body_templates).format(
        rating_comment=rating_comment,
        terpene_effects=terpene_effects,
        entourage_effect=entourage_effect,
        experience_description=random.choice(experience_descriptions)
    )
    conclusion = random.choice(conclusion_templates).format(
        taste_profile=taste_profile
    )
    return f"{intro} {body} {conclusion}"

def calculate_sentiment(rating: int, comment: str) -> float:
    try:
        analysis = TextBlob(comment)
        sentiment_polarity = analysis.sentiment.polarity
        adjusted_sentiment = sentiment_polarity + ((rating - 3) * 0.1)
        return max(min(adjusted_sentiment, 1.0), -1.0)
    except Exception as e:
        return 0.0

def generate_unique_cache_key(user_id: int, strain_id: int) -> str:
    return f"{user_id}_{strain_id}"

def main():
    global strain_data_list, reviews_df, extracted_attributes, defined_attributes, strain_name_to_id

    attributes = {**extracted_attributes, **defined_attributes}

    strain_list = strain_data_list

    personas['num_ratings'] = personas['num_ratings'].apply(
        lambda x: random.randint(CONFIG['min_ratings_per_user'], CONFIG['max_ratings_per_user']) if pd.isnull(x) else int(x)
    ).astype(int)

    total_reviews = personas['num_ratings'].sum()

    with open(CONFIG['output_filepath'], 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['user_id', 'user_name', 'strain_name', 'strain_id', 'rating', 'rating_comment', 'side_effects', 'taste_profile', 'review', 'sentiment_score']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        chunk_size = CONFIG['parallel']['chunk_size']
        persona_chunks = [personas.iloc[i:i + chunk_size] for i in range(0, len(personas), chunk_size)]

        for chunk_index, persona_chunk in enumerate(persona_chunks):
            tasks = []
            for _, row in persona_chunk.iterrows():
                profile_dict = row.to_dict()
                tasks.append(profile_dict)

            with ProcessPoolExecutor(max_workers=CONFIG['parallel']['processes']) as executor:
                futures = {executor.submit(process_user_reviews, profile): profile['user_id'] for profile in tasks}

                for future in tqdm(as_completed(futures), total=len(futures), desc=f"Processing Chunk {chunk_index + 1}"):
                    user_id = futures[future]
                    try:
                        user_reviews = future.result()
                        if user_reviews:
                            for review in user_reviews:
                                writer.writerow(review)
                    except Exception as e:
                        pass

def process_user_reviews(profile: Dict) -> List[Dict]:
    user_reviews = []
    try:
        num_ratings = int(profile['num_ratings'])
        user_id = profile.get('user_id', 'unknown')

        worker_redis_client = None

        for _ in range(num_ratings):
            strain = random.choice(strain_data_list)
            review = generate_synthetic_review(profile, strain, worker_redis_client)
            if review:
                user_reviews.append(review)
    except Exception as e:
        pass
    return user_reviews

def generate_synthetic_review(profile: Dict, strain: Dict, worker_redis_client=None) -> Dict:
    try:
        strain_name = strain.get('Strain_Name', 'Unknown Strain')
        strain_id = strain.get('strain_id') or strain_name_to_id.get(strain_name, None)

        if strain_id is None:
            return {}

        user_id = profile['user_id']

        selected_review = reviews_df.sample(1).iloc[0]
        consumption_method = random.choice(defined_attributes['consumption_methods'])
        sentiment_score = calculate_sentiment(selected_review['rating'], selected_review['rating_comment'])

        review_text = generate_combined_review(
            strain_name=strain_name,
            consumption_method=consumption_method,
            rating_comment=selected_review['rating_comment'],
            terpene_effects=random.choice(extracted_attributes['effects']),
            entourage_effect=random.choice(extracted_attributes['effects']),
            taste_profile=selected_review['taste_profile']
        )

        review_text = augment_review_text(review_text)

        review_dict = {
            'user_id': user_id,
            'user_name': profile['user_name'],
            'strain_name': strain_name,
            'strain_id': strain_id,
            'rating': int(selected_review['rating']),
            'rating_comment': selected_review['rating_comment'],
            'side_effects': selected_review.get('side_effects', ''),
            'taste_profile': selected_review['taste_profile'],
            'review': review_text,
            'sentiment_score': sentiment_score
        }

        return review_dict
    except Exception as e:
        return {}

if __name__ == "__main__":
    main()


NameError: name 'multiprocessing' is not defined

In [4]:
# collaborative_filtering_optimized.py

import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import logging
import pickle
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
import copy
import os
from logging.handlers import RotatingFileHandler

# -----------------------------
# Step 1: Define Configuration Parameters
# -----------------------------
CONFIG_ALS = {
    'interaction_filepath': '.venv/synthetic_profiles_with_reviews.csv',  # Path to interaction data
    'als_model_filepath': '.venv/als_model.pth',  # Path to save the trained ALS model
    'als_test_data_filepath': '.venv/als_test_data.csv',  # Path to save the test dataset
    'strain_mapping_path': '.venv/mappings/strain_id_mapping.pkl',  # Path to strain mapping
    'strain_embeddings_path': '.venv/strain_embeddings.npy',  # Path to strain embeddings
    'logging': {
        'level': 'INFO',  # Logging level
        'file': '.venv/collaborative_filtering.log'  # Log file path
    },
    'training': {
        'latent_factors': 50,  # Number of latent factors
        'epochs': 100,  # Number of training epochs
        'learning_rate': 0.005,  # Learning rate for the optimizer
        # 'weight_decay': 1e-3,  # Weight decay (L2 regularization) -- Removed to fix the error
        'batch_size': 512,  # Batch size for training
        'patience': 10  # Early stopping patience
    }
}

# -----------------------------
# Step 2: Configure Logging
# -----------------------------
def configure_logging(log_level: str, log_file: str):
    """
    Configure logging settings with rotation to manage log file size.
    """
    numeric_level = getattr(logging, log_level.upper(), None)
    if not isinstance(numeric_level, int):
        numeric_level = logging.WARNING

    logger = logging.getLogger(__name__)

    # Prevent adding multiple handlers if they already exist
    if not logger.handlers:
        logger.setLevel(numeric_level)

        formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(message)s')

        # Rotating file handler: 5MB per file, keep 5 backups
        file_handler = RotatingFileHandler(log_file, maxBytes=5*1024*1024, backupCount=5)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

        # Console handler
        console_handler = logging.StreamHandler()
        console_handler.setFormatter(formatter)
        logger.addHandler(console_handler)

    return logger

logger = configure_logging(CONFIG_ALS['logging']['level'], CONFIG_ALS['logging']['file'])

# -----------------------------
# Step 3: Define ALS Model with Bias Terms (PyTorch)
# -----------------------------
class ALSModel(nn.Module):
    def __init__(self, num_users: int, num_strains: int, latent_factors: int = 10):
        """
        Initialize the ALS model with user and strain embeddings, including bias terms.
        """
        super(ALSModel, self).__init__()
        self.user_factors = nn.Embedding(num_users, latent_factors, sparse=True)  # Use sparse embeddings
        self.strain_factors = nn.Embedding(num_strains, latent_factors, sparse=True)
        self.user_bias = nn.Embedding(num_users, 1, sparse=True)
        self.strain_bias = nn.Embedding(num_strains, 1, sparse=True)

        # Initialize embeddings with a normal distribution
        nn.init.normal_(self.user_factors.weight, 0, 0.1)
        nn.init.normal_(self.strain_factors.weight, 0, 0.1)
        nn.init.constant_(self.user_bias.weight, 0)
        nn.init.constant_(self.strain_bias.weight, 0)

    def forward(self, user_indices, strain_indices):
        """
        Forward pass to compute the dot product of user and strain embeddings, including biases.
        """
        user_embedding = self.user_factors(user_indices)
        strain_embedding = self.strain_factors(strain_indices)
        user_bias = self.user_bias(user_indices).squeeze()
        strain_bias = self.strain_bias(strain_indices).squeeze()
        return torch.sum(user_embedding * strain_embedding, dim=1) + user_bias + strain_bias

# -----------------------------
# Step 4: Load and Preprocess Interaction Data
# -----------------------------
def load_interaction_data(filepath: str) -> pd.DataFrame:
    """
    Load user-item interaction data from a CSV file and preprocess it.
    """
    if not os.path.exists(filepath):
        logger.error(f"Interaction data file does not exist at path: {filepath}")
        raise FileNotFoundError(f"File not found: {filepath}")

    try:
        # Specify dtypes to optimize memory usage
        dtype = {'user_id': 'int32', 'strain_id': 'int32', 'rating': 'float32'}
        df = pd.read_csv(filepath, usecols=['user_id', 'strain_id', 'rating'], dtype=dtype)
        expected_columns = {'user_id', 'strain_id', 'rating'}
        if not expected_columns.issubset(df.columns):
            missing = expected_columns - set(df.columns)
            logger.error(f"Interaction data is missing columns: {missing}")
            raise ValueError(f"Missing columns in interaction data: {missing}")
        logger.info(f"Loaded interaction data from {filepath} with shape {df.shape}")
    except pd.errors.ParserError:
        logger.error(f"Error parsing CSV file: {filepath}")
        raise

    # Handle multiple ratings per user-strain pair by averaging
    if df.duplicated(subset=['user_id', 'strain_id']).any():
        logger.info("Found duplicate user-strain interactions. Aggregating by mean rating.")
        df = df.groupby(['user_id', 'strain_id'], as_index=False)['rating'].mean()
        logger.info(f"Aggregated interaction data to shape {df.shape}")

    # Additional checks for missing or anomalous data
    if df['rating'].isnull().any():
        logger.warning("Found missing ratings. Filling with the mean rating.")
        mean_rating = df['rating'].mean()
        df['rating'].fillna(mean_rating, inplace=True)

    # Ensure ratings are within the expected range (e.g., 1-5)
    if df['rating'].min() < 1 or df['rating'].max() > 5:
        logger.warning("Ratings found outside the range of 1-5. Clipping to the nearest valid value.")
        df['rating'] = df['rating'].clip(lower=1, upper=5)

    return df

# -----------------------------
# Step 5: Train ALS Model
# -----------------------------
def train_als_model(interaction_df: pd.DataFrame, latent_factors: int = 10, epochs: int = 20, lr: float = 0.001,
                   batch_size: int = 1024, patience: int = 10):
    """
    Train the ALS model on user-item interaction data.
    """
    logger.info("Preparing data for ALS model training...")

    # Convert user_id and strain_id to categorical codes with mapping
    interaction_df['user_id'] = interaction_df['user_id'].astype('category')
    interaction_df['strain_id'] = interaction_df['strain_id'].astype('category')

    # Create mappings
    user_categories = interaction_df['user_id'].cat.categories
    strain_categories = interaction_df['strain_id'].cat.categories

    user_mapping = {category: code for code, category in enumerate(user_categories)}
    strain_mapping = {category: code for code, category in enumerate(strain_categories)}

    # Encode IDs
    interaction_df['user_id_code'] = interaction_df['user_id'].cat.codes
    interaction_df['strain_id_code'] = interaction_df['strain_id'].cat.codes

    num_users = len(user_categories)
    num_strains = len(strain_categories)

    logger.info(f"Number of unique users: {num_users}, Number of unique strains: {num_strains}")

    # Save mappings for future use
    os.makedirs('.venv/mappings', exist_ok=True)  # Create a directory for mappings
    with open('.venv/mappings/user_id_mapping.pkl', 'wb') as f:
        pickle.dump(user_mapping, f)
    logger.info("Saved user_id mapping to '.venv/mappings/user_id_mapping.pkl'")

    with open('.venv/mappings/strain_id_mapping.pkl', 'wb') as f:
        pickle.dump(strain_mapping, f)
    logger.info("Saved strain_id mapping to '.venv/mappings/strain_id_mapping.pkl'")

    # Split data into training, validation, and testing sets (70-15-15 split)
    train_df, temp_df = train_test_split(interaction_df, test_size=0.3, random_state=42)
    val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)
    logger.info(f"Training data size: {train_df.shape}, Validation data size: {val_df.shape}, Test data size: {test_df.shape}")

    # Convert data to PyTorch tensors (on CPU)
    train_users = torch.tensor(train_df['user_id_code'].values, dtype=torch.long)
    train_strains = torch.tensor(train_df['strain_id_code'].values, dtype=torch.long)
    train_ratings = torch.tensor(train_df['rating'].values, dtype=torch.float32)

    val_users = torch.tensor(val_df['user_id_code'].values, dtype=torch.long)
    val_strains = torch.tensor(val_df['strain_id_code'].values, dtype=torch.long)
    val_ratings = torch.tensor(val_df['rating'].values, dtype=torch.float32)

    test_users = torch.tensor(test_df['user_id_code'].values, dtype=torch.long)
    test_strains = torch.tensor(test_df['strain_id_code'].values, dtype=torch.long)
    test_ratings = torch.tensor(test_df['rating'].values, dtype=torch.float32)

    # Create DataLoader with batching
    train_dataset = TensorDataset(train_users, train_strains, train_ratings)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True, num_workers=4)
    logger.info("Created DataLoader for training.")

    # Initialize ALS model
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    logger.info(f"Using device: {device}")

    model = ALSModel(num_users=num_users, num_strains=num_strains, latent_factors=latent_factors).to(device)
    # Use SparseAdam optimizer for sparse embeddings without weight_decay
    optimizer = optim.SparseAdam(model.parameters(), lr=lr)  # Removed weight_decay
    criterion = nn.MSELoss()

    # Learning rate scheduler
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5, verbose=True)

    best_rmse = float('inf')
    best_model_state = copy.deepcopy(model.state_dict())
    counter = 0  # Early stopping counter

    for epoch in range(epochs):
        epoch_loss = 0.0
        model.train()  # Ensure model is in training mode

        for batch_users, batch_strains, batch_ratings in train_loader:
            # Move data to device
            batch_users = batch_users.to(device)
            batch_strains = batch_strains.to(device)
            batch_ratings = batch_ratings.to(device)

            optimizer.zero_grad()
            predictions = model(batch_users, batch_strains)
            loss = criterion(predictions, batch_ratings)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item() * batch_users.size(0)

        avg_loss = epoch_loss / len(train_dataset)
        logger.info(f"Epoch [{epoch + 1}/{epochs}], Loss: {avg_loss:.4f}")

        # Evaluate on validation set
        rmse_val = evaluate_als_model(model, val_users, val_strains, val_ratings, criterion, device)

        # Step the scheduler
        scheduler.step(rmse_val)

        # Early Stopping Check
        if rmse_val < best_rmse:
            best_rmse = rmse_val
            best_model_state = copy.deepcopy(model.state_dict())
            counter = 0
            logger.info(f"New best RMSE: {best_rmse:.4f} at epoch {epoch + 1}")
        else:
            counter += 1
            logger.info(f"No improvement in RMSE for {counter} epochs.")
            if counter >= patience:
                logger.info("Early stopping triggered.")
                break

    # Load best model state
    model.load_state_dict(best_model_state)
    logger.info(f"Best RMSE achieved: {best_rmse:.4f}")

    # Extract and save embeddings
    extract_and_save_embeddings(model)

    return model, test_df, test_users, test_strains, test_ratings

# -----------------------------
# Step 6: Evaluate ALS Model
# -----------------------------
def evaluate_als_model(model, users, strains, ratings, criterion, device):
    """
    Evaluate the ALS model on the validation set and compute RMSE.
    """
    model.eval()
    with torch.no_grad():
        users = users.to(device)
        strains = strains.to(device)
        ratings = ratings.to(device)
        predictions = model(users, strains)
        predictions = torch.clamp(predictions, min=1, max=5)  # Ensure predictions are in valid range
        mse = criterion(predictions, ratings).item()
        rmse = np.sqrt(mse)
        logger.info(f"Validation RMSE: {rmse:.4f}")
        return rmse

# -----------------------------
# Step 7: Save ALS Model
# -----------------------------
def save_als_model(model, filepath: str):
    """
    Save the trained ALS model's state dictionary.
    """
    try:
        torch.save(model.state_dict(), filepath)
        logger.info(f"ALS model saved to '{filepath}'.")
    except Exception as e:
        logger.error(f"Failed to save ALS model to '{filepath}': {e}")

# -----------------------------
# Step 8: Save Test Data
# -----------------------------
def save_test_data(test_df: pd.DataFrame, filepath: str = '.venv/als_test_data.csv'):
    """
    Save the test data used for evaluation to a CSV file.
    """
    try:
        # Load mappings
        with open('.venv/mappings/user_id_mapping.pkl', 'rb') as f:
            user_mapping = pickle.load(f)
        with open('.venv/mappings/strain_id_mapping.pkl', 'rb') as f:
            strain_mapping = pickle.load(f)

        inverse_user_mapping = {v: k for k, v in user_mapping.items()}
        inverse_strain_mapping = {v: k for k, v in strain_mapping.items()}

        test_df_output = pd.DataFrame({
            'user_id': test_df['user_id_code'].map(inverse_user_mapping),
            'strain_id': test_df['strain_id_code'].map(inverse_strain_mapping),
            'rating': test_df['rating'].values
        })
        test_df_output.to_csv(filepath, index=False)
        logger.info(f"Saved ALS test data to '{filepath}'.")
    except Exception as e:
        logger.error(f"Failed to save test data to '{filepath}': {e}")

# -----------------------------
# Step 9: Extract and Save Embeddings
# -----------------------------
def extract_and_save_embeddings(model):
    """
    Extract user and strain embeddings from the trained model and save them as numpy arrays.
    """
    try:
        user_embeddings = model.user_factors.weight.detach().cpu().numpy()
        strain_embeddings = model.strain_factors.weight.detach().cpu().numpy()

        np.save('.venv/user_embeddings.npy', user_embeddings)
        np.save('.venv/strain_embeddings.npy', strain_embeddings)
        logger.info("User and strain embeddings saved to '.venv/user_embeddings.npy' and '.venv/strain_embeddings.npy'.")
    except Exception as e:
        logger.error(f"Failed to extract and save embeddings: {e}")

# -----------------------------
# Step 10: Generate CF Predictions (Optional)
# -----------------------------
def generate_cf_predictions(model, num_users, num_strains):
    """
    Generate CF predictions by computing the dot product of user and strain embeddings.
    """
    model.eval()
    with torch.no_grad():
        user_factors = model.user_factors.weight.detach().cpu()
        strain_factors = model.strain_factors.weight.detach().cpu()
        cf_predictions = torch.matmul(user_factors, strain_factors.t()).numpy()
    return cf_predictions

# -----------------------------
# Step 11: Main Execution for ALS
# -----------------------------
def main_als():
    """
    Main function to execute the ALS training pipeline.
    """
    try:
        # Step 1: Load interaction data
        interaction_df = load_interaction_data(CONFIG_ALS['interaction_filepath'])

        # Step 2: Train ALS model
        model_cf, test_df, test_users, test_strains, test_ratings = train_als_model(
            interaction_df,
            latent_factors=CONFIG_ALS['training']['latent_factors'],
            epochs=CONFIG_ALS['training']['epochs'],
            lr=CONFIG_ALS['training']['learning_rate'],
            batch_size=CONFIG_ALS['training']['batch_size'],
            patience=CONFIG_ALS['training']['patience']
        )

        # Step 3: Save ALS model
        save_als_model(model_cf, CONFIG_ALS['als_model_filepath'])

        # Step 4: Save ALS test data
        save_test_data(test_df, CONFIG_ALS['als_test_data_filepath'])

        # Optional Step 5: Generate CF predictions
        # Uncomment if needed in the future
        # cf_predictions = generate_cf_predictions(model_cf, len(user_mapping), len(strain_mapping))
        # np.save('.venv/cf_predictions.npy', cf_predictions)
        # logger.info("CF predictions saved to '.venv/cf_predictions.npy'.")

    except Exception as e:
        logger.error(f"An unexpected error occurred during ALS training: {e}")

if __name__ == "__main__":
    main_als()


FileNotFoundError: [Errno 2] No such file or directory: '/mnt/c/Users/Ken/OneDrive/Desktop/Introduction to Statistical Learning/CapstoneV2/notebooks/.venv/collaborative_filtering.log'

In [44]:
# ------------------------------------------
# Deep Hybrid Recommender System
# Comprehensive Professional-Grade Implementation
# Refined for Enhanced Scalability and Performance with ROCm GPU Support
# ------------------------------------------

import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import os
import sys
import logging
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error, ndcg_score
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
import shap  # SHAP for explainability
import faiss  # Fast retrieval for real-time recommendations
from typing import Tuple, Dict, Any, List, Set

# -----------------------------
# Configuration Section
# -----------------------------
class Config:
    # File paths
    LOG_FILE = '.venv/deep_hybrid_recommender_overhauled.log'
    USER_EMB_PATH = '.venv/user_embeddings.npy'
    STRAIN_EMB_PATH = '.venv/strain_embeddings.npy'
    STRAIN_DATA_PATH = '.venv/cleaned_strain_data_final_with_embeddings.csv'
    INTERACTION_DATA_PATH = '.venv/synthetic_profiles_with_reviews.csv'
    USER_MAPPING_PATH = '.venv/mappings/user_id_mapping.pkl'
    STRAIN_MAPPING_PATH = '.venv/mappings/strain_id_mapping.pkl'
    BEST_MODEL_PATH = '.venv/best_hybrid_model.pth'
    SHAP_PLOT_PATH = '.venv/shap_summary_plot.png'
    LOSS_PLOT_PATH = '.venv/loss_plot.png'
    PRECISION_PLOT_PATH = '.venv/precision_plot.png'
    RECALL_PLOT_PATH = '.venv/recall_plot.png'
    PCA_USER_EMB_PATH = '.venv/pca_user_embeddings.pkl'
    PCA_STRAIN_EMB_PATH = '.venv/pca_strain_embeddings.pkl'
    PCA_CBF_EMB_PATH = '.venv/pca_cbf_embeddings.pkl'  # Updated path for CBF PCA

    # Training parameters
    BATCH_SIZE = 256  # Consider increasing to 512 based on GPU memory availability
    NUM_WORKERS = 8  # Increased to match CPU cores for better utilization
    EPOCHS = 12
    LEARNING_RATE = 0.0005
    PATIENCE = 4
    K = 10  # For Precision@K, Recall@K, NDCG@K

    # PCA parameters (Desired components)
    DESIRED_PCA_USER_COMPONENTS = 128
    DESIRED_PCA_STRAIN_COMPONENTS = 128
    DESIRED_PCA_CBF_COMPONENTS = 64

    # Random seed for reproducibility
    RANDOM_STATE = 42

    # Faiss parameters
    FAISS_INDEX = 'Flat'  # Options: 'Flat', 'IVFFlat', 'IVFPQ'

# -----------------------------
# Step 0: Configure Logging
# -----------------------------
def configure_logging(log_file_path: str) -> logging.Logger:
    """
    Sets up logging to track the system's behavior.
    Logs are written to both console and a log file with rotation to prevent unlimited growth.
    """
    logger = logging.getLogger('DeepHybridRecommender')
    logger.setLevel(logging.INFO)

    # Prevent adding multiple handlers if the logger already has them
    if not logger.handlers:
        # Console handler
        c_handler = logging.StreamHandler()
        c_handler.setLevel(logging.INFO)

        # File handler with rotation (max 5 files, each up to 5MB)
        f_handler = logging.handlers.RotatingFileHandler(log_file_path, maxBytes=5*1024*1024, backupCount=5)
        f_handler.setLevel(logging.INFO)

        # Formatter
        formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        c_handler.setFormatter(formatter)
        f_handler.setFormatter(formatter)

        # Add handlers
        logger.addHandler(c_handler)
        logger.addHandler(f_handler)

    logger.info("Logging is configured.")
    return logger

logger = configure_logging(Config.LOG_FILE)

# -----------------------------
# Step 1: Load ALS Embeddings
# -----------------------------
def load_embeddings(user_emb_path: str, strain_emb_path: str, logger: logging.Logger) -> Tuple[np.ndarray, np.ndarray]:
    """
    Load ALS embeddings from specified file paths.
    Handles missing files and NaN values.
    """
    logger.info("Loading ALS embeddings...")
    for path in [user_emb_path, strain_emb_path]:
        if not os.path.exists(path):
            logger.error(f"Embedding file not found at path: {path}")
            sys.exit(1)

    try:
        user_embeddings = np.load(user_emb_path, mmap_mode='r')
        strain_embeddings = np.load(strain_emb_path, mmap_mode='r')
    except Exception as e:
        logger.error(f"Error loading embeddings: {e}")
        sys.exit(1)

    # Log embeddings' shapes
    logger.info(f"User Embeddings Shape: {user_embeddings.shape}")
    logger.info(f"Strain Embeddings Shape: {strain_embeddings.shape}")

    # Handle NaNs
    if np.isnan(user_embeddings).any() or np.isnan(strain_embeddings).any():
        logger.warning("Found NaN values in ALS embeddings. Replacing NaNs with zeros.")
        user_embeddings = np.nan_to_num(user_embeddings)
        strain_embeddings = np.nan_to_num(strain_embeddings)

    logger.info(f"Loaded user embeddings shape after NaN handling: {user_embeddings.shape}")
    logger.info(f"Loaded strain embeddings shape after NaN handling: {strain_embeddings.shape}")
    return user_embeddings, strain_embeddings

user_embeddings, strain_embeddings = load_embeddings(
    Config.USER_EMB_PATH, 
    Config.STRAIN_EMB_PATH, 
    logger
)

# -----------------------------
# Step 2: Load Content-based Data
# -----------------------------
def load_strain_data(strain_data_path: str, logger: logging.Logger) -> Tuple[pd.DataFrame, np.ndarray]:
    """
    Load content-based strain data and extract embeddings.
    Handles missing files and NaN values.
    """
    logger.info("Loading content-based strain data...")
    if not os.path.exists(strain_data_path):
        logger.error(f"Strain data file not found at path: {strain_data_path}")
        sys.exit(1)

    try:
        strain_data = pd.read_csv(strain_data_path)
    except Exception as e:
        logger.error(f"Error loading strain data: {e}")
        sys.exit(1)

    # Handle NaNs in strain data
    if strain_data.isnull().values.any():
        logger.warning("Found NaN values in strain data. Replacing NaNs with zeros.")
        strain_data.fillna(0, inplace=True)

    # Extract embedding columns dynamically
    embedding_columns = [col for col in strain_data.columns if col.startswith('embedding_')]
    if not embedding_columns:
        logger.error("No embedding columns found in strain data.")
        sys.exit(1)

    strain_embeddings_cbf = strain_data[embedding_columns].values

    # Handle NaNs in embeddings
    if np.isnan(strain_embeddings_cbf).any():
        logger.warning("Found NaN values in content-based embeddings. Replacing NaNs with zeros.")
        strain_embeddings_cbf = np.nan_to_num(strain_embeddings_cbf)

    logger.info(f"Extracted content-based embeddings shape: {strain_embeddings_cbf.shape}")
    return strain_data, strain_embeddings_cbf

strain_data, strain_embeddings_cbf = load_strain_data(Config.STRAIN_DATA_PATH, logger)

# -----------------------------
# Step 3: Load Interaction Data (Ratings)
# -----------------------------
def load_interaction_data(interaction_data_path: str, logger: logging.Logger) -> pd.DataFrame:
    """
    Load user interaction data, such as ratings, and clean it.
    Handles missing files and NaN values.
    """
    logger.info("Loading interaction data...")
    if not os.path.exists(interaction_data_path):
        logger.error(f"Interaction data file not found at path: {interaction_data_path}")
        sys.exit(1)

    try:
        interaction_data = pd.read_csv(interaction_data_path, usecols=['user_id', 'strain_id', 'rating'])
    except Exception as e:
        logger.error(f"Error loading interaction data: {e}")
        sys.exit(1)

    initial_shape = interaction_data.shape
    interaction_data.dropna(inplace=True)  # Ensure no missing ratings
    if interaction_data.shape != initial_shape:
        logger.warning(f"Dropped {initial_shape[0] - interaction_data.shape[0]} rows due to NaNs in interaction data.")

    # Filter out invalid ratings if necessary
    valid_rating_mask = (interaction_data['rating'] >= 1) & (interaction_data['rating'] <= 5)
    if not valid_rating_mask.all():
        num_invalid = (~valid_rating_mask).sum()
        logger.warning(f"Found {num_invalid} invalid ratings. Dropping these rows.")
        interaction_data = interaction_data[valid_rating_mask]

    logger.info(f"Loaded interaction data shape: {interaction_data.shape}")
    return interaction_data

interaction_data = load_interaction_data(Config.INTERACTION_DATA_PATH, logger)

# -----------------------------
# Step 4: Load and Verify Mappings
# -----------------------------
def load_mappings(user_mapping_path: str, strain_mapping_path: str, logger: logging.Logger) -> Tuple[Dict[Any, int], Dict[Any, int]]:
    """
    Load user and strain mappings from pickle files.
    Handles missing files and loading errors.
    """
    logger.info("Loading user and strain mappings...")
    for path in [user_mapping_path, strain_mapping_path]:
        if not os.path.exists(path):
            logger.error(f"Mapping file not found at path: {path}")
            sys.exit(1)

    try:
        with open(user_mapping_path, 'rb') as f:
            user_mapping = pickle.load(f)
        with open(strain_mapping_path, 'rb') as f:
            strain_mapping = pickle.load(f)
    except Exception as e:
        logger.error(f"Error loading mappings: {e}")
        sys.exit(1)

    logger.info(f"Loaded user_mapping with {len(user_mapping)} entries.")
    logger.info(f"Loaded strain_mapping with {len(strain_mapping)} entries.")
    return user_mapping, strain_mapping

user_mapping, strain_mapping = load_mappings(
    Config.USER_MAPPING_PATH, 
    Config.STRAIN_MAPPING_PATH, 
    logger
)

# -----------------------------
# Step 5: Map IDs and Ensure Data Integrity
# -----------------------------
def map_ids_and_verify(interaction_df: pd.DataFrame, 
                       user_mapping: Dict[Any, int], 
                       strain_mapping: Dict[Any, int], 
                       logger: logging.Logger) -> pd.DataFrame:
    """
    Map user and strain IDs to numerical codes and ensure data integrity.
    Drops rows with missing mappings.
    """
    logger.info("Mapping user_id and strain_id to codes...")
    interaction_df['user_id_code'] = interaction_df['user_id'].map(user_mapping)
    interaction_df['strain_id_code'] = interaction_df['strain_id'].map(strain_mapping)

    # Drop rows with missing mappings
    num_missing_users = interaction_df['user_id_code'].isnull().sum()
    num_missing_strains = interaction_df['strain_id_code'].isnull().sum()
    if num_missing_users > 0 or num_missing_strains > 0:
        logger.warning(f"Missing mappings: {num_missing_users} users, {num_missing_strains} strains")
        interaction_df.dropna(subset=['user_id_code', 'strain_id_code'], inplace=True)
        logger.info(f"Dropped rows with missing mappings. New shape: {interaction_df.shape}")

    # Ensure no NaNs after mapping
    if interaction_df[['user_id_code', 'strain_id_code']].isnull().any().any():
        logger.error("Found NaN values after mapping IDs. Please check your data.")
        sys.exit(1)

    # Convert to integer type
    interaction_df['user_id_code'] = interaction_df['user_id_code'].astype(int)
    interaction_df['strain_id_code'] = interaction_df['strain_id_code'].astype(int)
    logger.info("Completed mapping of IDs.")
    return interaction_df

interaction_data = map_ids_and_verify(
    interaction_data, 
    user_mapping, 
    strain_mapping, 
    logger
)

# -----------------------------
# Step 6: Split Data into Train and Validation Sets
# -----------------------------
def split_data(interaction_df: pd.DataFrame, 
              test_size: float = 0.2, 
              random_state: int = 42, 
              logger: logging.Logger = None) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Split interaction data into training and validation sets.
    """
    if logger:
        logger.info("Splitting data into training and validation sets...")
    try:
        train_df, val_df = train_test_split(
            interaction_df, 
            test_size=test_size, 
            random_state=random_state
        )
    except Exception as e:
        if logger:
            logger.error(f"Error during train-test split: {e}")
        sys.exit(1)

    if logger:
        logger.info(f"Training data size: {train_df.shape[0]}, Validation data size: {val_df.shape[0]}")

    return train_df.reset_index(drop=True), val_df.reset_index(drop=True)

train_df, val_df = split_data(
    interaction_data, 
    test_size=0.2, 
    random_state=Config.RANDOM_STATE, 
    logger=logger
)

# -----------------------------
# Step 7: Apply PCA for Dimensionality Reduction
# -----------------------------
def apply_pca(embeddings: np.ndarray, desired_components: int, logger: logging.Logger, save_path: str) -> Tuple[np.ndarray, PCA]:
    """
    Applies PCA to reduce the dimensionality of embeddings.
    Dynamically sets n_components to min(desired_components, min(n_samples, n_features)).
    """
    n_samples, n_features = embeddings.shape
    actual_components = min(desired_components, min(n_samples, n_features))
    logger.info(f"Applying PCA: desired n_components={desired_components}, actual n_components={actual_components}")
    pca = PCA(n_components=actual_components, random_state=Config.RANDOM_STATE)
    reduced_embeddings = pca.fit_transform(embeddings)
    explained_variance = np.sum(pca.explained_variance_ratio_)
    logger.info(f"PCA completed. Reduced embeddings shape: {reduced_embeddings.shape}, Explained Variance: {explained_variance:.4f}")

    # Save PCA object for future transformations
    with open(save_path, 'wb') as f:
        pickle.dump(pca, f)
    logger.info(f"PCA model saved at {save_path}.")

    return reduced_embeddings, pca

def load_pca(pca_path: str, logger: logging.Logger) -> PCA:
    """
    Loads a saved PCA model.
    """
    if not os.path.exists(pca_path):
        logger.error(f"PCA file not found at path: {pca_path}")
        sys.exit(1)
    with open(pca_path, 'rb') as f:
        pca = pickle.load(f)
    logger.info(f"PCA model loaded from {pca_path}.")
    return pca

def apply_pca_cbf(embeddings_cbf: np.ndarray, desired_components: int, logger: logging.Logger, save_path: str) -> np.ndarray:
    """
    Applies PCA to reduce the dimensionality of content-based embeddings.
    """
    n_samples, n_features = embeddings_cbf.shape
    actual_components = min(desired_components, min(n_samples, n_features))
    logger.info(f"Applying PCA to content-based embeddings: desired n_components={desired_components}, actual n_components={actual_components}")
    pca_cbf = PCA(n_components=actual_components, random_state=Config.RANDOM_STATE)
    strain_embeddings_cbf_reduced = pca_cbf.fit_transform(embeddings_cbf)
    explained_variance = np.sum(pca_cbf.explained_variance_ratio_)
    logger.info(f"PCA applied to content-based embeddings. Reduced shape: {strain_embeddings_cbf_reduced.shape}, Explained Variance: {explained_variance:.4f}")

    # Save PCA for CBF embeddings
    with open(save_path, 'wb') as f:
        pickle.dump(pca_cbf, f)
    logger.info(f"PCA model for CBF embeddings saved at {save_path}.")

    return strain_embeddings_cbf_reduced

# Apply PCA to ALS embeddings
user_embeddings_reduced, pca_user = apply_pca(
    user_embeddings, 
    Config.DESIRED_PCA_USER_COMPONENTS, 
    logger, 
    Config.PCA_USER_EMB_PATH
)

strain_embeddings_reduced, pca_strain = apply_pca(
    strain_embeddings, 
    Config.DESIRED_PCA_STRAIN_COMPONENTS, 
    logger, 
    Config.PCA_STRAIN_EMB_PATH
)

# Apply PCA to content-based embeddings
strain_embeddings_cbf_reduced = apply_pca_cbf(
    strain_embeddings_cbf, 
    Config.DESIRED_PCA_CBF_COMPONENTS, 
    logger, 
    Config.PCA_CBF_EMB_PATH  # Updated to use the correct path
)

# -----------------------------
# Step 8: Create Datasets
# -----------------------------
class HybridFeatureDataset(Dataset):
    """
    Custom PyTorch Dataset for hybrid feature integration.
    Concatenates user ALS embeddings, strain ALS embeddings, and strain CBF embeddings.
    """
    def __init__(self, 
                 interaction_df: pd.DataFrame, 
                 user_embeddings: np.ndarray, 
                 strain_embeddings: np.ndarray, 
                 strain_embeddings_cbf: np.ndarray, 
                 logger: logging.Logger):
        self.user_ids = interaction_df['user_id_code'].values
        self.strain_ids = interaction_df['strain_id_code'].values
        self.ratings = interaction_df['rating'].values.astype(np.float32)
        self.user_embeddings = user_embeddings
        self.strain_embeddings = strain_embeddings
        self.strain_embeddings_cbf = strain_embeddings_cbf
        self.logger = logger

    def __len__(self) -> int:
        return len(self.user_ids)

    def __getitem__(self, idx: int) -> Tuple[np.ndarray, float, int, int]:
        user_id = self.user_ids[idx]
        strain_id = self.strain_ids[idx]
        
        try:
            user_emb = self.user_embeddings[user_id]
            strain_emb = self.strain_embeddings[strain_id]
            strain_emb_cbf = self.strain_embeddings_cbf[strain_id]
        except IndexError as e:
            self.logger.error(f"IndexError at idx {idx}: {e}")
            raise

        # Handle NaNs
        if np.isnan(user_emb).any() or np.isnan(strain_emb).any() or np.isnan(strain_emb_cbf).any():
            self.logger.warning(f"NaN found in features at index {idx}. Replacing NaNs with zeros.")
            user_emb = np.nan_to_num(user_emb)
            strain_emb = np.nan_to_num(strain_emb)
            strain_emb_cbf = np.nan_to_num(strain_emb_cbf)
        
        # Concatenate features
        hybrid_feature = np.concatenate([user_emb, strain_emb, strain_emb_cbf]).astype(np.float32)
        rating = self.ratings[idx]
        
        return hybrid_feature, rating, user_id, strain_id

def create_datasets(train_df: pd.DataFrame, 
                    val_df: pd.DataFrame, 
                    user_embeddings: np.ndarray, 
                    strain_embeddings: np.ndarray, 
                    strain_embeddings_cbf: np.ndarray, 
                    logger: logging.Logger) -> Tuple[Dataset, Dataset]:
    """
    Create training and validation datasets.
    """
    logger.info("Creating training and validation datasets...")
    try:
        train_dataset = HybridFeatureDataset(
            interaction_df=train_df, 
            user_embeddings=user_embeddings, 
            strain_embeddings=strain_embeddings, 
            strain_embeddings_cbf=strain_embeddings_cbf, 
            logger=logger
        )
        val_dataset = HybridFeatureDataset(
            interaction_df=val_df, 
            user_embeddings=user_embeddings, 
            strain_embeddings=strain_embeddings, 
            strain_embeddings_cbf=strain_embeddings_cbf, 
            logger=logger
        )
    except Exception as e:
        logger.error(f"Error creating datasets: {e}")
        sys.exit(1)

    logger.info("Datasets created successfully.")
    return train_dataset, val_dataset

train_dataset, val_dataset = create_datasets(
    train_df, 
    val_df, 
    user_embeddings_reduced, 
    strain_embeddings_reduced, 
    strain_embeddings_cbf_reduced, 
    logger
)

# -----------------------------
# Step 9: Create DataLoaders
# -----------------------------
def create_dataloaders(train_dataset: Dataset, 
                       val_dataset: Dataset, 
                       batch_size: int, 
                       num_workers: int, 
                       logger: logging.Logger) -> Tuple[DataLoader, DataLoader]:
    """
    Create DataLoaders for training and validation datasets.
    """
    logger.info("Creating DataLoaders...")
    try:
        train_loader = DataLoader(
            train_dataset, 
            batch_size=batch_size, 
            shuffle=True, 
            num_workers=num_workers,
            pin_memory=True  # Enabled pin_memory for GPU
        )
        val_loader = DataLoader(
            val_dataset, 
            batch_size=batch_size, 
            shuffle=False, 
            num_workers=num_workers,
            pin_memory=True  # Enabled pin_memory for GPU
        )
    except Exception as e:
        logger.error(f"Error creating DataLoaders: {e}")
        sys.exit(1)

    logger.info("DataLoaders created successfully.")
    return train_loader, val_loader

train_loader, val_loader = create_dataloaders(
    train_dataset, 
    val_dataset, 
    batch_size=Config.BATCH_SIZE, 
    num_workers=Config.NUM_WORKERS, 
    logger=logger
)

# -----------------------------
# Step 10: Define Deep Hybrid Model
# -----------------------------
class DeepHybridRecommender(nn.Module):
    """
    Neural network architecture for hybrid recommendations.
    """
    def __init__(self, input_size: int):
        super(DeepHybridRecommender, self).__init__()
        self.network = nn.Sequential(
            nn.Linear(input_size, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.4),
            
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.3),
            
            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            
            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            
            nn.Linear(64, 1)
        )
    
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.network(x)

# -----------------------------
# Step 11: Enhanced Training and Evaluation
# -----------------------------
def precision_recall_at_k_per_user(y_true: Dict[int, Set[int]], 
                                   y_pred: Dict[int, List[int]], 
                                   k: int) -> Tuple[float, float]:
    """
    Calculate Precision@k and Recall@k per user and return the average.
    """
    precisions = []
    recalls = []
    
    for user, pred_items in y_pred.items():
        true_items = y_true.get(user, set())
        pred_items = pred_items[:k]
        
        if not true_items:
            continue  # Skip users with no relevant items
        
        relevant_pred = len(set(pred_items) & true_items)
        precision = relevant_pred / k
        recall = relevant_pred / len(true_items)
        
        precisions.append(precision)
        recalls.append(recall)
    
    avg_precision = np.mean(precisions) if precisions else 0
    avg_recall = np.mean(recalls) if recalls else 0
    
    return avg_precision, avg_recall

def evaluate_model(model: nn.Module, 
                   loader: DataLoader, 
                   device: torch.device, 
                   k: int = 10) -> Tuple[float, float, float, float, float, float]:
    """
    Evaluate the model on the given data loader and return evaluation metrics.
    Computes RMSE, MAE, NDCG@k, Precision@k, and Recall@k.
    """
    model.eval()
    y_true, y_pred = [], []
    user_true, user_pred = {}, {}
    
    with torch.no_grad():
        for features, ratings, user_ids, strain_ids in tqdm(loader, desc="Evaluating"):
            features = features.to(device)
            predictions = model(features).cpu().numpy().flatten()
            ratings = ratings.cpu().numpy().flatten()
            user_ids = user_ids.numpy().flatten()
            strain_ids = strain_ids.numpy().flatten()
            
            y_true.extend(ratings)
            y_pred.extend(predictions)
            
            for user, strain, rating, pred in zip(user_ids, strain_ids, ratings, predictions):
                if user not in user_true:
                    user_true[user] = set()
                if rating > 3:  # Define relevant items (threshold can be adjusted)
                    user_true[user].add(strain)
                
                if user not in user_pred:
                    user_pred[user] = []
                user_pred[user].append((strain, pred))
    
    # Compute RMSE and MAE on continuous predictions
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    
    # Compute NDCG@k
    # For NDCG, we need relevance scores. Assuming binary relevance here.
    # This might not be the most accurate, but aligns with the thresholding.
    if len(y_true) > 0:
        # Reshape y_true and y_pred for ndcg_score
        # Note: ndcg_score expects 2D arrays
        y_true_matrix = np.array([[1 if rating > 3 else 0 for rating in y_true]])
        y_pred_matrix = np.array([y_pred])
        ndcg = ndcg_score(y_true_matrix, y_pred_matrix, k=k)
    else:
        ndcg = 0
    
    # For ranking metrics
    # For each user, sort the predictions and get top-k
    for user in user_pred:
        sorted_strains = sorted(user_pred[user], key=lambda x: x[1], reverse=True)
        # Ensure exactly k items by padding with dummy items
        if len(sorted_strains) < k:
            sorted_strains += [(0, 0.0)] * (k - len(sorted_strains))
        user_pred[user] = [item for item, score in sorted_strains[:k]]
    
    # Calculate per-user precision and recall
    precision, recall = precision_recall_at_k_per_user(user_true, user_pred, k)
    
    return mse, rmse, mae, ndcg, precision, recall

def train_model(train_loader: DataLoader, 
                val_loader: DataLoader, 
                input_size: int, 
                epochs: int = 12, 
                lr: float = 0.0005, 
                k: int = 10, 
                patience: int = 4) -> nn.Module:
    """
    Train the hybrid recommender model.
    Implements early stopping and learning rate scheduling.
    """
    # Detect ROCm-supported GPU
    if torch.cuda.is_available():
        device = torch.device('cuda')  # ROCm uses 'cuda' as device identifier
        logger.info(f"Using device: {device}")
    else:
        device = torch.device('cpu')
        logger.info(f"Using device: {device}")
    
    model = DeepHybridRecommender(input_size=input_size).to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-5)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2, verbose=True)

    best_val_loss = float('inf')
    best_model_path = Config.BEST_MODEL_PATH
    early_stop_counter = 0

    # Initialize histories
    train_losses = []
    val_losses = []
    precision_history = []
    recall_history = []

    for epoch in range(1, epochs + 1):
        model.train()
        running_loss = 0.0
        logger.info(f"Starting Epoch {epoch}/{epochs}")
        for batch_idx, (features, ratings, _, _) in enumerate(tqdm(train_loader, desc=f"Epoch {epoch}/{epochs}")):
            features = features.to(device)
            ratings = ratings.to(device).unsqueeze(1)
            
            optimizer.zero_grad()
            predictions = model(features)
            loss = criterion(predictions, ratings)
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item()
        
        avg_train_loss = running_loss / len(train_loader)
        train_losses.append(avg_train_loss)
        logger.info(f"Epoch {epoch} Training Loss: {avg_train_loss:.4f}")
        
        # Evaluation
        mse, rmse, mae, ndcg, precision, recall = evaluate_model(model, val_loader, device, k=k)
        val_losses.append(rmse)
        precision_history.append(precision)
        recall_history.append(recall)
        
        logger.info(f"Epoch {epoch} Validation RMSE: {rmse:.4f}, MAE: {mae:.4f}, NDCG@{k}: {ndcg:.4f}, "
                    f"Precision@{k}: {precision:.4f}, Recall@{k}: {recall:.4f}")
        
        # Scheduler step
        scheduler.step(rmse)
        
        # Check for improvement
        if rmse < best_val_loss:
            best_val_loss = rmse
            torch.save(model.state_dict(), best_model_path)
            logger.info(f"Epoch {epoch}: New best model saved.")
            early_stop_counter = 0
        else:
            early_stop_counter += 1
            logger.info(f"Epoch {epoch}: No improvement in validation RMSE.")
        
        # Early stopping
        if early_stop_counter >= patience:
            logger.info(f"Early stopping triggered after {epoch} epochs.")
            break

    # Plot training and validation loss
    plot_loss(train_losses, val_losses)
    # Plot precision and recall over epochs
    plot_metric_over_epochs(precision_history, "Precision@10")
    plot_metric_over_epochs(recall_history, "Recall@10")
    
    return model

def plot_loss(train_losses: List[float], val_losses: List[float]):
    """
    Plot training and validation loss over epochs.
    """
    plt.figure(figsize=(10, 6))
    plt.plot(train_losses, label='Train Loss')
    plt.plot(val_losses, label='Validation RMSE')
    plt.legend()
    plt.xlabel("Epochs")
    plt.ylabel("Loss / RMSE")
    plt.title("Training Loss and Validation RMSE")
    plt.grid(True)
    plt.savefig(Config.LOSS_PLOT_PATH)
    plt.close()
    logger.info(f"Loss plot saved as '{Config.LOSS_PLOT_PATH}'.")

def plot_metric_over_epochs(metrics: List[float], metric_name: str):
    """
    Plot a given metric over epochs.
    """
    plt.figure(figsize=(8, 5))
    plt.plot(metrics, label=metric_name)
    plt.xlabel("Epochs")
    plt.ylabel(metric_name)
    plt.title(f"{metric_name} Over Epochs")
    plt.grid(True)
    plt.legend()
    metric_plot_path = f'{metric_name.replace("@", "").replace(" ", "_").lower()}_plot.png'
    plt.savefig(os.path.join('.venv', metric_plot_path))
    plt.close()
    logger.info(f"{metric_name} plot saved as '{os.path.join('.venv', metric_plot_path)}'.")

# -----------------------------
# Step 12: Compute SHAP Values
# -----------------------------
def compute_shap_values(model: nn.Module, 
                        sample_features: torch.Tensor, 
                        logger: logging.Logger, 
                        config: Config):
    """
    Compute SHAP values for the model.
    """
    try:
        logger.info("Computing SHAP values...")
        model_cpu = DeepHybridRecommender(input_size=sample_features.shape[1]).to('cpu')
        model_cpu.load_state_dict(model.state_dict())
        model_cpu.eval()
        
        sample_features_cpu = sample_features.to('cpu')  # Ensure features are on CPU
        
        # Initialize SHAP DeepExplainer
        explainer = shap.DeepExplainer(model_cpu, sample_features_cpu)
        shap_values = explainer.shap_values(sample_features_cpu)
        
        # Generate SHAP summary plot
        shap.summary_plot(shap_values, sample_features_cpu.numpy(), show=False)
        plt.savefig(config.SHAP_PLOT_PATH)
        plt.close()
        logger.info(f"SHAP summary plot saved as '{config.SHAP_PLOT_PATH}'.")
    except Exception as e:
        logger.error(f"SHAP explainability failed: {e}")

# -----------------------------
# Step 13: Real-Time Recommendation with Faiss
# -----------------------------
def build_faiss_index(strain_embeddings: np.ndarray, 
                      config: Config, 
                      logger: logging.Logger) -> faiss.Index:
    """
    Build a Faiss index for strain embeddings using specified index type.
    """
    try:
        index_type = config.FAISS_INDEX
        d = strain_embeddings.shape[1]
        logger.info(f"Building Faiss index of type {index_type} with dimension {d}.")
        
        if index_type == 'Flat':
            index = faiss.IndexFlatL2(d)
        elif index_type == 'IVFFlat':
            nlist = 100  # Number of clusters, can be tuned
            quantizer = faiss.IndexFlatL2(d)
            index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2)
            # Train the index
            logger.info("Training Faiss index...")
            index.train(strain_embeddings.astype(np.float32))
        elif index_type == 'IVFPQ':
            nlist = 100
            m = 16  # Number of subquantizers
            nbits = 8  # Number of bits per subquantizer
            quantizer = faiss.IndexFlatL2(d)
            index = faiss.IndexIVFPQ(quantizer, d, nlist, m, nbits)
            # Train the index
            logger.info("Training Faiss index...")
            index.train(strain_embeddings.astype(np.float32))
        else:
            logger.error(f"Unsupported Faiss index type: {index_type}")
            sys.exit(1)
        
        # Add vectors to the index
        index.add(strain_embeddings.astype(np.float32))
        logger.info(f"Faiss index built with {index.ntotal} vectors.")
        return index
    except Exception as e:
        logger.error(f"Error building Faiss index: {e}")
        sys.exit(1)

def real_time_recommendation(query_embedding: np.ndarray, 
                             faiss_index: faiss.Index, 
                             strain_data: pd.DataFrame, 
                             top_k: int = 10, 
                             logger: logging.Logger = None) -> List[str]:
    """
    Retrieve top-k similar strains using Faiss and return strain names.
    """
    try:
        D, I = faiss_index.search(query_embedding.astype(np.float32), top_k)
        
        # Convert indices to strain names
        recommended_strains = []
        for idx in I[0]:
            if idx < len(strain_data):
                strain_name = strain_data.iloc[idx]['Strain_Name']
                recommended_strains.append(strain_name)
            else:
                recommended_strains.append("Unknown Strain")
        
        if logger:
            logger.info(f"Recommended strains: {recommended_strains}")
        
        return recommended_strains
    except Exception as e:
        if logger:
            logger.error(f"Error during real-time recommendation: {e}")
        return []

# -----------------------------
# Step 14: Diversity and Serendipity Evaluation
# -----------------------------
def calculate_diversity(recommendations: List[str], 
                        strain_embeddings: np.ndarray, 
                        strain_data: pd.DataFrame, 
                        logger: logging.Logger) -> float:
    """
    Measure diversity of recommendations using pairwise cosine similarity.
    Diversity is defined as 1 - average cosine similarity between recommended items.
    """
    if len(recommendations) < 2:
        logger.warning("Not enough recommendations to calculate diversity.")
        return 0.0  # No diversity if less than 2 items
    
    # Convert strain names back to indices
    recommended_indices = []
    for name in recommendations:
        idx = strain_data[strain_data['Strain_Name'] == name].index
        if not idx.empty:
            recommended_indices.append(idx[0])
        else:
            logger.warning(f"Strain name '{name}' not found in strain_data.")
    
    if len(recommended_indices) < 2:
        logger.warning("Not enough valid recommendations to calculate diversity.")
        return 0.0
    
    try:
        pairwise_sim = cosine_similarity(strain_embeddings[recommended_indices])
        triu_indices = np.triu_indices_from(pairwise_sim, k=1)
        avg_cosine_similarity = np.mean(pairwise_sim[triu_indices])
        avg_diversity = 1 - avg_cosine_similarity  # Ensure diversity does not exceed 1
        # Clamp the value between 0 and 1
        avg_diversity = np.clip(avg_diversity, 0, 1)
    except Exception as e:
        logger.error(f"Error calculating diversity: {e}")
        return 0.0
    
    return avg_diversity

def calculate_serendipity(recommendations: List[str], 
                          true_likes: Set[int], 
                          strain_data: pd.DataFrame, 
                          logger: logging.Logger) -> float:
    """
    Evaluate serendipity by checking how often unexpected items are relevant.
    Serendipity is defined as the proportion of recommendations that are unexpected.
    """
    # Define 'unexpected' as items not interacted with by the user
    # Assuming true_likes are the items the user has interacted with (rating > 3)
    # Here, 'unexpected but relevant' could mean items that the user hasn't interacted with but are actually relevant
    # However, in this context, relevance is defined as rating >3, which would only include true_likes
    # Thus, serendipity might require a different definition or additional data
    # For demonstration, we'll consider it as the proportion of recommendations that are unexpected
    
    # Get all recommended strain codes
    recommended_strain_codes = []
    for name in recommendations:
        # Assuming 'strain_id' exists in strain_data
        strain_ids = strain_data[strain_data['Strain_Name'] == name]['strain_id'].values
        if len(strain_ids) > 0:
            strain_code = strain_mapping.get(strain_ids[0], None)
            if strain_code is not None:
                recommended_strain_codes.append(strain_code)
            else:
                logger.warning(f"Strain code for strain_id {strain_ids[0]} not found in strain_mapping.")
        else:
            logger.warning(f"Strain name '{name}' not found in strain_data.")
    
    unexpected_recommended = set(recommended_strain_codes) - true_likes
    # Define serendipity as the proportion of recommendations that are unexpected
    serendipity = len(unexpected_recommended) / len(recommendations) if recommendations else 0
    return serendipity

# -----------------------------
# Step 15: Main Execution
# -----------------------------
def main():
    """
    Main execution function to train and evaluate the model.
    Also handles visualization and recommendation evaluation.
    """
    # Ensure input size matches the concatenated embeddings
    input_size = user_embeddings_reduced.shape[1] + strain_embeddings_reduced.shape[1] + strain_embeddings_cbf_reduced.shape[1]
    logger.info(f"Input size for the model: {input_size}")

    # Train the model
    model = train_model(train_loader, val_loader, input_size)

    # Load the best model
    if os.path.exists(Config.BEST_MODEL_PATH):
        if torch.cuda.is_available():
            device = torch.device('cuda')
        else:
            device = torch.device('cpu')
        model.load_state_dict(torch.load(Config.BEST_MODEL_PATH, map_location=device))
        logger.info("Loaded the best model from disk.")
    else:
        logger.warning("Best model path not found. Using the current model.")

    # SHAP for Explainability
    try:
        logger.info("Computing SHAP values for model explainability...")
        model.eval()
        # Select a subset of the training data for SHAP
        sample_features, _, _, _ = next(iter(train_loader))
        sample_features = sample_features[:10].to('cpu')  # Ensure on CPU for SHAP compatibility
        compute_shap_values(model, sample_features, logger, Config)
    except Exception as e:
        logger.error(f"SHAP explainability failed: {e}")

    # Build Faiss Index
    try:
        faiss_index = build_faiss_index(
            strain_embeddings_reduced, 
            Config, 
            logger
        )
    except Exception as e:
        logger.error(f"Faiss index building failed: {e}")
        faiss_index = None

    # Real-Time Recommendation
    try:
        if faiss_index is not None:
            logger.info("Performing real-time recommendation using Faiss...")
            # Select a random user from the validation set
            random_user = val_df.sample(1)['user_id_code'].values[0]
            logger.info(f"Selected random user_id_code: {random_user}")

            # Get the user's interactions
            user_interactions = train_df[train_df['user_id_code'] == random_user]['strain_id_code'].values
            if len(user_interactions) == 0:
                logger.warning(f"No interactions found for user {random_user}. Using the user's average embedding.")
                if random_user < len(user_embeddings_reduced):
                    query_embedding = user_embeddings_reduced[random_user].reshape(1, -1).astype(np.float32)
                else:
                    logger.error(f"User code {random_user} is out of bounds for user_embeddings_reduced.")
                    query_embedding = np.zeros((1, user_embeddings_reduced.shape[1]), dtype=np.float32)
            else:
                # Compute the average embedding for the user based on interacted strains
                user_emb_als = strain_embeddings_reduced[user_interactions].mean(axis=0)  # Using strain embeddings for consistency
                user_emb = user_emb_als.reshape(1, -1).astype(np.float32)
                query_embedding = user_emb  # 128-dimensional to match Faiss index

            recommended_strains = real_time_recommendation(
                query_embedding=query_embedding, 
                faiss_index=faiss_index, 
                strain_data=strain_data, 
                top_k=Config.K, 
                logger=logger
            )

            # Log the recommended strain names
            logger.info(f"Recommended strains for user {random_user}: {recommended_strains}")
        else:
            logger.error("Faiss index is not available. Skipping real-time recommendation.")
    except Exception as e:
        logger.error(f"Real-time recommendation failed: {e}")

    # Evaluate Diversity and Serendipity
    try:
        if faiss_index is not None and len(recommended_strains) > 0:
            logger.info("Evaluating diversity and serendipity of recommendations...")
            diversity_score = calculate_diversity(
                recommendations=recommended_strains, 
                strain_embeddings=strain_embeddings_reduced, 
                strain_data=strain_data, 
                logger=logger
            )
            # Get true likes for the user
            true_likes = set(train_df[train_df['user_id_code'] == random_user]['strain_id_code'].unique())
            serendipity_score = calculate_serendipity(
                recommendations=recommended_strains, 
                true_likes=true_likes, 
                strain_data=strain_data, 
                logger=logger
            )
            logger.info(f"Diversity Score: {diversity_score:.4f}")
            logger.info(f"Serendipity Score: {serendipity_score:.4f}")
        else:
            logger.warning("Cannot evaluate diversity and serendipity without recommendations.")
    except Exception as e:
        logger.error(f"Diversity and serendipity evaluation failed: {e}")

    logger.info("Training and evaluation completed successfully.")

if __name__ == "__main__":
    main()
