### 1. Classification

Common scenarios include:
* Fraud detection (fraud vs. not fraud)
* Spam detection (spam vs. not spam)
* Churn prediction (will churn vs. won't churn)
* Product category classification (which department it belongs to)
* Sentiment analysis (positive vs. negative, or multiple sentiment labels)

#### 1.1 Logistic Regression (Binary Classification)

* Data drift: fraudsters evolve their strategies, and user preferences or competitor landscapes may shift. Retrain models regularly and monitor key metrics (e.g. a drop in recall)
* Class imbalance: fraud datasets are imbalanced (fraud is rare). Use oversampling (SMOTE) or `class_weight="balanced"`
* Prioritize recall to catch as much fraud as possible

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Expected columns: ['amount', 'merchant_category', 'time_of_day', 'label'],
# where label is 1 for fraud and 0 otherwise.

df = pd.read_csv("dataset.csv")
y = df['label']
X = df.drop(columns='label')

# Turn the two categorical columns into indicator columns; drop_first=True
# omits the first level of each so the remaining dummies are not redundant.
X = pd.get_dummies(
    X,
    columns=['merchant_category', 'time_of_day'],
    drop_first=True,
)

# Hold out 20% of the rows, keeping the label proportions equal in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=777,
)

# class_weight='balanced' re-weights samples inversely to class frequency.
model = LogisticRegression(max_iter=1000, class_weight='balanced')
model.fit(X_train, y_train)

# Score the held-out split.
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(
    f"Accuracy:  {acc:.4f}",
    f"Precision: {prec:.4f}",
    f"Recall:    {rec:.4f}",
    f"F1-score:  {f1:.4f}",
    sep="\n",
)

#### 1.2 XGBoost (Classification)

* XGBoost can handle missing values internally
* If real-time predictions are needed, inference service must be low-latency

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# Sample data: `churn_label` is the binary target, every other column is a feature.
df = pd.read_csv("data.csv")
X = df.drop("churn_label", axis=1)
y = df["churn_label"]

# Stratify on the label so the train and test splits keep the same class
# proportions (the same approach as the split in section 1.1).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=777
)

clf = xgb.XGBClassifier(
    n_estimators=200,
    max_depth=5,
    learning_rate=0.1,
    scale_pos_weight=1,  # 1 = no extra weight on the positive class
    use_label_encoder=False,
    eval_metric='logloss'
)
clf.fit(X_train, y_train)

# ROC AUC is computed from the predicted probability of the positive class
# (column 1 of predict_proba), not from hard 0/1 predictions.
y_pred_proba = clf.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC AUC: {auc:.4f}")

#### 1.3 Neural Network (Classification)
* Needs larger dataset (risk overfitting with small datasets)
* Prevent overfitting with dropout, batch normalization, early stopping

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Sample data: pull the target and the feature matrix out as arrays.
df = pd.read_csv("data.csv")
y = df["label"].values
X = df.drop("label", axis=1).values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=777, test_size=0.2
)

# Learn the scaling statistics from the training rows only, then apply the
# same transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Features become float32 tensors; labels become int64 (long) tensors.
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)

y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

class SimpleMLP(nn.Module):
    """One-hidden-layer perceptron: Linear -> ReLU -> Linear.

    Args:
        input_dim: Number of input features per sample.
        hidden_dim: Width of the hidden layer.
        output_dim: Number of output units (one score per class).
    """

    def __init__(self, input_dim, hidden_dim=64, output_dim=2):
        super().__init__()
        # Layers are created in forward order and stored under `network`.
        layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
        ]
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Return the raw output scores for the batch `x`."""
        scores = self.network(x)
        return scores

# Build the classifier; its input width is the number of feature columns.
model = SimpleMLP(input_dim=X_train.shape[1])
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Full-batch training: every epoch is a single gradient step on the whole
# training tensor.
epochs = 10
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()

    # Train accuracy uses the outputs computed before this epoch's weight update.
    _, predicted = torch.max(outputs, 1)
    accuracy = (predicted == y_train_tensor).float().mean().item()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}, Train Acc: {accuracy:.4f}")

# Test
model.eval()
# No gradients are needed for evaluation; skipping autograd avoids building a
# graph for the test batch (the regression example evaluates the same way).
with torch.no_grad():
    test_outputs = model(X_test_tensor)
_, y_pred = torch.max(test_outputs, 1)
test_accuracy = (y_pred == y_test_tensor).float().mean().item()
print(f"Test Accuracy: {test_accuracy:.4f}")

### 2. Regression

Common scenarios to predict a continuous numeric value include:
* Sales forecasting (predict next month's revenue)
* Price predictions (product pricing)
* Customer lifetime value (historic + predictive)

#### 2.1 Linear Regression

* Non-linearity: linear regression assumes a linear relationship. 
* Highly correlated features can skew coefficients (remove them)
* If data has a time component, time series methods may be better
* Relatively interpretable coefficients

In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Sample data: `price` is the regression target, the rest are features.
df = pd.read_csv("data.csv")
y = df["price"]
X = df.drop(columns="price")

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)
# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X_train, y_train)

# Score the held-out 20%.
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print(f"MSE: {mse:.2f}, R^2: {r2:.2f}")

#### 2.2 XGBoost (Regression)

* Has built-in feature importance
* More robust, and can handle non-linear relationships
* Need to tune hyperparameters

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# `sales` is the regression target; all remaining columns are features.
df = pd.read_csv("data.csv")
y = df["sales"]
X = df.drop(columns="sales")

# No test_size is given, so scikit-learn's default hold-out fraction applies.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
reg = xgb.XGBRegressor(
    learning_rate=0.1,
    max_depth=5,
    n_estimators=200,
)
reg.fit(X_train, y_train)

# Mean absolute error on the held-out rows.
y_pred = reg.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE: {mae:.2f}")

#### 2.3 Neural Network (Regression)

* Preprocess data by scaling/normalizing
* Prevent overfitting with early stopping, dropout, or regularization (especially for small datasets)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# This cell's import list does not include train_test_split; import it here so
# the cell does not depend on an earlier cell having been run.
from sklearn.model_selection import train_test_split

# `target` is the regression target; all remaining columns are features.
df = pd.read_csv("regression_data.csv")
X = df.drop("target", axis=1).values
y = df["target"].values

# Fixed seed so the same rows land in train/test on every run, in line with
# the other regression examples.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only and reuse it for the test rows.
scaler = StandardScaler()
X_train_sc = scaler.fit_transform(X_train)
X_test_sc = scaler.transform(X_test)

# Targets are reshaped to a column (n, 1) to match the network's single output unit.
X_train_t = torch.tensor(X_train_sc, dtype=torch.float32)
y_train_t = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)

X_test_t = torch.tensor(X_test_sc, dtype=torch.float32)
y_test_t = torch.tensor(y_test, dtype=torch.float32).view(-1, 1)

class RegressionNN(nn.Module):
    """One-hidden-layer regressor: Linear -> ReLU -> Linear with one output.

    Args:
        input_dim: Number of input features per sample.
        hidden_dim: Width of the hidden layer.
    """

    def __init__(self, input_dim, hidden_dim=64):
        super().__init__()
        # Layers are created in forward order and stored under `net`.
        stack = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
        ]
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Return one predicted value per row of `x`, shaped (batch, 1)."""
        estimate = self.net(x)
        return estimate

# Network, loss and optimiser; the input width is the number of feature columns.
model = RegressionNN(input_dim=X_train.shape[1])
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Full-batch training: one optimiser step per epoch over the whole training set.
epochs = 10
for epoch in range(epochs):
    optimizer.zero_grad()
    model.train()
    pred = model(X_train_t)
    loss = criterion(pred, y_train_t)
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")

# Evaluate on the held-out rows with autograd switched off.
model.eval()
with torch.no_grad():
    y_pred_t = model(X_test_t)
mse = mean_squared_error(y_test, y_pred_t.numpy())
print(f"MSE on test: {mse:.4f}")

### 3. Recommendation Systems

Common scenarios include:
* Product recommendations (Amazon, Shopify)
* Content recommendations (YouTube, Netflix)

#### 3.1 Matrix Factorization (Alternating Least Squares)

* With many users and many products, data sparsity means a sparse user-item matrix
* New users or new items (cold start) might require a multi-armed bandit (MAB) approach
* Some systems precompute top-N recommendations nightly, others do on-the-fly
* For large catalogs, factorization-based methods are memory-intensive
* Metrics: Precision@k and Recall@k, but mainly A/B test

In [None]:
# !pip install implicit
import scipy.sparse as sparse
import numpy as np
from implicit.als import AlternatingLeastSquares

# Suppose user_item_matrix is (num_users x num_items)
# `user_item_matrix` (num_users x num_items) must already exist; it is stored
# in CSR form before being handed to the model.
user_item_sparse = sparse.csr_matrix(user_item_matrix)

model = AlternatingLeastSquares(factors=50, iterations=15, regularization=0.01)
model.fit(user_item_sparse)

# Top-5 recommendations for one user; the user's own row of the interaction
# matrix is passed alongside the user id.
user_id = 123
recommendations = model.recommend(
    user_id,
    user_item_sparse[user_id],
    N=5,
)
print("Recommended items:", recommendations)

### 4. Natural Language Processing (NLP)

Common scenarios include: 
* Sentiment analysis (classify text as positive, negative, neutral)
* Topic classification (assign texts to categories)

#### 4.1 Transformer Model (BERT for Classification)
* Languages shift as slang/trends change
* For a specialized domain (medical, legal, financial), consider domain-specific BERT

In [None]:
# !pip install transformers
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
import torch

# Tiny in-line training set. Keep both lists as real strings/ints of equal
# length: a literal `...` element is Python's Ellipsis object, which the
# tokenizer and torch.tensor cannot process.
texts = ["I love this product!", "This is terrible."]  # extend with your own examples
labels = [1, 0]  # 1=positive, 0=negative (one label per text, same order)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize the whole list at once; padding=True pads to the longest text and
# return_tensors="pt" yields PyTorch tensors.
encodings = tokenizer(texts, truncation=True, padding=True, return_tensors="pt")

class SentimentDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping from field name to an indexable batch of values.
        labels: Indexable sequence with one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict holding every encoding field at `idx` plus 'labels'."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

dataset = SentimentDataset(encodings, labels)
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# No evaluation dataset is given to the Trainer, so the evaluation strategy is
# left at its default ("no"). The keyword is omitted because its name differs
# between transformers releases (`evaluation_strategy` vs `eval_strategy`).
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=4,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)

trainer.train()

# Inference
test_texts = ["I hate waiting in line", "Absolutely fantastic!"]
# The Trainer may have moved the model to an accelerator, so the inputs are
# placed on whatever device the model is on now.
test_encodings = tokenizer(
    test_texts, truncation=True, padding=True, return_tensors="pt"
).to(model.device)
model.eval()  # switch off dropout for deterministic predictions
with torch.no_grad():  # no gradients needed at inference time
    outputs = model(**test_encodings)
# Highest-scoring class per text, brought back to the CPU for printing.
predictions = torch.argmax(outputs.logits, dim=1).cpu()
print("Predictions:", predictions)

### 5. Clustering

Common scenarios include:
* Customer segmentation
* Product grouping
* Image clustering

#### 5.1 K-Means 

* Number of clusters $k$ relies on domain knowledge
* Distance based, so scale/normalize features (StandardScaler)

In [None]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

# df is expected to hold only numeric customer features.
df = pd.read_csv("data.csv")
X = df.values

# Standardise the features before the distance-based clustering.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Number of clusters: pick it from domain knowledge or the elbow method.
k = 5
kmeans = KMeans(n_clusters=k, random_state=777)
# After fitting, labels_ holds the cluster index assigned to each row.
cluster_labels = kmeans.fit(X_scaled).labels_

# Silhouette score of the resulting partition.
sil_score = silhouette_score(X_scaled, cluster_labels)
print(f"Silhouette Score for k={k}: {sil_score:.4f}")

# Store each row's cluster assignment next to its features.
df["cluster"] = cluster_labels

#### 5.2 DBSCAN

* Density-based, finds "core samples" of high density and expands clusters around them
* Better for non-spherical clusters
* Handles outliers as "noise" points

In [None]:
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

# All columns of df are used as features.
df = pd.read_csv("data.csv")
X = df.values

# Standardise the features before the distance-based clustering.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

dbscan = DBSCAN(min_samples=5, eps=0.5)
# After fitting, labels_ holds one cluster id per row.
dbscan_labels = dbscan.fit(X_scaled).labels_

# Rows labelled -1 are the points DBSCAN treats as noise/outliers.
df["dbscan_label"] = dbscan_labels
label_counts = df["dbscan_label"].value_counts()
print(label_counts)

### 6. Time Series Forecasting

Common scenarios include:
* Basic sales forecasting
* Website traffic

#### 6.1 ARIMA

* Best for stationary data; can't automatically handle seasonality unless we use SARIMA
* Must interpolate missing values

In [None]:
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_absolute_error

# Expected columns: date, sales. The parsed dates become the index.
df = pd.read_csv("data.csv")
df['date'] = pd.to_datetime(df['date'])
df = df.set_index('date')

# Split by position: first 80% of the rows for training, the rest for testing.
train_size = int(len(df) * 0.8)
train_data, test_data = df.iloc[:train_size], df.iloc[train_size:]

# ARIMA order is (p, d, q) = (2, 1, 2).
model = ARIMA(train_data['sales'], order=(2, 1, 2))
model_fit = model.fit()

# Forecast one step per held-out row and compare against the actual sales.
n_steps = len(test_data)
forecast = model_fit.forecast(steps=n_steps)
mae = mean_absolute_error(test_data['sales'], forecast)
print(f"MAE: {mae:.2f}")

#### 6.2 Prophet

* Automatically models trend and seasonality (daily, weekly, yearly); holiday effects can be included when a holiday calendar is supplied
* Sensitive to big spikes, so remove or adjust outliers 

In [None]:
import pandas as pd
from prophet import Prophet
from sklearn.metrics import mean_squared_error

# Prophet expects the timestamp column to be named `ds` and the value column `y`.
df = pd.read_csv("time_series_data.csv")  # Columns: date, sales
df.rename(columns={"date": "ds", "sales": "y"}, inplace=True)

# Split by position (rows are assumed to be in time order): first 80% for
# training, the remainder held out.
train_size = int(len(df) * 0.8)
train_df = df.iloc[:train_size]
test_df = df.iloc[train_size:]

model = Prophet(yearly_seasonality=True, weekly_seasonality=True)
model.fit(train_df)

# Predict at the held-out rows' own timestamps. Generating future dates with
# make_future_dataframe assumes a daily, gap-free series and then relies on
# positional alignment with the test rows; using the real `ds` values keeps
# forecast and actuals matched for any frequency.
future = test_df[["ds"]]
forecast = model.predict(future)

# Compare forecast["yhat"] to test_df["y"]
test_forecast = forecast
mse = mean_squared_error(test_df["y"], test_forecast["yhat"])
print(f"MSE: {mse:.2f}")

#### 6.3 LSTM / RNN

[TODO]