# 02 - Model Training
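This notebook focuses on training multiple ML models (Gradient Boosting, LightGBM, Random Forest, etc.). The cells below compare Random Forest, Logistic Regression, SVC, K-Nearest Neighbors, Decision Tree, and Gaussian Naive Bayes classifiers by their stratified 10-fold cross-validation accuracy.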

In [None]:
# --- Build the feature matrix (X) and the target vector (y) ---
# Columns kept out of X: both forms of the target ('Potential Label' and
# 'Potential Label_Encoded') plus 'Engagement Level' and 'Urgency Level'.
# NOTE(review): the latter two are presumably raw string columns whose encoded
# versions remain in `df` -- confirm against the preprocessing step.
excluded_columns = [
    'Potential Label',
    'Potential Label_Encoded',
    'Engagement Level',
    'Urgency Level',
]
X = df.drop(columns=excluded_columns)

# The encoded label is cast to int so it is treated as discrete classes.
y = df['Potential Label_Encoded'].astype(int)

print("Shape of features (X):", X.shape)
print("Shape of target (y):", y.shape)

# --- 4. Cross-validate a Random Forest baseline ---
# Fold handling is delegated to scikit-learn's `cross_val_score`; no fold is
# split or scored by hand in this section.

from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier

# Stratified folds keep roughly the same share of every target class in each
# split; shuffling with a fixed seed keeps the fold assignment repeatable.
n_splits = 10  # 3, 5 and 10 are the usual choices
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# 100-tree forest with a fixed seed.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# One accuracy value per fold.
accuracy_scores = cross_val_score(rf_model, X, y, cv=skf, scoring='accuracy')

print(f"\n--- Cross-Validation Results ({n_splits}-Fold) ---")
print(f"Accuracy scores for each of the {n_splits} folds:")
for fold_number, fold_accuracy in enumerate(accuracy_scores, start=1):
    print(f"Fold {fold_number}: {fold_accuracy:.4f}")

# Summary statistics over the per-fold accuracies.
mean_accuracy = accuracy_scores.mean()
std_accuracy = accuracy_scores.std()
print(f"\nMean accuracy across {n_splits} folds: {mean_accuracy:.4f}")
print(f"Standard deviation of accuracy across {n_splits} folds: {std_accuracy:.4f}")
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression # Corrected import
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

# --- 3. Define Features (X) and Target (y) ---
# Drop original string columns that have been encoded and the original target
X = df.drop(columns=['Potential Label', 'Potential Label_Encoded', 'Engagement Level', 'Urgency Level'])
y = df['Potential Label_Encoded'].astype(int)  # Ensure target is integer type for classification

print("Shape of features (X):", X.shape)
print("Shape of target (y):", y.shape)

# --- 4. Set up Stratified K-Fold Cross-Validation ---
# The same splitter (same seed) is reused for every algorithm so that all of
# them are scored on identical folds.
n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)


def report_cv_accuracy(title, estimator, X, y, cv):
    """Cross-validate ``estimator`` and print its per-fold and summary accuracy.

    Args:
        title: Heading printed above the results.
        estimator: scikit-learn classifier to evaluate.
        X: Feature matrix passed to ``cross_val_score``.
        y: Target labels passed to ``cross_val_score``.
        cv: Cross-validation splitter handed to ``cross_val_score``.

    Returns:
        The per-fold accuracy scores returned by ``cross_val_score``.
    """
    print(f"\n--- {title} ---")
    fold_scores = cross_val_score(estimator, X, y, cv=cv, scoring='accuracy')
    # One score is returned per fold, so its length is the fold count.
    print(f"Accuracy scores for each of the {len(fold_scores)} folds:")
    for fold_number, fold_score in enumerate(fold_scores, start=1):
        print(f"Fold {fold_number}: {fold_score:.4f}")
    print(f"Mean Accuracy: {fold_scores.mean():.4f}")
    print(f"Std Dev Accuracy: {fold_scores.std():.4f}")
    return fold_scores


print(f"\n--- Cross-Validation Results ({n_splits}-Fold) for Different Algorithms ---")

# NOTE(review): LogisticRegression, SVC and KNeighborsClassifier are sensitive
# to feature scale. If `X` is not already standardized upstream, consider
# wrapping them in a Pipeline with StandardScaler so that scaling is fitted
# inside each fold -- TODO confirm how `df` was preprocessed.

# --- Logistic Regression ---
log_reg_model = LogisticRegression(max_iter=1000, random_state=42)
accuracy_scores_lr = report_cv_accuracy("Logistic Regression", log_reg_model, X, y, skf)

# --- Support Vector Classifier (SVC) ---
svc_model = SVC(random_state=42)
accuracy_scores_svc = report_cv_accuracy("Support Vector Classifier (SVC)", svc_model, X, y, skf)

# --- K-Nearest Neighbors Classifier ---
knn_model = KNeighborsClassifier(n_neighbors=5)  # Default n_neighbors, can be tuned
accuracy_scores_knn = report_cv_accuracy("K-Nearest Neighbors Classifier", knn_model, X, y, skf)

# --- Decision Tree Classifier ---
dt_model = DecisionTreeClassifier(random_state=42)
accuracy_scores_dt = report_cv_accuracy("Decision Tree Classifier", dt_model, X, y, skf)

# --- Gaussian Naive Bayes ---
gnb_model = GaussianNB()
accuracy_scores_gnb = report_cv_accuracy("Gaussian Naive Bayes", gnb_model, X, y, skf)
import joblib

In [None]:
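# Step 4: Save models
# NOTE: no variable named `model` is assigned in the cells above, and
# `cross_val_score` fits clones of the estimators it is given, so none of them
# is fitted afterwards. Fit the chosen estimator on the training data and pass
# that object to `joblib.dump` before enabling the line below.
# joblib.dump(model, '../models/gradient_boosting_model.pkl')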