## **Phase 4: Model Training and Evaluation**

In [1]:
# importing our librarbies
import pandas as pd
import numpy as np
 

In [2]:
df = pd.read_csv('preprocessed_easy_visa.csv')

#### **Data Splitting**

we will use stratified splitting based on the class imbalance of our targrt data which is the case status

In [3]:
# importing selected features from  preprocessing 

selected_features = ['company_age', 'wage_per_year', 'wage_per_employee_ratio', 'employees_growth_rate_ratio', 'wage_per_age_ratio', 'case_status_encoded', 'education_level_ordinal', 'establishment_period_ordinal', 'continent_Africa', 'continent_Asia', 'continent_Europe', 'continent_North America', 'continent_Oceania', 'continent_South America', 'region_target_encoded', 'has_job_experience_encoded', 'requires_job_training_encoded', 'full_time_position_encoded', 'prevailing_wage_log', 'no_of_employees_log']



In [4]:
# preprocessing libraries
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, confusion_matrix

In [5]:

X= df.drop('case_status_encoded', axis=1)
y = df['case_status_encoded']


# Stratified data splitting based on EDA findings about class imbalance
print("=== STRATIFIED DATA SPLITTING ===")
print("EDA identified class imbalance - using stratified splitting to preserve class distribution")

# Select the chosen features
X_selected = X[selected_features]
print(f"Selected features shape: {X_selected.shape}")

# First split: 70% train+val, 30% test
X_temp, X_test, y_temp, y_test = train_test_split(
    X_selected, y, test_size=0.3, random_state=42, stratify=y
)

# Second split: 75% train, 25% validation (of the 80%)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=42, stratify=y_temp
)

print(f"\nData split results:")
print(f"Training set: {X_train.shape} ({(X_train.shape[0]/len(X_selected))*100:.1f}%)")
print(f"Validation set: {X_val.shape} ({(X_val.shape[0]/len(X_selected))*100:.1f}%)")
print(f"Test set: {X_test.shape} ({(X_test.shape[0]/len(X_selected))*100:.1f}%)")

# Check class distribution in each set (should be similar due to stratification)
print(f"\nClass distribution verification:")
print("Training set quality distribution:")
print(y_train.value_counts().sort_index())
print("\nValidation set quality distribution:")
print(y_val.value_counts().sort_index())
print("\nTest set quality distribution:")
print(y_test.value_counts().sort_index())

=== STRATIFIED DATA SPLITTING ===
EDA identified class imbalance - using stratified splitting to preserve class distribution


KeyError: "['case_status_encoded'] not in index"

#### **Feature scaling**

In [None]:
# Using standard scaler as the data has little to no outliers based on the EDA recommendation
print("=== FEATURE SCALING (STANDARD SCALER) ===")
print("EDA recommended StandardScaler for distance-based models")
# Fit scaler on training data only (to avoid data leakage)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
# Transform validation and test sets
X_val_scaled = scaler.transform(X_val)
X_val_scaled = pd.DataFrame(X_val_scaled, columns=X_val.columns)
X_test_scaled = scaler.transform(X_test)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)
print("✓ Scaling applied successfully!")
print(f"Training set scaled - Mean: {X_train_scaled.mean().mean():.4f}, Std: {X_train_scaled.std().mean():.4f}")
print(f"Validation set scaled - Mean: {X_val_scaled.mean().mean():.4f}, Std: {X_val_scaled.std().mean():.4f}")
print(f"Test set scaled - Mean: {X_test_scaled.mean().mean():.4f}, Std: {X_test_scaled.std().mean():.4f}")
# Verify scaling worked correctly
print(f"\nScaling verification:")
print(f"Training set - Mean ≈ 0: {abs(X_train_scaled.mean().mean()) < 0.01}")
print(f"Training set - Std ≈ 1: {abs(X_train_scaled.std().mean() - 1) < 0.01}")
print(f"Validation set - Mean ≈ 0: {abs(X_val_scaled.mean().mean()) < 0.01}")
print(f"Validation set - Std ≈ 1: {abs(X_val_scaled.std().mean() - 1) < 0.01}")
print(f"Test set - Mean ≈ 0: {abs(X_test_scaled.mean().mean()) < 0.01}")
print(f"Test set - Std ≈ 1: {abs(X_test_scaled.std().mean() - 1) < 0.01}")



=== FEATURE SCALING (STANDARD SCALER) ===
EDA recommended StandardScaler for distance-based models
✓ Scaling applied successfully!
Training set scaled - Mean: -0.0000, Std: 1.0000
Validation set scaled - Mean: -0.0004, Std: 1.0142
Test set scaled - Mean: 0.0005, Std: 1.0045

Scaling verification:
Training set - Mean ≈ 0: True
Training set - Std ≈ 1: True
Validation set - Mean ≈ 0: True
Validation set - Std ≈ 1: False
Test set - Mean ≈ 0: True
Test set - Std ≈ 1: True


### **Algorithm selection**

Task 4.2: Choose and justify the selection of machine learning algorithms (e.g., Linear Regression, Decision Tree, Random Forest, Gradient Boosting).

Based on the EDA insights and recommendations, we would go with:
- Gradient Boosting Classifier, XG Boost Classifier and LightGBM:
    - *Dataset*: Our data is mostly categorical including the target and it is a large dataset containing above 10,000 samples.
    - *Performance*: Gradient Boosting and XGBoost is best for tabular data like risk prediction and credit scoring.

### **Model Comparison and Evaluation**

In [None]:
# Create GradientBoost Model
gb_clf = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=3,
    subsample=0.8,
    random_state=42
)
# Train the model
gb_clf.fit(X_train, y_train)
# Make predictions
y_train_pred_gb = gb_clf.predict(X_train)
y_test_pred_gb = gb_clf.predict(X_test)
# Evaluate performance
train_accuracy_gb = accuracy_score(y_train, y_train_pred_gb)
test_accuracy_gb = accuracy_score(y_test, y_test_pred_gb)
train_balanced_acc_gb = balanced_accuracy_score(y_train, y_train_pred_gb)
test_balanced_acc_gb = balanced_accuracy_score(y_test, y_test_pred_gb)
train_f1_gb = f1_score(y_train, y_train_pred_gb, average='macro')
test_f1_gb = f1_score(y_test, y_test_pred_gb, average='macro')
print(f"\nXGBoost Performance:")
print(f"Test - Accuracy: {test_accuracy_gb:.3f}, Balanced Acc: {test_balanced_acc_gb:.3f}, Macro F1: {test_f1_gb:.3f}")
# Feature importance analysis
print(f"\nFeature Importance (Top 10):")
feature_importance_gb = pd.DataFrame({
    'feature': X_train.columns,
    'importance': gb_clf.feature_importances_
}).sort_values('importance', ascending=False)
for i, (_, row) in enumerate(feature_importance_gb.head(10).iterrows(), 1):
    print(f"{i:2d}. {row['feature']}: {row['importance']:.3f}")
# Store XGBoost results
xgb_results = {
    'model': 'GradientBoosting',
    'train_accuracy': train_accuracy_gb,
    'test_accuracy': test_accuracy_gb,
    'train_balanced_acc': train_balanced_acc_gb,
    'test_balanced_acc': test_balanced_acc_gb,
    'train_f1': train_f1_gb,
    'test_f1': test_f1_gb
}
print("GradientBoost model completed!")







