In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_recall_curve, auc
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
import seaborn as sns

dataset1 = pd.read_csv('datasets/IBM Dataset 1.csv')
dataset2_train = pd.read_csv('datasets/dataset_2_train.csv')
dataset2_test = pd.read_csv('datasets/dataset_2_test.csv')

categorical_mappings = {
    "Attrition": {"Yes": 1, "No": 0, "Left": 1, "Stayed": 0},
    "Education Level": {
        "High School": 1, "Associate Degree": 2, "Bachelor’s Degree": 3,
        "Master’s Degree": 4, "PhD": 5
    },
    "Gender": {"Female": 0, "Male": 1},
    "Job Level": {"Entry": 1, "Mid": 3, "Senior": 5},
    "Job Satisfaction": {"Low": 1, "Medium": 2, "High": 3, "Very High": 4},
    "Marital Status": {"Single": 0, "Married": 1, "Divorced": 2},
    "Performance Rating": {
        "Low": 1, "Below Average": 2, "Average": 3, "High": 4
    },
    "Work-Life Balance": {
        "Poor": 1, "Fair": 2, "Good": 3, "Excellent": 4
    }
}


def align_dtypes(dataset, target_dtypes):

    dataset = dataset.copy()
    for col, dtype in target_dtypes.items():
        if col in dataset.columns:
            if dtype == "int64":
                dataset[col] = pd.to_numeric(dataset[col], errors="coerce").fillna(0).astype(int)
            elif dtype == "float64":
                dataset[col] = pd.to_numeric(dataset[col], errors="coerce").fillna(0.0).astype(float)
            elif dtype == "object":
                dataset[col] = dataset[col].astype("object")
    return dataset

def apply_mappings(dataset, mappings):
   
    dataset = dataset.copy()
    for feature, mapping in mappings.items():
        if feature in dataset.columns:
            dataset[feature] = dataset[feature].map(mapping)
    return dataset

def align_and_preprocess_datasets(dataset1):
    aligned_features = [
        "Age", "Attrition", "Distance from Home", "Education Level", "Gender",
        "Job Level", "Job Satisfaction", "Marital Status", "Monthly Income",
        "Performance Rating", "Work-Life Balance", "Years at Company"
    ]
    column_mapping = {
        "DistanceFromHome": "Distance from Home",
        "Education": "Education Level",
        "JobLevel": "Job Level",
        "JobSatisfaction": "Job Satisfaction",
        "MonthlyIncome": "Monthly Income",
        "PerformanceRating": "Performance Rating",
        "WorkLifeBalance": "Work-Life Balance",
        "YearsAtCompany": "Years at Company",
        "MaritalStatus": "Marital Status"
    }

    dataset1 = dataset1.rename(columns=column_mapping)

    ## make sure both datasets have same dtypes for common features
    target_dtypes = {
        "Age": "int64",
        "Attrition": "object",
        "Distance from Home": "int64",
        "Education Level": "int64",
        "Gender": "object",
        "Job Level": "int64",
        "Job Satisfaction": "int64",
        "Marital Status": "object",
        "Monthly Income": "int64",
        "Performance Rating": "int64",
        "Work-Life Balance": "int64",
        "Years at Company": "int64",
        "IncomePerJobLevel": "float64",
        "YearsRatio": "float64",
        "JobEducationRatio": "float64"
    }
    
    

    dataset1 = apply_mappings(dataset1, categorical_mappings)

    dataset1 = align_dtypes(dataset1, target_dtypes)

    dataset1 = dataset1[aligned_features]

    return dataset1

dataset1_processed = align_and_preprocess_datasets(dataset1)
dataset2_processed = align_and_preprocess_datasets(dataset2_train)
dataset2_test_processed = align_and_preprocess_datasets(dataset2_test)


## Feature Engineering for common features in dataset 1 and 2
def feature_engineering(df):
    df['IncomePerJobLevel'] = df['Monthly Income'] / (df['Job Level'] + 1)
    df['YearsRatio'] = df['Years at Company'] / (df['Age'] + 1)
    df['JobEducationRatio'] = df['Education Level'] / (df['Job Level'] + 1)
    return df

dataset1_processed = feature_engineering(dataset1_processed)
dataset2_processed = feature_engineering(dataset2_processed)
dataset2_test_processed = feature_engineering(dataset2_test_processed)


## Split Dataset 1 into train and holdout test sets
train_1, test_1 = train_test_split(
    dataset1_processed, test_size=0.2, stratify=dataset1_processed['Attrition'], random_state=42
)

X_train_2 = dataset2_processed.drop(columns=['Attrition'])
y_train_2 = dataset2_processed['Attrition']

X_test_2 = dataset2_test_processed.drop(columns=['Attrition'])
y_test_2 = dataset2_test_processed['Attrition']


X_train_1 = train_1.drop(columns=['Attrition'])
y_train_1 = train_1['Attrition']


X_test = test_1.drop(columns=['Attrition'])
y_test = test_1['Attrition']

y_train_1 = y_train_1.astype(int)
y_test = y_test.astype(int)
y_train_2 = y_train_2.astype(int)
y_test_2 = y_test_2.astype(int)


In [2]:
N_TRIALS = 2 

from sklearn.metrics import confusion_matrix, precision_recall_curve, auc, accuracy_score, recall_score, f1_score, precision_score
import optuna
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier

def hyperparameter_tuning(X_train, y_train, X_test, y_test, description):
    
    def objective(trial):
        n_estimators = trial.suggest_int("n_estimators", 50, 200)
        max_depth = trial.suggest_int("max_depth", 5, 30)
        min_samples_split = trial.suggest_int("min_samples_split", 2, 10)
        min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 4)
        max_features = trial.suggest_categorical("max_features", ["sqrt", "log2", None])

        model = RandomForestClassifier(
            random_state=42,
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            max_features=max_features,
        )
        
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        val_f1_scores = []

        for train_idx, val_idx in skf.split(X_train, y_train):
            X_train_cv, X_val_cv = X_train.iloc[train_idx], X_train.iloc[val_idx]
            y_train_cv, y_val_cv = y_train.iloc[train_idx], y_train.iloc[val_idx]

            model.fit(X_train_cv, y_train_cv)
            y_val_pred = model.predict(X_val_cv)
            f1 = f1_score(y_val_cv, y_val_pred, average="weighted")
            val_f1_scores.append(f1)

        return sum(val_f1_scores) / len(val_f1_scores)

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=N_TRIALS)

    best_model = RandomForestClassifier(random_state=42, **study.best_params)
    best_model.fit(X_train, y_train)

    y_pred = best_model.predict(X_test)
    y_proba = best_model.predict_proba(X_test)[:, 1]

    precision_curve, recall_curve, _ = precision_recall_curve(y_test, y_proba)
    pr_auc = auc(recall_curve, precision_curve)

    return best_model, confusion_matrix(y_test, y_pred), f1_score(y_test, y_pred, average="weighted"), precision_score(y_test, y_pred, average="weighted"), recall_score(y_test, y_pred, average="weighted"), pr_auc, accuracy_score(y_test, y_pred)


model1 =  RandomForestClassifier(
            random_state=42)
model1.fit(X_train_2, y_train_2)
dataset1_pred_labels = model1.predict(X_train_1)
dataset1_pred_test_labels = model1.predict(X_test)

X_train_1['dataset2_label'] = dataset1_pred_labels
X_test['dataset2_label'] = dataset1_pred_test_labels

print("Dataset 1 (No SMOTE)")
model_no_smote, cm_no_smote, f1_no_smote, precision_no_smote, recall_no_smote, pr_auc_no_smote, acc_no_smote = hyperparameter_tuning(
    X_train_1, y_train_1, X_test, y_test, "Dataset 1 Model (No SMOTE)"
)

print("Dataset 1 (SMOTE) with labels")
smote = SMOTE(random_state=42)
X_train_1_smote, y_train_1_smote = smote.fit_resample(X_train_1, y_train_1)
model_with_smote, cm_with_smote, f1_with_smote, precision_with_smote, recall_with_smote, pr_auc_with_smote, acc_with_smote = hyperparameter_tuning(
    X_train_1_smote, y_train_1_smote, X_test, y_test, "Dataset 1 Model (With SMOTE)"
)


print("Dataset 1 (SMOTE) without labels")
smote = SMOTE(random_state=42)
X_train_1_smote, y_train_1_smote = smote.fit_resample(X_train_1.drop(columns=['dataset2_label']), y_train_1)
model_with_smote1, cm_with_smote1, f1_with_smote1, precision_with_smote1, recall_with_smote1, pr_auc_with_smote1, acc_with_smote1 = hyperparameter_tuning(
    X_train_1_smote, y_train_1_smote, X_test.drop(columns=['dataset2_label']), y_test, "Dataset 1 Model no labels (With SMOTE)"
)


[I 2024-12-06 23:32:09,720] A new study created in memory with name: no-name-d9af9d6e-a99a-4150-9530-5f2b96f27078


Dataset 1 (No SMOTE)


[I 2024-12-06 23:32:10,126] Trial 0 finished with value: 0.8196116016675331 and parameters: {'n_estimators': 53, 'max_depth': 19, 'min_samples_split': 9, 'min_samples_leaf': 4, 'max_features': 'log2'}. Best is trial 0 with value: 0.8196116016675331.
[I 2024-12-06 23:32:11,472] Trial 1 finished with value: 0.8145802031149818 and parameters: {'n_estimators': 181, 'max_depth': 16, 'min_samples_split': 7, 'min_samples_leaf': 4, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.8196116016675331.
[I 2024-12-06 23:32:11,580] A new study created in memory with name: no-name-6b9719e5-ca87-4a94-9aa3-58bdfe7fc262


Dataset 1 (SMOTE) with labels


[I 2024-12-06 23:32:12,612] Trial 0 finished with value: 0.8750107606877305 and parameters: {'n_estimators': 53, 'max_depth': 17, 'min_samples_split': 6, 'min_samples_leaf': 4, 'max_features': None}. Best is trial 0 with value: 0.8750107606877305.
[I 2024-12-06 23:32:13,325] Trial 1 finished with value: 0.8731590865780975 and parameters: {'n_estimators': 68, 'max_depth': 29, 'min_samples_split': 6, 'min_samples_leaf': 3, 'max_features': 'log2'}. Best is trial 0 with value: 0.8750107606877305.
[I 2024-12-06 23:32:13,600] A new study created in memory with name: no-name-69a3c8cd-908b-4c21-93ef-1d9f645779a4


Dataset 1 (SMOTE) without labels


[I 2024-12-06 23:32:14,705] Trial 0 finished with value: 0.870107925781217 and parameters: {'n_estimators': 106, 'max_depth': 27, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.870107925781217.
[I 2024-12-06 23:32:15,431] Trial 1 finished with value: 0.8670773941636967 and parameters: {'n_estimators': 77, 'max_depth': 17, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'log2'}. Best is trial 0 with value: 0.870107925781217.
