In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_recall_curve, auc
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
import seaborn as sns

# Load the raw inputs. Dataset 1 is read from the working directory, while the
# pre-split train/test files of dataset 2 are read from the `datasets/` folder.
dataset1 = pd.read_csv('IBM Dataset 1.csv')
dataset2_train = pd.read_csv('datasets/dataset_2_train.csv')
dataset2_test = pd.read_csv('datasets/dataset_2_test.csv')

# Label -> numeric code lookups, keyed by the (post-rename) column name.
# Passed to `apply_mappings` inside `align_and_preprocess_datasets`.
# NOTE: the degree labels below contain a typographic apostrophe (’), not an
# ASCII one; raw values must match these strings exactly for the lookup to hit.
categorical_mappings = {
    # Two label vocabularies ("Yes"/"No" and "Left"/"Stayed") share one 1/0 target encoding.
    "Attrition": {"Yes": 1, "No": 0, "Left": 1, "Stayed": 0},
    "Education Level": {
        "High School": 1, "Associate Degree": 2, "Bachelor’s Degree": 3,
        "Master’s Degree": 4, "PhD": 5
    },
    "Gender": {"Female": 0, "Male": 1},
    # Codes are spaced 1/3/5 — presumably to line up with a 1-5 numeric scale
    # used by the other dataset; TODO confirm.
    "Job Level": {"Entry": 1, "Mid": 3, "Senior": 5},
    "Job Satisfaction": {"Low": 1, "Medium": 2, "High": 3, "Very High": 4},
    "Marital Status": {"Single": 0, "Married": 1, "Divorced": 2},
    "Performance Rating": {
        "Low": 1, "Below Average": 2, "Average": 3, "High": 4
    },
    "Work-Life Balance": {
        "Poor": 1, "Fair": 2, "Good": 3, "Excellent": 4
    }
}


def align_dtypes(dataset, target_dtypes):
    """Return a copy of ``dataset`` with selected columns cast to fixed dtypes.

    Args:
        dataset: Input DataFrame; it is not modified.
        target_dtypes: Mapping of column name -> dtype name. Only the names
            ``"int64"``, ``"float64"`` and ``"object"`` are acted on; any other
            dtype name, and any column missing from ``dataset``, is skipped.

    Returns:
        A new DataFrame. For numeric targets, values that cannot be parsed as
        numbers (and missing values) are replaced by 0 / 0.0 before casting.
    """
    aligned = dataset.copy()
    for col, dtype in target_dtypes.items():
        if col not in aligned.columns:
            continue
        if dtype == "int64":
            numeric = pd.to_numeric(aligned[col], errors="coerce")
            # Cast to an explicit 64-bit integer: the builtin ``int`` maps to a
            # platform-dependent width (32-bit on Windows with NumPy < 2).
            aligned[col] = numeric.fillna(0).astype("int64")
        elif dtype == "float64":
            numeric = pd.to_numeric(aligned[col], errors="coerce")
            aligned[col] = numeric.fillna(0.0).astype("float64")
        elif dtype == "object":
            aligned[col] = aligned[col].astype("object")
    return aligned

def apply_mappings(dataset, mappings):
    """Return a copy of ``dataset`` with categorical labels replaced by codes.

    Args:
        dataset: Input DataFrame; it is not modified.
        mappings: Mapping of column name -> {raw value: encoded value}.
            Columns named here but absent from ``dataset`` are skipped.

    Returns:
        A new DataFrame. Values found in a column's mapping are replaced;
        values not found (for example columns that already hold numeric codes)
        are passed through unchanged.
    """
    encoded = dataset.copy()
    for feature, mapping in mappings.items():
        if feature not in encoded.columns:
            continue
        # ``Series.map(dict)`` converts every value missing from the dict to
        # NaN, which would wipe out columns that are already numerically
        # encoded. Fall back to the original value instead.
        encoded[feature] = encoded[feature].map(
            lambda value, lookup=mapping: lookup.get(value, value)
        )
    return encoded

def align_and_preprocess_datasets(dataset1):
    """Bring a raw attrition frame onto the shared 12-column schema.

    Steps, in order: rename CamelCase headers to their spaced equivalents,
    encode categorical labels with the module-level ``categorical_mappings``
    (via ``apply_mappings``), cast columns with ``align_dtypes``, and finally
    keep only the shared feature columns.

    Despite its name, the ``dataset1`` parameter receives every input frame;
    the notebook also passes the dataset 2 train and test frames through it.

    Returns:
        DataFrame holding exactly the shared columns, in a fixed order.

    Raises:
        KeyError: if a shared column is absent after renaming.
    """
    # CamelCase header -> spaced header used by the shared schema.
    rename_to_shared = {
        "DistanceFromHome": "Distance from Home",
        "Education": "Education Level",
        "JobLevel": "Job Level",
        "JobSatisfaction": "Job Satisfaction",
        "MonthlyIncome": "Monthly Income",
        "PerformanceRating": "Performance Rating",
        "WorkLifeBalance": "Work-Life Balance",
        "YearsAtCompany": "Years at Company",
        "MaritalStatus": "Marital Status"
    }

    # Desired dtype per column, grouped by dtype. The ratio columns are
    # skipped by ``align_dtypes`` whenever they are not present in the frame.
    integer_columns = [
        "Age", "Distance from Home", "Education Level", "Job Level",
        "Job Satisfaction", "Monthly Income", "Performance Rating",
        "Work-Life Balance", "Years at Company"
    ]
    label_columns = ["Attrition", "Gender", "Marital Status"]
    ratio_columns = ["IncomePerJobLevel", "YearsRatio", "JobEducationRatio"]
    target_dtypes = {
        **dict.fromkeys(integer_columns, "int64"),
        **dict.fromkeys(label_columns, "object"),
        **dict.fromkeys(ratio_columns, "float64"),
    }

    # Columns (and their order) of the returned frame.
    aligned_features = [
        "Age", "Attrition", "Distance from Home", "Education Level", "Gender",
        "Job Level", "Job Satisfaction", "Marital Status", "Monthly Income",
        "Performance Rating", "Work-Life Balance", "Years at Company"
    ]

    renamed = dataset1.rename(columns=rename_to_shared)
    encoded = apply_mappings(renamed, categorical_mappings)
    typed = align_dtypes(encoded, target_dtypes)
    return typed[aligned_features]

# Run all three raw frames through the same alignment pipeline so they share
# one column set. Note that `dataset2_processed` is built from dataset 2's
# *training* file; the matching test file becomes `dataset2_test_processed`.
dataset1_processed = align_and_preprocess_datasets(dataset1)
dataset2_processed = align_and_preprocess_datasets(dataset2_train)
dataset2_test_processed = align_and_preprocess_datasets(dataset2_test)


## Feature Engineering for common features in dataset 1 and 2
def feature_engineering(df):
    """Return a copy of ``df`` with three derived ratio columns appended.

    Added columns:
        IncomePerJobLevel: ``Monthly Income / (Job Level + 1)``
        YearsRatio:        ``Years at Company / (Age + 1)``
        JobEducationRatio: ``Education Level / (Job Level + 1)``

    Each denominator is offset by 1, so a value of 0 in ``Job Level`` or
    ``Age`` does not produce a division by zero.

    Args:
        df: DataFrame containing the five source columns named above.

    Returns:
        A new DataFrame; the input frame is left untouched, so calling this
        function never has side effects on the caller's data.
    """
    job_level_offset = df['Job Level'] + 1
    return df.assign(
        IncomePerJobLevel=df['Monthly Income'] / job_level_offset,
        YearsRatio=df['Years at Company'] / (df['Age'] + 1),
        JobEducationRatio=df['Education Level'] / job_level_offset,
    )

# Add the derived ratio columns to each aligned frame (names are rebound to
# the frame returned by `feature_engineering`).
dataset1_processed = feature_engineering(dataset1_processed)
dataset2_processed = feature_engineering(dataset2_processed)
dataset2_test_processed = feature_engineering(dataset2_test_processed)


## Hold out 20% of Dataset 1 as a test set, preserving the Attrition class balance
train_1, test_1 = train_test_split(
    dataset1_processed,
    test_size=0.2,
    random_state=42,
    stratify=dataset1_processed['Attrition'],
)


def _features_and_target(frame):
    """Split ``frame`` into (feature columns, integer-typed 'Attrition' target)."""
    features = frame.drop(columns=['Attrition'])
    target = frame['Attrition'].astype(int)
    return features, target


# Dataset 2 ships pre-split, so its train/test frames are used as-is.
X_train_2, y_train_2 = _features_and_target(dataset2_processed)
X_test_2, y_test_2 = _features_and_target(dataset2_test_processed)

# Dataset 1 uses the stratified split made above.
X_train_1, y_train_1 = _features_and_target(train_1)
X_test, y_test = _features_and_target(test_1)


In [None]:
import optuna
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix, precision_recall_curve, auc, classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, cross_val_predict

def sample_with_class_distribution(X, y, num_samples):
    """Draw a stratified, reproducible subset of ``num_samples`` rows.

    Args:
        X: Feature frame.
        y: Target labels aligned with ``X``; used for stratification.
        num_samples: Number of rows to draw. Must be smaller than ``len(y)``.

    Returns:
        ``(X_subset, y_subset)`` whose class proportions follow those of ``y``.

    Raises:
        ValueError: propagated from ``train_test_split`` when ``num_samples``
            is not a valid training size for the data.
    """
    # Pass the absolute row count rather than a float ratio: scikit-learn
    # floors ``ratio * len(y)``, and floating-point rounding of
    # ``num_samples / len(y) * len(y)`` can land just below the requested
    # integer, silently yielding one row too few.
    X_subset, _, y_subset, _ = train_test_split(
        X, y, train_size=int(num_samples), stratify=y, random_state=42
    )
    return X_subset, y_subset

def scale_numerical_features(X, scaler, numerical_columns):
    """Return a copy of ``X`` with ``numerical_columns`` replaced by scaled values.

    Args:
        X: Input DataFrame; it is not modified.
        scaler: Object exposing ``transform``; it is only applied here, never fitted.
        numerical_columns: Column names passed to ``scaler.transform``.

    Returns:
        A new DataFrame; all other columns are carried over unchanged.
    """
    transformed = scaler.transform(X[numerical_columns])
    result = X.copy()
    result[numerical_columns] = transformed
    return result

# Continuous columns that `evaluate` standardises; the remaining (encoded
# categorical) columns are passed to the model unscaled.
numerical_columns = [
    "Age", "Distance from Home", "Monthly Income", 
    "Years at Company", "IncomePerJobLevel", "YearsRatio", "JobEducationRatio"
]

# Module-level accumulator: `train_and_evaluate_with_optuna` appends one
# metrics dict to this list per evaluation.
results = []

def train_and_evaluate_with_optuna(X_train, y_train, X_test, y_test, description, numerical_columns, reuse_model=None):
    """Tune and fit a random forest (or reuse a fitted one), then log test metrics.

    When ``reuse_model`` is None, hyperparameters are searched with Optuna
    (50 trials, maximising the mean weighted F1 over a 5-fold stratified CV on
    the training data) and a final model is fitted on all of ``X_train``.
    When ``reuse_model`` is given, tuning and fitting are skipped and that
    model is evaluated directly.

    In both cases the model is scored on ``X_test`` / ``y_test`` and one
    metrics dict is appended to the module-level ``results`` list.

    Args:
        X_train, y_train: Training features / labels (positionally indexed).
        X_test, y_test: Evaluation features / labels.
        description: Free-text label stored with the metrics.
        numerical_columns: Unused; kept so existing call sites keep working.
        reuse_model: Optional already-fitted classifier to evaluate as-is.

    Returns:
        The fitted (or reused) model.
    """
    if reuse_model is not None:
        model = reuse_model
        # No search ran in this call, so there is no CV score to report.
        validation_f1 = None
    else:
        def objective(trial):
            """Mean weighted F1 across the CV folds for one sampled configuration."""
            candidate = RandomForestClassifier(
                random_state=42,
                n_estimators=trial.suggest_int("n_estimators", 50, 200),
                max_depth=trial.suggest_int("max_depth", 5, 30),
                min_samples_split=trial.suggest_int("min_samples_split", 2, 10),
                min_samples_leaf=trial.suggest_int("min_samples_leaf", 1, 4),
                max_features=trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
            )

            skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
            fold_scores = []
            for train_idx, val_idx in skf.split(X_train, y_train):
                candidate.fit(X_train.iloc[train_idx], y_train.iloc[train_idx])
                fold_pred = candidate.predict(X_train.iloc[val_idx])
                fold_scores.append(
                    f1_score(y_train.iloc[val_idx], fold_pred, average="weighted")
                )
            return sum(fold_scores) / len(fold_scores)

        # Seed the sampler so a rerun explores the same hyperparameter sequence.
        study = optuna.create_study(
            direction="maximize",
            sampler=optuna.samplers.TPESampler(seed=42),
        )
        study.optimize(objective, n_trials=50)
        validation_f1 = study.best_trial.value

        model = RandomForestClassifier(random_state=42, **study.best_params)
        model.fit(X_train, y_train)

    # Evaluation runs for BOTH branches. Previously it lived inside the
    # ``else`` block, so calls with ``reuse_model`` recorded nothing and
    # returned None.
    y_pred = model.predict(X_test)
    # Column 1 of predict_proba is taken as the positive-class score.
    y_proba = model.predict_proba(X_test)[:, 1]

    precision_curve, recall_curve, _ = precision_recall_curve(y_test, y_proba)

    results.append({
        "Description": description,
        "Validation F1 score": validation_f1,
        "Weighted F1 Score": f1_score(y_test, y_pred, average='weighted'),
        "Weighted Precision": precision_score(y_test, y_pred, average='weighted'),
        "Weighted Recall": recall_score(y_test, y_pred, average='weighted'),
        "PR-AUC": auc(recall_curve, precision_curve),
        "Confusion Matrix": confusion_matrix(y_test, y_pred),
        "Accuracy": accuracy_score(y_test, y_pred)
    })

    return model

# NOTE(review): this redefines the function of the same name declared earlier
# in this cell with an identical body; this later definition is the one bound
# when `evaluate` runs. One of the two can be removed.
def scale_numerical_features(X, scaler, numerical_columns):
    """Apply an already-fitted ``scaler`` to ``numerical_columns`` of a copy of ``X``.

    The input frame is not modified; columns outside ``numerical_columns``
    are returned unchanged.
    """
    scaled_values = scaler.transform(X[numerical_columns])
    X_out = X.copy()
    X_out[numerical_columns] = scaled_values
    return X_out

def evaluate(X_train_1, y_train_1, X_train_2, y_train_2, X_test, y_test, X_test_2, y_test_2, numerical_columns, sample_sizes=None):
    """Train on dataset 1 plus growing slices of dataset 2 and score on both test sets.

    For each sample size, a stratified subset of dataset 2's training data is
    appended to dataset 1's training data, a ``StandardScaler`` is fitted on
    the combined numerical columns, and a model is tuned/fitted once via
    ``train_and_evaluate_with_optuna``. That same model is then passed back in
    as ``reuse_model`` for the dataset 2 test set.

    Args:
        X_train_1, y_train_1: Dataset 1 training features / labels.
        X_train_2, y_train_2: Dataset 2 training pool to sample from.
        X_test, y_test: Dataset 1 hold-out set.
        X_test_2, y_test_2: Dataset 2 test set.
        numerical_columns: Columns to standardise.
        sample_sizes: Iterable of dataset 2 sample counts to try. Defaults to
            500, 1000, ..., 5000 (the schedule originally hard-coded here).

    Returns:
        None. Metrics are accumulated by ``train_and_evaluate_with_optuna``.
    """
    if sample_sizes is None:
        sample_sizes = range(500, 5001, 500)

    for num_samples in sample_sizes:
        X_train_2_subset, y_train_2_subset = sample_with_class_distribution(X_train_2, y_train_2, num_samples)

        X_combined = pd.concat([X_train_1, X_train_2_subset], axis=0)
        y_combined = pd.concat([y_train_1, y_train_2_subset], axis=0)

        # Fit the scaler on training rows only, then apply it unchanged to
        # both test sets so no test statistics leak into the scaling.
        scaler_combined = StandardScaler()
        scaler_combined.fit(X_combined[numerical_columns])
        X_combined_scaled = scale_numerical_features(X_combined, scaler_combined, numerical_columns)
        X_test_combined_1_scaled = scale_numerical_features(X_test, scaler_combined, numerical_columns)
        X_test_combined_2_scaled = scale_numerical_features(X_test_2, scaler_combined, numerical_columns)

        trained_model = train_and_evaluate_with_optuna(
            X_combined_scaled, y_combined, X_test_combined_1_scaled, y_test,
            f"Combined Dataset 1 and {num_samples} samples of Dataset 2 (Dataset 1 test)", numerical_columns
        )

        # Second call passes the model from the first call as `reuse_model`.
        train_and_evaluate_with_optuna(
            X_combined_scaled, y_combined, X_test_combined_2_scaled, y_test_2,
            f"Combined Dataset 1 and {num_samples} samples of Dataset 2 (Dataset 2 test)", numerical_columns,
            reuse_model=trained_model
        )

# Run the full experiment sweep; metrics accumulate in the module-level
# `results` list rather than being returned.
evaluate(
    X_train_1, y_train_1, X_train_2, y_train_2, X_test, y_test, X_test_2, y_test_2, numerical_columns
)


In [None]:
# One row per recorded evaluation, one column per metric key.
x = pd.DataFrame(results)

In [None]:
# Persist the metrics table to the working directory. With `to_csv` defaults
# the DataFrame index is written as the first (unnamed) column.
x.to_csv("ratio_results.csv")