##### setup

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import xgboost as xgb
import numpy as np

def preprocess_turnover_data(turnover_trend):
    """Turn raw turnover records into a frame ready for yearly aggregation.

    Steps, in order: lower-case the column names, derive a yearly ``year``
    period from ``recorddate_key``, one-hot encode the categorical columns,
    derive the ``voluntary_turnover`` target from the resignation dummy, and
    drop the termination dummies, the ``employeeid`` / ``status_year``
    columns and every remaining text column except ``city_name`` and
    ``store_name``.

    The caller's frame is not modified; all work happens on a copy.

    Args:
        turnover_trend: DataFrame containing ``recorddate_key``,
            ``employeeid``, ``status_year`` and the columns listed in
            ``categorical_features`` below. Column-name case is ignored.

    Returns:
        A new DataFrame with lower-cased column names, the one-hot encoded
        categoricals, a ``year`` period column and an integer
        ``voluntary_turnover`` column.

    Raises:
        KeyError: if a required column, or one of the termination dummy
            columns listed in ``termination_dummies``, is missing.
    """
    # Work on a copy so the caller's frame is neither renamed nor extended.
    df = turnover_trend.copy()
    # Normalise names before the first lookup so mixed-case headers work too.
    df.columns = [col.lower() for col in df.columns]
    df['year'] = pd.to_datetime(df['recorddate_key']).dt.to_period('Y')

    # get dummies for all categoricals, drop originals
    categorical_features = ['job_title', 'gender_short', 'termreason_desc',
                            'termtype_desc', 'business_unit', 'department_name']
    df = pd.get_dummies(df, columns=categorical_features, drop_first=False)
    # Dummy names embed the category values, so lower-case them again.
    df.columns = [col.lower() for col in df.columns]

    # 1 = row's termination reason is the resignation category, 0 = anything else.
    # NOTE(review): 'resignaton' is spelled exactly as the lookup key was
    # written; presumably it mirrors the source data - verify before "fixing".
    df['voluntary_turnover'] = df['termreason_desc_resignaton'].astype(int)

    # The termination dummies would leak the target, so they are removed.
    termination_dummies = ['termreason_desc_layoff',
                           'termreason_desc_not applicable',
                           'termreason_desc_resignaton',
                           'termreason_desc_retirement',
                           'termtype_desc_involuntary',
                           'termtype_desc_not applicable',
                           'termtype_desc_voluntary']

    # Text columns cannot be averaged later on. Both the classic object dtype
    # and pandas' dedicated string dtype are treated as text.
    text_cols = [
        col for col, dtype in df.dtypes.items()
        if dtype == 'object' or isinstance(dtype, pd.StringDtype)
    ]

    # Keep the columns the data may later be grouped / modelled by.
    model_by_cols = {'city_name', 'store_name'}
    drop_cols = termination_dummies + [col for col in text_cols if col not in model_by_cols]

    return df.drop(columns=drop_cols + ['employeeid', 'status_year'])

def yearify_features(df):
    """Collapse the frame to one row per ``year`` holding column means.

    ``city_name`` and ``store_name`` are discarded first, every remaining
    column is averaged within each year, and the averaged
    ``voluntary_turnover`` is multiplied by 100. The input frame is left
    unchanged.

    Args:
        df: DataFrame with ``year``, ``city_name``, ``store_name`` and
            ``voluntary_turnover`` columns; all other columns must support
            ``mean``.

    Returns:
        A new DataFrame indexed by ``year``.
    """
    without_location = df.drop(columns=['city_name', 'store_name'])
    yearly_means = without_location.groupby('year').mean(numeric_only=False)
    # Rescale the averaged 0/1 flag by a factor of 100.
    yearly_means['voluntary_turnover'] = yearly_means['voluntary_turnover'] * 100
    return yearly_means

def city_store_yearify(df):
    """Average every column per (``year``, ``city_name``, ``store_name``) group.

    After averaging, the group keys are turned back into columns, ``year``
    alone becomes the index, and the averaged ``voluntary_turnover`` is
    multiplied by 100. The input frame is left unchanged.

    Args:
        df: DataFrame with ``year``, ``city_name``, ``store_name`` and
            ``voluntary_turnover`` columns; all other columns must support
            ``mean``.

    Returns:
        A new DataFrame indexed by ``year`` with one row per group and
        ``city_name`` / ``store_name`` as ordinary columns.
    """
    group_means = df.groupby(['year', 'city_name', 'store_name']).mean(numeric_only=False)
    per_store = group_means.reset_index().set_index(['year'])
    # Rescale the averaged 0/1 flag by a factor of 100.
    per_store['voluntary_turnover'] = per_store['voluntary_turnover'] * 100
    return per_store

def load_and_process_data(turnover_path="../data/10yr_turnover.csv",
                          economic_path="../data/economic_data.xlsx"):
    """Load the raw files and build the city/store/year modelling matrix.

    The turnover CSV is read and its column names lower-cased, then the
    ``annual_data`` sheet of the economic workbook is left-joined on
    ``status_year`` == ``year``. The merged frame goes through
    ``preprocess_turnover_data`` and ``city_store_yearify``; ``city_name``
    and ``store_name`` are then one-hot encoded and the column names
    sanitised.

    Args:
        turnover_path: Path of the turnover CSV file.
        economic_path: Path of the Excel workbook holding the
            ``annual_data`` sheet.

    Returns:
        ``(X, y)`` where ``y`` is the ``voluntary_turnover`` column of the
        aggregated frame and ``X`` is every other column.
    """
    turnover_trend = pd.read_csv(turnover_path)
    turnover_trend.columns = [col.lower() for col in turnover_trend.columns]
    economic_data = pd.read_excel(economic_path, sheet_name='annual_data')
    turnover_trend = turnover_trend.merge(economic_data, how='left', left_on='status_year', right_on='year')

    # Only the city/store/year aggregation feeds the returned matrices; the
    # year-only aggregation used to be computed here and then thrown away.
    preprocessed_turnover_trend = preprocess_turnover_data(turnover_trend)
    city_store_yr_df = city_store_yearify(preprocessed_turnover_trend)

    city_store_yr_df = pd.get_dummies(city_store_yr_df, columns=['city_name', 'store_name'], drop_first=False)

    # Fix column names to remove special characters that cause issues with LightGBM:
    # replace anything outside [A-Za-z0-9_], collapse runs of underscores,
    # then trim leading/trailing underscores.
    city_store_yr_df.columns = (
        city_store_yr_df.columns
        .str.replace('[^A-Za-z0-9_]', '_', regex=True)
        .str.replace('__+', '_', regex=True)
        .str.strip('_')
    )

    X = city_store_yr_df.drop(columns=['voluntary_turnover'])
    y = city_store_yr_df['voluntary_turnover']  # Use actual percentage values
    return X, y

def select_top_features(X, y, top_n=30):
    """Keep the most important features plus every ``city_name`` column.

    A random forest (100 trees, ``random_state=42``) is fitted on ``X`` and
    ``y``; the ``top_n`` columns with the highest importance are kept,
    together with every column whose name contains ``'city_name'``.

    The returned columns follow their original order in ``X``. Building the
    selection from a bare ``set`` would make the column order depend on
    string hashing, which can differ from one interpreter run to the next.

    Args:
        X: Feature DataFrame.
        y: Target values aligned with ``X``.
        top_n: Number of top-importance features to keep (default 30).

    Returns:
        A DataFrame containing only the selected columns of ``X``.
    """
    from sklearn.ensemble import RandomForestRegressor

    keep_cols = [col for col in X.columns if 'city_name' in col]

    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf.fit(X, y)
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': rf.feature_importances_
    }).sort_values('importance', ascending=False)

    top_features = feature_importance.head(top_n)['feature'].tolist()
    columns_to_keep = set(keep_cols).union(top_features)

    # Filter in X's own column order so the result is reproducible.
    ordered_cols = [col for col in X.columns if col in columns_to_keep]
    return X[ordered_cols]
def fit_model(X, y, param_grid=None, test_size=0.2, random_state=42, cv=5):
    """Grid-search a LightGBM regressor and print hold-out diagnostics.

    The data is split into train and test parts, a ``GridSearchCV`` scored
    by R² is run on the training part, and the estimator exposed as
    ``best_estimator_`` is evaluated on both parts. The best parameters, CV
    score, R², MSE, RMSE and MAE are printed, followed by a message about
    the train/test R² gap.

    Args:
        X: Feature matrix.
        y: Target values aligned with ``X``.
        param_grid: Hyperparameter grid passed to ``GridSearchCV``. ``None``
            selects the built-in grid below.
        test_size: Fraction of rows held out for testing (default 0.2).
        random_state: Seed for the split and the regressor (default 42).
        cv: Number of cross-validation folds (default 5).

    Returns:
        The best estimator found by the grid search.
    """
    import lightgbm as lgb
    from sklearn.model_selection import GridSearchCV
    from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    if param_grid is None:
        # Default grid: 2*2*2*2*2*2*3*3 = 576 candidates per fold.
        # NOTE(review): with max_depth <= 3 a tree has at most 2**3 = 8
        # leaves, so the num_leaves candidates (15, 30) look like they can
        # never be the binding limit. LightGBM also documents that bagging
        # (subsample) only takes effect when subsample_freq > 0, which is
        # left at its default here. If confirmed, those two axes quadruple
        # the search without changing any model.
        param_grid = {
            'n_estimators': [50, 100],
            'max_depth': [2, 3],
            'learning_rate': [0.05, 0.1],
            'num_leaves': [15, 30],
            'subsample': [0.8, 1.0],
            'colsample_bytree': [0.8, 1.0],
            'reg_alpha': [0, 0.1, 0.5],
            'reg_lambda': [0.1, 1.0, 2.0]
        }

    lgb_model = lgb.LGBMRegressor(random_state=random_state, verbose=-1)

    # Cross-validated search on the training part only.
    grid_search = GridSearchCV(
        estimator=lgb_model,
        param_grid=param_grid,
        cv=cv,
        scoring='r2',
        n_jobs=-1,
        verbose=1
    )
    grid_search.fit(X_train, y_train)
    best_lgbm_model = grid_search.best_estimator_

    # Hold-out evaluation.
    y_pred = best_lgbm_model.predict(X_test)
    train_score = best_lgbm_model.score(X_train, y_train)
    test_score = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mse)

    print("Best LightGBM Model Results:")
    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best CV R² score: {grid_search.best_score_:.4f}")
    print(f"Training R²: {train_score:.4f}")
    print(f"Test R²: {test_score:.4f}")
    print(f"Test MSE: {mse:.4f}")
    print(f"Test RMSE: {rmse:.4f}")
    print(f"Test MAE: {mae:.4f}")
    print(f"R² difference (train - test): {train_score - test_score:.4f}")

    # Overfitting means training beats the hold-out score; a hold-out score
    # that is *higher* than the training score must not trigger the warning.
    if train_score - test_score > 0.1:
        print("Warning: Potential overfitting detected!")
    else:
        print("Model appears to generalize well.")

    return best_lgbm_model

def get_shap(best_lgbm_model, X):
    """Compute SHAP values for ``X`` and draw two summary plots.

    A ``shap.TreeExplainer`` is built for the model, its SHAP values for
    ``X`` are wrapped in a DataFrame labelled with ``X``'s column names, and
    two summary plots are drawn: a bar plot limited to 10 features, then one
    with the default plot type.

    Args:
        best_lgbm_model: Fitted tree model accepted by ``shap.TreeExplainer``.
        X: Feature DataFrame to explain.

    Returns:
        DataFrame of SHAP values with the same columns as ``X``.
    """
    import shap

    tree_explainer = shap.TreeExplainer(best_lgbm_model)
    shap_frame = pd.DataFrame(tree_explainer.shap_values(X), columns=X.columns)
    contributions = shap_frame.values

    shap.summary_plot(contributions, X, plot_type="bar", max_display=10)
    shap.summary_plot(contributions, X)
    return shap_frame

# city / store / year model

In [None]:
# Build the city/store/year feature matrix X and the turnover target y.
X, y = load_and_process_data()
# NOTE(review): features are ranked on all rows here, before fit_model()
# carves out its hold-out split, so the hold-out rows influence which
# columns are kept - confirm this leakage is acceptable.
X = select_top_features(X, y)
# Grid-search LightGBM; diagnostics are printed by fit_model().
best_lgbm_model = fit_model(X, y)

# SHAP values are computed over the full X (training and hold-out rows alike).
shap_values = get_shap(best_lgbm_model, X)

In [None]:
# Let's try several approaches to reduce overfitting

# Everything this cell needs is imported here: the metric helpers and
# GridSearchCV are otherwise only imported inside fit_model(), and
# KNeighborsRegressor is not imported anywhere else.
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler

# fit_model() keeps its train/test split local, so rebuild it here with the
# same test_size and random_state to evaluate on the same hold-out rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 1. Scale the features first - KNN is sensitive to feature scales
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 2. Use much higher k values and simpler approach
param_grid_improved = {
    'n_neighbors': [50, 75, 100, 150],  # Much higher k values
    'weights': ['uniform'],  # Only uniform weights to reduce complexity
    'metric': ['euclidean']  # Only euclidean distance
}

# 3. Create new KNN with regularization-like effect
knn_improved = KNeighborsRegressor()

grid_search_improved = GridSearchCV(
    estimator=knn_improved,
    param_grid=param_grid_improved,
    cv=10,  # More folds for better validation
    scoring='r2',
    n_jobs=-1,
    verbose=1
)

# Fit with scaled data
grid_search_improved.fit(X_train_scaled, y_train)
best_knn_improved = grid_search_improved.best_estimator_

# Make predictions
y_pred_improved = best_knn_improved.predict(X_test_scaled)

# Calculate metrics
train_score_improved = best_knn_improved.score(X_train_scaled, y_train)
test_score_improved = r2_score(y_test, y_pred_improved)
mse_improved = mean_squared_error(y_test, y_pred_improved)
mae_improved = mean_absolute_error(y_test, y_pred_improved)
rmse_improved = np.sqrt(mse_improved)

print("Improved KNN Model Results:")
print(f"Best parameters: {grid_search_improved.best_params_}")
print(f"Best CV R² score: {grid_search_improved.best_score_:.4f}")
print(f"Training R²: {train_score_improved:.4f}")
print(f"Test R²: {test_score_improved:.4f}")
print(f"Test MSE: {mse_improved:.4f}")
print(f"Test RMSE: {rmse_improved:.4f}")
print(f"Test MAE: {mae_improved:.4f}")
print(f"R² difference (train - test): {train_score_improved - test_score_improved:.4f}")

# Overfitting means training beats the hold-out score; a hold-out score that
# is higher than the training score must not trigger the warning.
if train_score_improved - test_score_improved > 0.1:
    print("Warning: Still overfitting detected!")
else:
    print("Model generalization improved.")
