# Advanced Employee Turnover Prediction Notebook
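This notebook contains the full employee-turnover pipeline: feature engineering, preprocessing, feature selection, SMOTE oversampling, hyperparameter tuning of random-forest and gradient-boosting classifiers, and a final soft-voting ensemble.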

Collecting scikit-learn==0.13
  Downloading scikit-learn-0.13.tar.gz (3.5 MB)
     ---------------------------------------- 0.0/3.5 MB ? eta -:--:--
     ---------------------------------------- 0.0/3.5 MB ? eta -:--:--
     - -------------------------------------- 0.1/3.5 MB 1.4 MB/s eta 0:00:03
     ------------ --------------------------- 1.0/3.5 MB 8.3 MB/s eta 0:00:01
     ------------ --------------------------- 1.0/3.5 MB 8.3 MB/s eta 0:00:01
     ------------ --------------------------- 1.0/3.5 MB 8.3 MB/s eta 0:00:01
     ------------ --------------------------- 1.0/3.5 MB 8.3 MB/s eta 0:00:01
     ------------ --------------------------- 1.0/3.5 MB 8.3 MB/s eta 0:00:01
     ------------ --------------------------- 1.0/3.5 MB 8.3 MB/s eta 0:00:01
     ------------ --------------------------- 1.0/3.5 MB 8.3 MB/s eta 0:00:01
     ------------ --------------------------- 1.0/3.5 MB 8.3 MB/s eta 0:00:01
     ------------ --------------------------- 1.0/3.5 MB 8.3 MB/s eta 0:00:01


  error: subprocess-exited-with-error
  
  python setup.py bdist_wheel did not run successfully.
  exit code: 1
  
  [1003 lines of output]
  Partial import of sklearn during the build process.
  
    `numpy.distutils` is deprecated since NumPy 1.23.0, as a result
    of the deprecation of `distutils` itself. It will be removed for
    Python >= 3.12. For older Python versions it will remain present.
    It is recommended to use `setuptools < 60.0` for those Python versions.
    For more details, see:
      https://numpy.org/devdocs/reference/distutils_status_migration.html
  
  
    from numpy.distutils.core import setup
  Copying source tree into build/py3k for 2to3 transformation...
    import lib2to3.main
  Converting to Python3 via 2to3...
  RefactoringTool: Skipping optional fixer: buffer
  RefactoringTool: Skipping optional fixer: idioms
  RefactoringTool: Skipping optional fixer: set_literal
  RefactoringTool: Skipping optional fixer: ws_comma
  RefactoringTool: Refactored C:\U

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import pickle
import warnings

# Install a blanket "ignore" filter so no warning of any category is shown.
warnings.simplefilter('ignore')

# Single seed reused by the train/test split, SMOTE and both classifiers.
RANDOM_STATE = 42


ImportError: cannot import name '_MissingValues' from 'sklearn.utils._param_validation' (C:\Users\Rajiv Arora\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py)

In [14]:
def load_data(filename: str = "turnover.csv") -> pd.DataFrame:
    """Load the turnover CSV from the script directory or the CWD.

    The script directory (when the code runs from a file, i.e. ``__file__``
    is defined) is searched first, then the current working directory.
    Inside a notebook kernel ``__file__`` is absent, so only the CWD is
    searched. Duplicate directories are probed once.

    Parameters
    ----------
    filename : str, default "turnover.csv"
        Name of the CSV file to look for in each candidate directory.

    Returns
    -------
    pd.DataFrame
        The file parsed with ``encoding="ISO-8859-1"``.

    Raises
    ------
    FileNotFoundError
        If no candidate directory contains ``filename``.
    """
    search_dirs = []
    # `Path().resolve()` is just the CWD, so it cannot stand in for the
    # script directory; use __file__ when it is actually available.
    module_file = globals().get("__file__")
    if module_file:
        search_dirs.append(Path(module_file).resolve().parent)
    search_dirs.append(Path.cwd().resolve())

    # dict.fromkeys drops repeated directories while keeping search order.
    candidates = [folder / filename for folder in dict.fromkeys(search_dirs)]
    for location in candidates:
        if location.is_file():
            df = pd.read_csv(location, encoding="ISO-8859-1")
            print(f"Loaded {df.shape[0]}×{df.shape[1]} from {location}")
            return df

    searched = ", ".join(str(path.parent) for path in candidates)
    raise FileNotFoundError(f"{filename} not found (searched: {searched}).")

def feature_engineering(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with nine derived columns appended.

    The input frame is not modified. Columns read: ``stag``, ``age``,
    ``coach``, ``extraversion``, ``independ``, ``selfcontrol``, ``anxiety``
    and ``novator``.

    Columns added, in this order: ``tenure_years``, ``age_group``,
    ``personality_score``, ``stress_level``, ``adaptability``,
    ``tenure_age_ratio``, ``coaching_impact``, ``high_risk``, ``low_risk``.
    """
    trait_cols = ['extraversion', 'independ', 'selfcontrol', 'anxiety', 'novator']

    enriched = df.copy()
    stag = enriched['stag']
    age = enriched['age']

    # Dividing by 12 presumably converts months to years — confirm the unit of `stag`.
    tenure_years = stag / 12.0
    # Row-wise average of the five trait columns.
    personality = enriched[trait_cols].mean(axis=1)
    # 1 only where coach is exactly the string 'yes'; any other value gives 0.
    coached = (enriched['coach'] == 'yes').astype(int)

    derived = {
        'tenure_years': tenure_years,
        # Right-closed bins: (0,25], (25,35], (35,45], (45,100].
        'age_group': pd.cut(
            age,
            bins=[0, 25, 35, 45, 100],
            labels=['Young', 'Mid', 'Senior', 'Veteran'],
        ),
        'personality_score': personality,
        'stress_level': enriched['anxiety'] + enriched['selfcontrol'],
        'adaptability': enriched['novator'] + enriched['extraversion'],
        'tenure_age_ratio': tenure_years / age,
        'coaching_impact': coached * personality,
        'high_risk': ((stag < 12) & (age < 30)).astype(int),
        'low_risk': ((stag > 60) & (age > 35)).astype(int),
    }
    # Assign one at a time so the new columns land in the listed order.
    for column, values in derived.items():
        enriched[column] = values
    return enriched

def build_preprocessor(cat_cols, num_cols):
    """Build the ColumnTransformer used as the first pipeline stage.

    Parameters
    ----------
    cat_cols : columns routed to a dense ``OneHotEncoder`` configured with
        ``handle_unknown='ignore'``.
    num_cols : columns routed to a ``StandardScaler``.

    Returns
    -------
    ColumnTransformer with two entries named ``'cat'`` and ``'num'``.
    """
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    scaler = StandardScaler()
    transformers = [
        ('cat', encoder, cat_cols),
        ('num', scaler, num_cols),
    ]
    return ColumnTransformer(transformers)

def tune_classifier(pipeline, param_grid, X_train, y_train, name, cv=5, scoring='accuracy'):
    """Grid-search ``pipeline`` over ``param_grid`` and return the best estimator.

    Parameters
    ----------
    pipeline : estimator handed to ``GridSearchCV``.
    param_grid : parameter grid handed to ``GridSearchCV``.
    X_train, y_train : data passed to ``GridSearchCV.fit``.
    name : label used only in the printed "Best ... params" line.
    cv : cross-validation setting forwarded to ``GridSearchCV`` (default 5,
        the value that was previously hard-coded).
    scoring : metric forwarded to ``GridSearchCV`` (default ``'accuracy'``,
        the value that was previously hard-coded). Pass e.g. ``'roc_auc'``
        or ``'f1'`` when accuracy is a poor fit for an imbalanced target.

    Returns
    -------
    The search's ``best_estimator_``.
    """
    # The search runs with n_jobs=-1 and verbose=1, as before.
    grid = GridSearchCV(pipeline, param_grid, cv=cv, scoring=scoring, n_jobs=-1, verbose=1)
    grid.fit(X_train, y_train)
    print(f"Best {name} params: {grid.best_params_}")
    return grid.best_estimator_


In [15]:
# ---- Data preparation ----
# Read the raw table and append the engineered columns.
df = load_data()
df = feature_engineering(df)
# pop() removes the 'event' column from df, so df holds only features afterwards.
y = df.pop('event')
X = df

# ---- Hold-out split: 20% test, stratified on y ----
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE,
)

# ---- Column roles ----
cat_cols = [
    'gender', 'industry', 'profession', 'traffic', 'coach',
    'head_gender', 'greywage', 'way', 'age_group',
]
# Every column not listed as categorical is sent to the scaler.
num_cols = [col for col in X.columns if col not in cat_cols]

# ---- Shared preprocessor ----
preproc = build_preprocessor(cat_cols, num_cols)


def _with_common_stages(final_step):
    """Return an ImbPipeline of prep -> select -> SMOTE followed by ``final_step``."""
    return ImbPipeline([
        ('prep', preproc),
        ('select', SelectKBest(f_classif, k=50)),
        ('smote', SMOTE(random_state=RANDOM_STATE)),
        final_step,
    ])


# ---- Candidate pipelines and their search grids ----
rf_pipe = _with_common_stages(
    ('clf', RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1))
)
rf_params = {
    'clf__n_estimators': [200, 300],
    'clf__max_depth': [10, 15],
    'clf__min_samples_split': [4, 5],
    'clf__min_samples_leaf': [1, 2],
}

gb_pipe = _with_common_stages(
    ('clf', GradientBoostingClassifier(random_state=RANDOM_STATE))
)
gb_params = {
    'clf__n_estimators': [100, 200],
    'clf__learning_rate': [0.05, 0.1],
    'clf__max_depth': [6, 8],
    'clf__min_samples_split': [3, 5],
    'clf__min_samples_leaf': [1, 3],
}

# ---- Hyperparameter search ----
best_rf = tune_classifier(rf_pipe, rf_params, X_train, y_train, 'RandomForest')
best_gb = tune_classifier(gb_pipe, gb_params, X_train, y_train, 'GradientBoosting')

# ---- Soft-voting ensemble built from the two tuned classifier steps ----
tuned_members = [
    ('rf', best_rf.named_steps['clf']),
    ('gb', best_gb.named_steps['clf']),
]
vote_clf = VotingClassifier(estimators=tuned_members, voting='soft', n_jobs=-1)
final_pipe = _with_common_stages(('vote', vote_clf))
final_pipe.fit(X_train, y_train)

# ---- Evaluation on the held-out split ----
y_pred = final_pipe.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))
# Column 1 of predict_proba is used as the score for ROC AUC.
y_score = final_pipe.predict_proba(X_test)[:, 1]
print('ROC AUC:', roc_auc_score(y_test, y_score))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# ---- Persist the fitted pipeline ----
model_path = 'employee_turnover_optimized.pkl'
with open(model_path, 'wb') as f:
    pickle.dump(final_pipe, f)


Loaded 1129×16 from C:\Users\Rajiv Arora\Desktop\Random Forest\turnover.csv


NameError: name 'RANDOM_STATE' is not defined