In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, confusion_matrix, f1_score
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
from sklearn.metrics import roc_auc_score, roc_curve
warnings.filterwarnings("ignore")
%matplotlib inline

# Load the HR dataset; the path is resolved relative to the working directory.
df = pd.read_csv('HR_comma_sep.csv.txt')

# Initial data exploration.
# A bare expression in the middle of a cell/script is evaluated and then
# thrown away, so the preview is printed explicitly to make it visible.
print(df.head())

# Data cleaning and transformation: give the columns shorter, camel-cased
# names ('sales' becomes 'department', 'left' becomes 'turnover').
# NOTE(review): 'average_montly_hours' (sic) presumably mirrors the spelling
# in the CSV header -- confirm against the file before "correcting" it,
# because rename() silently ignores keys that match no column.
df = df.rename(columns={'satisfaction_level': 'satisfaction', 'last_evaluation': 'evaluation', 'number_project': 'projectCount',
                        'average_montly_hours': 'averageMonthlyHours', 'time_spend_company': 'yearsAtCompany',
                        'Work_accident': 'workAccident', 'promotion_last_5years': 'promotion', 'sales': 'department', 'left': 'turnover'})

# Per-column missing-value flags, column dtypes and summary statistics
# (rounded to 2 decimals). Printed one by one for the same reason as above:
# the original tuple expression produced no output.
print(df.isnull().any())
print(df.dtypes)
print(round(df.describe(), 2))

# Correlation heatmap
# df still contains the 'department' and 'salary' columns at this point (they
# are only dropped from a copy further down, presumably because they hold
# text). pandas >= 2.0 no longer skips non-numeric columns in corr() by
# default and raises on them instead, so the numeric/bool columns are
# selected explicitly. This gives the same matrix on older pandas versions.
plt.figure(figsize=(15, 10))
corr = df.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(corr, annot=True)
plt.title('Heatmap of Correlation Matrix')
plt.show()

# Feature engineering: build the modelling frame without the 'department'
# and 'salary' columns. drop() returns a new frame, so df is left untouched.
df1 = df.drop(columns=['department', 'salary'])

# Separate the 'turnover' target from the remaining predictor columns.
y = df1['turnover']
X = df1.drop(columns='turnover')

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=123,
)

# Handle class imbalance with SMOTE on the training part only (the test part
# is never resampled). sampling_strategy=1.0 requests a 1:1 ratio of
# minority to majority samples after resampling.
sm = SMOTE(sampling_strategy=1.0, random_state=12)
X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)

# Model training and evaluation (Logistic Regression)
# Fitted on the SMOTE-balanced training data, scored on the untouched test set.
# max_iter is raised above the default of 100: the features are fed in
# unscaled, so the solver may need more iterations to converge, and a
# ConvergenceWarning would be hidden by the warnings filter set at the top
# of the file.
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_sm, y_train_sm)
y_pred_lr = lr.predict(X_test)
print(classification_report(y_test, y_pred_lr))

# Model training and evaluation (Random Forest)
# Seeded so the printed report is reproducible from run to run, in line with
# the seeded train/test split and SMOTE steps.
rf = RandomForestClassifier(random_state=123)
rf.fit(X_train_sm, y_train_sm)
y_pred_rf = rf.predict(X_test)
print(classification_report(y_test, y_pred_rf))

# Confusion matrix and ROC curve for Logistic Regression
# Counts are drawn as integers (fmt="d") on a heatmap.
cm = confusion_matrix(y_test, y_pred_lr)
sns.heatmap(cm, annot=True, fmt="d")
plt.title('Confusion Matrix for Logistic Regression')
plt.show()

# The ROC curve is built from the predicted probability of the second class
# (column 1 of predict_proba), not from the hard 0/1 predictions.
fpr, tpr, _ = roc_curve(y_test, lr.predict_proba(X_test)[:, 1])
plt.plot(fpr, tpr, label='Logistic Regression ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
# Without an explicit legend() call the label passed to plot() is never drawn.
plt.legend()
plt.show()
