In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Location of the IBM HR attrition CSV. A raw string is required here: in a normal
# string literal the sequences "\H" and "\d" are invalid escapes, which newer Python
# versions report as a SyntaxWarning. The resulting path value is unchanged.
data_path = r'D:\HR_Analytics_Attrition_Prediction\data/WA_Fn-UseC_-HR-Employee-Attrition.csv'  # Adjust path if needed
df = pd.read_csv(data_path)

# Display basic information: dimensions, column index and per-column null counts
print("Dataset Shape:", df.shape)
print("Columns:\n", df.columns)
print("\nMissing Values:\n", df.isnull().sum())

# View the first few rows (last expression, so the notebook renders it as a table)
df.head()

In [None]:
# Bar chart with one bar per value of the Attrition column
ax = sns.countplot(x='Attrition', data=df)
ax.set_title('Attrition Distribution')
plt.show()


In [None]:
# Per-department counts, split into one bar per Attrition value
ax = sns.countplot(x='Department', hue='Attrition', data=df)
ax.set_title('Attrition by Department')
# Tilt the department names so the labels do not overlap
plt.xticks(rotation=45)
plt.show()


In [None]:
# Histogram of MonthlyIncome with a KDE overlay, coloured by Attrition value
ax = sns.histplot(x='MonthlyIncome', hue='Attrition', kde=True, data=df)
ax.set_title('Monthly Income Distribution')
plt.show()


In [None]:
# Overview of the frame: dimensions, column index and a five-row preview
print("Dataset Shape:", df.shape)
print("Column Names:", df.columns)
preview = df.head()
print("First 5 Rows:\n", preview)

# Number of null entries in each column
missing_per_column = df.isna().sum()
print("\nMissing Values:\n", missing_per_column)

# describe() output for the frame
summary_stats = df.describe()
print("\nSummary Statistics:\n", summary_stats)


In [None]:
import pandas as pd

# Raw string literal, so the Windows backslashes are taken verbatim
data_path = r'D:\HR_Analytics_Attrition_Prediction\data\WA_Fn-UseC_-HR-Employee-Attrition.csv'

# Read the CSV again; this rebinds df to a freshly loaded frame
df = pd.read_csv(filepath_or_buffer=data_path)
print("Dataset loaded successfully!")


In [None]:
# Remove every row that contains at least one missing value.
# Reassigning the result instead of using inplace=True keeps the step explicit
# and avoids mutating a frame that earlier cells have already displayed.
df = df.dropna()
print(df)


In [None]:
# (rows, columns) of the current frame
n_rows, n_cols = df.shape
print((n_rows, n_cols))


In [None]:
# Per-column count of null entries
null_counts = df.isna().sum()
print(null_counts)


In [None]:
# Count rows that are exact copies of an earlier row
n_duplicates = df.duplicated().sum()
print("Number of duplicate rows:", n_duplicates)

# Drop duplicates if any. Reassigning instead of inplace=True keeps the step
# explicit and avoids mutating a frame that earlier cells have already displayed.
df = df.drop_duplicates()


In [None]:
# dtype of every column
column_types = df.dtypes
print(column_types)


In [None]:
from sklearn.model_selection import train_test_split

# Separate the target column from the remaining (feature) columns
y = df['Attrition']
X = df.drop(columns=['Attrition'])

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)


In [None]:
# Report the shape of each of the four objects produced by the split
for label, part in (
    ("Training Features Shape:", X_train),
    ("Testing Features Shape:", X_test),
    ("Training Target Shape:", y_train),
    ("Testing Target Shape:", y_test),
):
    print(label, part.shape)


In [None]:
# dtype of every column, inspected before the encoding cells below
column_types = df.dtypes
print(column_types)


In [None]:
from sklearn.preprocessing import LabelEncoder

# Columns that are label-encoded (the values are replaced by integer codes)
binary_cols = ['Attrition', 'Gender', 'Over18', 'OverTime']

# A single encoder instance is refit on each column in turn, so after the loop
# it only holds the mapping of the last column
le = LabelEncoder()
for col in binary_cols:
    encoded = le.fit_transform(df[col])
    df[col] = encoded

# Check the first few rows of the updated columns
print(df[binary_cols].head())



In [None]:
# Categorical columns to expand into dummy columns
multi_cat_cols = [
    'BusinessTravel',
    'Department',
    'EducationField',
    'JobRole',
    'MaritalStatus',
]

# get_dummies replaces the listed columns with their dummy columns;
# drop_first=True omits the first level of each one
df = pd.get_dummies(data=df, columns=multi_cat_cols, drop_first=True)

# Preview the widened frame
print(df.head())


In [None]:
# dtype of every column after the encoding steps
column_types = df.dtypes
print(column_types)


In [None]:
from sklearn.model_selection import train_test_split

# Target: 'Attrition'; features: every other column
y = df['Attrition']
X = df.drop(columns=['Attrition'])

# 70/30 split with a fixed seed
# NOTE(review): the split is not stratified on y — confirm that is intended
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)
print("Training set size:", X_train.shape)
print("Testing set size:", X_test.shape)



In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Initialize the model.
# The features are standardised first and the iteration cap is raised: with the
# default max_iter=100 on unscaled columns the lbfgs solver typically stops early
# with a ConvergenceWarning. Because the scaler is part of the pipeline it is fitted
# on the training rows only, and predict() accepts the raw test features directly.
# The underlying estimator stays reachable as logistic_model[-1].
logistic_model = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=1000),
)

# Train the model
logistic_model.fit(X_train, y_train)

# Predict on the test set
y_pred = logistic_model.predict(X_test)

# Evaluate performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Build and fit the forest in one expression (fit() returns the estimator itself).
# class_weight='balanced' is passed to compensate for the class imbalance.
rf_model = RandomForestClassifier(
    random_state=42,
    class_weight='balanced',
).fit(X_train, y_train)

print("Random Forest training complete!")


In [None]:
# Class predictions of the fitted forest for the held-out rows
y_pred_rf = rf_model.predict(X=X_test)

In [None]:
# Compute the three evaluation artefacts for the random forest, then print them
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf)
rf_confusion = confusion_matrix(y_test, y_pred_rf)

print("Accuracy:", rf_accuracy)
print("Classification Report:\n", rf_report)
print("Confusion Matrix:\n", rf_confusion)


In [None]:
!pip install xgboost

In [None]:
# Smoke test: the import raises if the package is not available to this kernel
import xgboost

status_message = "XGBoost installed successfully!"
print(status_message)


In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Ratio of class-0 to class-1 rows in the training target, used as scale_pos_weight
# to handle class imbalance
n_negative = len(y_train[y_train == 0])
n_positive = len(y_train[y_train == 1])

xgb_model = XGBClassifier(
    scale_pos_weight=n_negative / n_positive,
    random_state=42,
)

# Fit on the training split and predict the held-out rows
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)

# Evaluation on the test split
xgb_accuracy = accuracy_score(y_test, y_pred_xgb)
xgb_report = classification_report(y_test, y_pred_xgb)
xgb_confusion = confusion_matrix(y_test, y_pred_xgb)
print("Accuracy:", xgb_accuracy)
print("Classification Report:\n", xgb_report)
print("Confusion Matrix:\n", xgb_confusion)


In [None]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

# Search space: 3 x 3 x 3 x 3 = 81 parameter combinations
param_grid = dict(
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
    n_estimators=[100, 200, 500],
    scale_pos_weight=[1, 2, 3],
)

# Base estimator whose parameters are varied by the search
xgb = XGBClassifier(random_state=42)

# Exhaustive search with 3-fold cross-validation, ranked by F1 score
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    cv=3,
    scoring='f1',
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Report the winning combination and keep the corresponding estimator
print("Best Parameters:", grid_search.best_params_)
best_xgb_model = grid_search.best_estimator_


In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Hard-coded hyperparameters, described by the author as the best parameters
# NOTE(review): presumably copied from an earlier grid-search run — confirm they still
# match grid_search.best_params_. This rebinds best_xgb_model, replacing the
# estimator taken from the grid search.
tuned_params = {
    'learning_rate': 0.01,
    'max_depth': 3,
    'n_estimators': 200,
    'scale_pos_weight': 3,
    'random_state': 42,
}
best_xgb_model = XGBClassifier(**tuned_params)

# Fit on the training split
best_xgb_model.fit(X_train, y_train)

# Class predictions for the held-out rows
y_pred_tuned = best_xgb_model.predict(X_test)


In [None]:
# Compute the evaluation artefacts for the tuned model, then print them
tuned_accuracy = accuracy_score(y_test, y_pred_tuned)
tuned_report = classification_report(y_test, y_pred_tuned)
tuned_confusion = confusion_matrix(y_test, y_pred_tuned)

print("Accuracy:", tuned_accuracy)
print("Classification Report:\n", tuned_report)
print("Confusion Matrix:\n", tuned_confusion)


In [None]:
!pip install imbalanced-learn


In [None]:
import sklearn
import imblearn
from imblearn.over_sampling import SMOTE

# Record the library versions in use and confirm that SMOTE could be imported
print(f"Scikit-learn version: {sklearn.__version__}")
print(f"Imbalanced-learn version: {imblearn.__version__}")
print("SMOTE imported successfully!")


In [None]:
from sklearn.model_selection import train_test_split

# Target vector and feature matrix ('Attrition' is the target column)
y = df['Attrition']
X = df.drop(columns=['Attrition'])

# 70/30 train/test partition with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

print("Data split completed!")
print("Training data shape:", X_train.shape)
print("Testing data shape:", X_test.shape)


In [None]:
from sklearn.model_selection import train_test_split

# Same split as the preceding cell: target is 'Attrition', features are all other columns
target_column = 'Attrition'
y = df[target_column]
X = df.drop(columns=[target_column])

# Hold out 30% of the rows; random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

print("Data split completed!")
print("Training data shape:", X_train.shape)
print("Testing data shape:", X_test.shape)


In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the training split only; the test split is left untouched
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X_train, y_train)
X_train_smote, y_train_smote = resampled

print("SMOTE applied successfully!")
print("Original training data shape:", X_train.shape)
print("Resampled training data shape:", X_train_smote.shape)


In [None]:
# dtype of every column in the working frame
column_types = df.dtypes
print(column_types)


In [None]:
from sklearn.preprocessing import LabelEncoder

# Columns that are label-encoded (same list as the earlier encoding cell)
binary_cols = ['Attrition', 'Gender', 'Over18', 'OverTime']

# One encoder instance, refit for each column in turn
le = LabelEncoder()
for col in binary_cols:
    codes = le.fit_transform(df[col])
    df[col] = codes

print("Binary categorical columns encoded successfully!")


In [None]:
# One-hot encode multi-class categorical columns
multi_class_cols = ['BusinessTravel', 'Department', 'EducationField', 'JobRole', 'MaritalStatus']

# An earlier cell already one-hot encodes these columns, which removes the originals
# from df. On a top-to-bottom run none of them exist here, and passing missing names
# to pd.get_dummies raises a KeyError. Encode only the columns that are still present
# so the cell is safe to run and to re-run.
cols_to_encode = [col for col in multi_class_cols if col in df.columns]

if cols_to_encode:
    # Perform one-hot encoding
    df = pd.get_dummies(df, columns=cols_to_encode, drop_first=True)
    print("Multi-class categorical columns encoded successfully!")
else:
    print("Multi-class categorical columns already encoded; nothing to do.")


In [None]:
# dtype of every column after the encoding cells above
column_types = df.dtypes
print(column_types)


In [None]:
from sklearn.model_selection import train_test_split

# Target column and feature matrix
y = df['Attrition']
X = df.drop(columns=['Attrition'])

# 70/30 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

# Shapes of the four resulting objects
for label, part in (
    ("X_train shape:", X_train),
    ("X_test shape:", X_test),
    ("y_train shape:", y_train),
    ("y_test shape:", y_test),
):
    print(label, part.shape)

# Preview of the training features
print("\nFirst few rows of X_train:", X_train.head(), sep="\n")

# Class counts of the target in each split
print("\nClass distribution in y_train:", y_train.value_counts(), sep="\n")
print("\nClass distribution in y_test:", y_test.value_counts(), sep="\n")



In [None]:
from xgboost import XGBClassifier

# Hyperparameters described by the author as coming from hyperparameter tuning
# NOTE(review): scale_pos_weight=3 is applied on top of SMOTE-resampled data — confirm
# that weighting the positive class after oversampling is intended.
smote_model_params = {
    'learning_rate': 0.01,
    'max_depth': 3,
    'n_estimators': 200,
    'scale_pos_weight': 3,
    'random_state': 42,
}
best_xgb_model = XGBClassifier(**smote_model_params)

# Fit on the SMOTE-resampled training data
best_xgb_model.fit(X_train_smote, y_train_smote)

# Predict the test split (which was not resampled)
y_pred_smote = best_xgb_model.predict(X_test)

# Evaluation metrics on the test split
smote_accuracy = accuracy_score(y_test, y_pred_smote)
smote_report = classification_report(y_test, y_pred_smote)
smote_confusion = confusion_matrix(y_test, y_pred_smote)
print("Accuracy (SMOTE):", smote_accuracy)
print("\nClassification Report (SMOTE):\n", smote_report)
print("\nConfusion Matrix (SMOTE):\n", smote_confusion)


In [None]:
# predict_proba returns one column per class; column 1 holds the probability of class '1'
class_probabilities = best_xgb_model.predict_proba(X_test)
y_prob_smote = class_probabilities[:, 1]

# Label a row as class 1 when its probability reaches the chosen cut-off
# NOTE(review): the cut-off is evaluated on the test split itself — confirm it was
# chosen independently of the test data.
threshold = 0.3
meets_threshold = y_prob_smote >= threshold
y_pred_adjusted = meets_threshold.astype(int)

# Evaluation with the adjusted threshold
adjusted_accuracy = accuracy_score(y_test, y_pred_adjusted)
adjusted_report = classification_report(y_test, y_pred_adjusted)
adjusted_confusion = confusion_matrix(y_test, y_pred_adjusted)
print("Accuracy (Adjusted Threshold):", adjusted_accuracy)
print("Classification Report (Adjusted Threshold):\n", adjusted_report)
print("Confusion Matrix (Adjusted Threshold):\n", adjusted_confusion)


In [None]:
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt

# Precision and recall for each candidate threshold on the class-1 probabilities
precisions, recalls, thresholds = precision_recall_curve(y_test, y_prob_smote)

plt.figure(figsize=(8, 6))
# The last precision/recall entry is sliced off so each array lines up with thresholds
for scores, curve_label in ((precisions, 'Precision'), (recalls, 'Recall')):
    plt.plot(thresholds, scores[:-1], label=curve_label)
plt.title('Precision-Recall Curve')
plt.xlabel('Threshold')
plt.ylabel('Score')
plt.legend()
plt.grid()
plt.show()
