In [3]:
pip install xgboost


Collecting xgboost
  Downloading xgboost-3.0.0-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.0-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB 325.1 kB/s eta 0:07:42
   ---------------------------------------- 0.1/150.0 MB 726.2 kB/s eta 0:03:27
   ---------------------------------------- 0.1/150.0 MB 708.1 kB/s eta 0:03:32
   ---------------------------------------- 0.2/150.0 MB 981.9 kB/s eta 0:02:33
   ---------------------------------------- 0.3/150.0 MB 944.1 kB/s eta 0:02:39
   ---------------------------------------- 0.4/150.0 MB 1.1 MB/s eta 0:02:15
   ---------------------------------------- 0.5/150.0 MB 1.2 MB/s eta 0:02:02
   ---------------------------------------- 0.5/150.0 MB 1.2 MB/s eta 0:02:09
   --

In [13]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
from sklearn.utils.class_weight import compute_class_weight
import xgboost as xgb

# Assuming df is your DataFrame, check its columns
print(df.columns)

# Step 1: Define the Problem
# Target variable: Attrition
# Features: Age, JobSatisfaction, PerformanceRating, MonthlyIncome, WorkLifeBalance, etc.

# Create feature set X and target variable y
X = df[['Age', 'JobSatisfaction', 'PerformanceRating', 'MonthlyIncome', 'WorkLifeBalance', 'YearsAtCompany', 'TotalWorkingYears']]
y = df['Attrition'].apply(lambda x: 1 if x == 'Yes' else 0)  # Convert 'Yes'/'No' to 1/0

# Step 2: Split the Data into Training and Testing Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardizing continuous features (for SVM, k-NN, etc.)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 3: Model Building
# Initialize models
models = {
    'Logistic Regression': LogisticRegression(class_weight='balanced', max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(class_weight='balanced'),
    'Random Forest': RandomForestClassifier(class_weight='balanced'),
    'Gradient Boosting': GradientBoostingClassifier(),
    'SVM': SVC(probability=True, class_weight='balanced')
}

# Train and evaluate each model
results = {}

for model_name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    
    # Store results
    results[model_name] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': classification_report(y_test, y_pred, output_dict=True, zero_division=1)['macro avg']['precision'],
        'Recall': classification_report(y_test, y_pred, output_dict=True, zero_division=1)['macro avg']['recall'],
        'F1-Score': classification_report(y_test, y_pred, output_dict=True, zero_division=1)['macro avg']['f1-score'],
        'ROC-AUC': roc_auc_score(y_test, model.predict_proba(X_test_scaled)[:, 1])
    }
    
    print(f"Model: {model_name}")
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred, zero_division=1))
    print('-' * 50)

# Step 4: Model Optimization
# Hyperparameter tuning using GridSearchCV for Random Forest (as an example)

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

grid_search = GridSearchCV(estimator=RandomForestClassifier(class_weight='balanced'), param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_scaled, y_train)
print("Best Hyperparameters for Random Forest:", grid_search.best_params_)

# Train the best model and evaluate
best_rf_model = grid_search.best_estimator_
y_pred_rf = best_rf_model.predict(X_test_scaled)

print(f"Random Forest - Optimized Model Evaluation:")
print(confusion_matrix(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf, zero_division=1))

# Step 5: Model Evaluation
# Compare models based on accuracy, precision, recall, f1-score, and ROC-AUC
print("Model Comparison:")
for model_name, metrics in results.items():
    print(f"{model_name}: Accuracy={metrics['Accuracy']:.4f}, Precision={metrics['Precision']:.4f}, Recall={metrics['Recall']:.4f}, F1-Score={metrics['F1-Score']:.4f}, ROC-AUC={metrics['ROC-AUC']:.4f}")


Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')
Model: Logistic Regression
[[159  96]
 [ 15  24]]
              precision    recall  f1-score   support

           0       0.91      0.62      0.74       255
           1       0.20      0.62      0.30        39

    accuracy                           0.62       294
   macro avg       0.

In [19]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier

# Assuming you have a dataset named df
# Let's first look at the columns
print(df.columns)

# Step 1: Feature Engineering
# Creating a pipeline for preprocessing the data
# Let's assume 'Age', 'JobSatisfaction', 'PerformanceRating' are continuous variables
# and other variables are categorical like 'Gender', 'JobRole', etc.

# Select the features and target
X = df[['Age', 'JobSatisfaction', 'PerformanceRating', 'MonthlyIncome', 'WorkLifeBalance', 'JobRole', 'Gender']]
y = df['Attrition'].apply(lambda x: 1 if x == 'Yes' else 0)

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Preprocessing Pipeline
# We will handle missing values and perform encoding and scaling using a pipeline
# Categorical variables will be one-hot encoded, continuous variables will be scaled.

# Preprocessing for numeric features
numeric_features = ['Age', 'JobSatisfaction', 'PerformanceRating', 'MonthlyIncome', 'WorkLifeBalance']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Handle missing values
    ('scaler', StandardScaler())  # Scaling the numeric data
])

# Preprocessing for categorical features
categorical_features = ['JobRole', 'Gender']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Handle missing values
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One hot encoding
])

# Combine both into a single column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Step 3: Model Building with RandomForestClassifier as an example
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(class_weight='balanced', random_state=42))
])

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Step 4: Hyperparameter Tuning (Optional)
# If needed, you can tune the model using GridSearchCV
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [10, 20, None],
    'classifier__min_samples_split': [2, 5],
    'classifier__min_samples_leaf': [1, 2]
}

grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)
print(f"Best Hyperparameters: {grid_search.best_params_}")

# Evaluate the optimized model
best_model = grid_search.best_estimator_
y_pred_optimized = best_model.predict(X_test)
print("Optimized Classification Report:")
print(classification_report(y_test, y_pred_optimized))

# Step 5: Future Prediction
# Now let's predict for future data. Let's assume we have a new employee data (future data)

# Example future data (new employees or unseen data)
future_data = pd.DataFrame({
    'Age': [29, 40],
    'JobSatisfaction': [3, 2],
    'PerformanceRating': [4, 3],
    'MonthlyIncome': [4500, 6000],
    'WorkLifeBalance': [3, 2],
    'JobRole': ['Sales Executive', 'Research Scientist'],
    'Gender': ['Male', 'Female']
})

# Predict attrition for future data
future_predictions = best_model.predict(future_data)
future_predictions_proba = best_model.predict_proba(future_data)[:, 1]  # Probability of 'Attrition = 1'

# Display predictions
future_predictions_df = future_data.copy()
future_predictions_df['Attrition_Prediction'] = future_predictions
future_predictions_df['Attrition_Probability'] = future_predictions_proba
print(future_predictions_df)


Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.98      0.92       255
           1       0.33      0.05      0.09        39

    accuracy                           0.86       294
   macro avg       0.60      0.52      0.51     