In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib

In [21]:
# Load your dataset
def load_data(file_path):
    data = pd.read_csv(file_path)
    print("Columns in the dataset:", data.columns)  # Print column names to check
    data.columns = data.columns.str.strip()  # Remove leading/trailing spaces
    return data

In [23]:
# Preprocess the data
def preprocess_data(data):
    # Define features and target variable
    X = data.drop('Attrition', axis=1)  # Features: all columns except 'Attrition'
    y = data['Attrition']  # Target variable: 'Attrition'
    
    # Convert categorical variables to numerical
    X = pd.get_dummies(X, drop_first=True)  # Apply one-hot encoding to features(converts categorical variables (like "Gender", "Job Role") into numerical format )
    
    return X, y

In [25]:
# Train the model
def train_model(X, y):

    #Random Forests create multiple decision trees by randomly selecting subsets of the data and features, 
    # and setting a fixed random_state ensures that the same trees are generated every time( random_state = 42)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  # test_size = 0.2 meaning test data 20 % 
    
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)
    
    # Evaluate the model
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    print(f'Model Accuracy: {accuracy:.2f}')
    
    return model

In [26]:
# Save the model
def save_model(model, filename):
    joblib.dump(model, filename)

In [27]:
# Main function to execute the training
if __name__ == '__main__':
    # Load and preprocess data
    data = load_data('dataset.csv')  # Replace with your dataset path
    X, y = preprocess_data(data)
    
    # Train and save the model
    model = train_model(X, y)
    save_model(model, 'attrition_model.pkl')

Columns in the dataset: Index(['Attrition', 'Business Travel', 'Department', 'Education Field',
       'emp no', 'Employee Number', 'Gender', 'Job Role', 'Marital Status',
       'Over Time', 'Training Times Last Year', 'Age', 'Distance From Home',
       'Education', 'Environment Satisfaction', 'Job Satisfaction',
       'Monthly Income', 'Num Companies Worked', 'Percent Salary Hike',
       'Relationship Satisfaction', 'Total Working Years', 'Years At Company',
       'Years In Current Role', 'Years With Curr Manager'],
      dtype='object')
Model Accuracy: 0.87
