In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

# Directory that receives every figure produced further down
save_path = r"C:\Users\georg\Documents\BPP\Professional Practice\Figures"

# Read the raw CSV and discard exact duplicate rows in a single chained step
df = pd.read_csv(
    r"C:\Users\georg\Documents\BPP\Professional Practice\IBM Employee Attrition.csv"
).drop_duplicates()

# Expand each listed categorical column into one indicator column per category
df_encoded = pd.get_dummies(
    df,
    columns=[
        'Attrition', 'BusinessTravel', 'Department', 'EducationField',
        'Gender', 'JobRole', 'MaritalStatus', 'OverTime',
    ],
)

# Numerical columns that are rescaled in the standardized copy
numerical_columns = [
    'Age',
    'DailyRate',
    'DistanceFromHome',
    'Education',
    'EnvironmentSatisfaction',
    'HourlyRate',
    'JobInvolvement',
    'JobLevel',
    'JobSatisfaction',
    'MonthlyIncome',
    'MonthlyRate',
    'NumCompaniesWorked',
    'PercentSalaryHike',
    'PerformanceRating',
    'RelationshipSatisfaction',
    'StockOptionLevel',
    'TotalWorkingYears',
    'TrainingTimesLastYear',
    'WorkLifeBalance',
    'YearsAtCompany',
    'YearsInCurrentRole',
    'YearsSinceLastPromotion',
    'YearsWithCurrManager',
]

# Standardize on a copy so df_encoded keeps the unscaled values
df_standardized = df_encoded.copy()
scaler = StandardScaler()
df_standardized[numerical_columns] = scaler.fit_transform(
    df_standardized[numerical_columns]
)

# Cast every boolean-typed column to int in both frames; the column list is
# taken from df_encoded before either frame is modified
boolean_columns = df_encoded.select_dtypes(include='bool').columns
df_standardized[boolean_columns] = df_standardized[boolean_columns].astype(int)
df_encoded[boolean_columns] = df_encoded[boolean_columns].astype(int)

# Subset of variables used for the correlation heatmaps
relevant_vars = [
    'Attrition_Yes',
    'JobSatisfaction',
    'OverTime_Yes',
    'DistanceFromHome',
    'MonthlyIncome',
    'Age',
    'EnvironmentSatisfaction',
    'WorkLifeBalance',
    'YearsAtCompany',
]
df_encoded_relevant, df_standardized_relevant = (
    df_encoded[relevant_vars],
    df_standardized[relevant_vars],
)

# Plotting correlation matrices
def plot_correlation_matrix(df, title, file_name):
    """Draw an annotated correlation heatmap of ``df`` and save it as a PNG.

    Args:
        df: DataFrame whose pairwise column correlations are plotted.
        title: Text shown above the heatmap.
        file_name: Output file name without extension; the image is written
            to ``{save_path}/{file_name}.png``.
    """
    plt.figure(figsize=(12, 10))
    # Round to two decimals so the cell annotations stay compact
    rounded_corr = df.corr().round(2)
    sns.heatmap(
        rounded_corr,
        annot=True,
        cmap='coolwarm',
        annot_kws={"size": 8},
    )
    plt.title(title)
    output_file = f"{save_path}/{file_name}.png"
    plt.savefig(output_file)
    # Close the figure so repeated calls do not accumulate open figures
    plt.close()

# One heatmap for the encoded data and one for the standardized data.
# NOTE(review): Pearson correlation is unchanged by standardization, so both
# heatmaps are expected to show the same coefficients -- confirm both are needed.
plot_correlation_matrix(
    df=df_encoded_relevant,
    title='Correlation Matrix (Original Encoded Data - Relevant Variables)',
    file_name='corr_matrix_encoded_relevant',
)
plot_correlation_matrix(
    df=df_standardized_relevant,
    title='Correlation Matrix (Standardized Data - Relevant Variables)',
    file_name='corr_matrix_standardized_relevant',
)

# Plotting distributions
def plot_distributions(df, columns, file_prefix):
    """Save one stacked histogram per column, split by 'Attrition_Yes'.

    Args:
        df: DataFrame containing every name in ``columns`` plus the
            'Attrition_Yes' column used as the hue.
        columns: Column names to plot; one figure is produced for each.
        file_prefix: Leading part of each output file name. Figures are
            written to ``{save_path}/{file_prefix}_{column}_distribution.png``.
    """
    for column in columns:
        plt.figure(figsize=(12, 6))
        ax = sns.histplot(df, x=column, hue='Attrition_Yes', multiple='stack', bins=30)
        plt.title(f'Distribution of {column} by Attrition')
        plt.xlabel(column)
        plt.ylabel('Frequency')
        # seaborn attaches the hue legend itself. Calling plt.legend() with no
        # handles finds no labelled artists, emits a "No artists with labels
        # found" warning and replaces that legend with an empty one. Retitle
        # the existing legend instead.
        legend = ax.get_legend()
        if legend is not None:
            legend.set_title('Attrition')
        plt.savefig(f"{save_path}/{file_prefix}_{column}_distribution.png")
        # Close the figure so the loop does not accumulate open figures
        plt.close()

# Histograms for the selected columns of the encoded (unscaled) data
plot_distributions(
    df=df_encoded,
    columns=['DistanceFromHome', 'JobSatisfaction', 'MonthlyIncome', 'YearsAtCompany'],
    file_prefix='distribution',
)

# Plotting box plots
def plot_box_plots(df, columns, file_prefix):
    """Save one box plot per column, grouped by 'Attrition_Yes'.

    Args:
        df: DataFrame containing every name in ``columns`` plus the
            'Attrition_Yes' column used on the x axis.
        columns: Column names to plot; one figure is produced for each.
        file_prefix: Leading part of each output file name. Figures are
            written to ``{save_path}/{file_prefix}_{column}_boxplot.png``.
    """
    for feature in columns:
        plt.figure(figsize=(12, 6))
        sns.boxplot(data=df, x='Attrition_Yes', y=feature)
        plt.title(f'Box Plot of {feature} by Attrition')
        plt.xlabel('Attrition')
        plt.ylabel(feature)
        output_file = f"{save_path}/{file_prefix}_{feature}_boxplot.png"
        plt.savefig(output_file)
        # Close the figure so the loop does not accumulate open figures
        plt.close()

# Box plots for the selected columns of the encoded (unscaled) data
plot_box_plots(
    df=df_encoded,
    columns=['DistanceFromHome', 'JobSatisfaction', 'MonthlyIncome', 'YearsAtCompany'],
    file_prefix='boxplot',
)

# Plotting count plots
def plot_count_plots(df, columns, file_prefix):
    """Save one count plot per column, with bars split by 'Attrition_Yes'.

    Args:
        df: DataFrame containing every name in ``columns`` plus the
            'Attrition_Yes' column used as the hue.
        columns: Column names to plot; one figure is produced for each.
        file_prefix: Leading part of each output file name. Figures are
            written to ``{save_path}/{file_prefix}_{column}_countplot.png``.
    """
    for category in columns:
        plt.figure(figsize=(10, 6))
        sns.countplot(data=df, x=category, hue='Attrition_Yes')
        plt.title(f'Count Plot of {category} by Attrition')
        plt.xlabel(category)
        plt.ylabel('Count')
        plt.legend(title='Attrition')
        output_file = f"{save_path}/{file_prefix}_{category}_countplot.png"
        plt.savefig(output_file)
        # Close the figure so the loop does not accumulate open figures
        plt.close()

# Count plots for the overtime and business-travel indicator columns
plot_count_plots(
    df=df_encoded,
    columns=['OverTime_Yes', 'BusinessTravel_Travel_Frequently', 'BusinessTravel_Travel_Rarely'],
    file_prefix='countplot',
)

# Predictor columns and the label column used for modelling
features = [
    'Age',
    'DistanceFromHome',
    'JobSatisfaction',
    'MonthlyIncome',
    'YearsAtCompany',
    'OverTime_Yes',
    'BusinessTravel_Travel_Frequently',
    'BusinessTravel_Travel_Rarely',
]
target = 'Attrition_Yes'

# Keep only the predictors actually present in df_encoded (missing ones are
# dropped silently); a missing target is a hard error
features = [name for name in features if name in df_encoded.columns]
if target not in df_encoded.columns:
    raise ValueError(f"Target column '{target}' not found in the DataFrame")

X, y = df_encoded[features], df_encoded[target]

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply the same scaling to the test rows
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Candidate classifiers, all configured with balanced class weights
models = {
    'Logistic Regression': LogisticRegression(
        class_weight='balanced', random_state=42
    ),
    'Random Forest': RandomForestClassifier(
        class_weight='balanced', random_state=42
    ),
    'Support Vector Machine': SVC(
        class_weight='balanced', random_state=42
    ),
}

# Fit each model on the scaled training data and report test-set metrics
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    print(
        f"{name} Results:",
        f"Accuracy: {accuracy_score(y_test, y_pred):.2f}",
        f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}",
        f"Classification Report:\n{classification_report(y_test, y_pred)}\n",
        sep="\n",
    )


No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
