In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
url = "https://github.com/dsrscientist/IBM_HR_Attrition_Rate_Analytics/raw/main/WA_Fn-UseC_-HR-Employee-Attrition.csv"
df = pd.read_csv(url)

# Data Cleaning
# The raw frame contains string-valued columns; scikit-learn estimators need
# numeric input, so the features are one-hot encoded before modelling.
# drop_first=True removes one indicator per category to avoid redundant columns.

# Feature Selection
# ... (identify relevant features)

# Split data into features (X) and target variable (y)
# The target keeps its original labels; only the features are encoded.
X = pd.get_dummies(df.drop("Attrition", axis=1), drop_first=True)
y = df["Attrition"]

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model Building
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Model Evaluation
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Show the metrics; computing them without displaying them leaves the cell
# with no visible result.
print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)

# Interpretation and Recommendations
# ... (analyze results and provide recommendations)

# Documentation
# ... (create visualizations, reports, and documentation)


In [None]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
url = "https://github.com/dsrscientist/IBM_HR_Attrition_Rate_Analytics/raw/main/WA_Fn-UseC_-HR-Employee-Attrition.csv"
df = pd.read_csv(url)

# Display the first few rows of the dataset
print(df.head())

# Display the basic information about the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit an extra "None" line.
df.info()

# Summary statistics
print(df.describe())

# Check for missing values
print("Missing Values:\n", df.isnull().sum())

# Check for duplicates
print("Duplicate Rows:", df.duplicated().sum())

# Visualize outliers using box plots.
# Only numeric columns are plotted, as the title states; selecting them
# explicitly avoids relying on the plotting library to filter them out.
numeric_df = df.select_dtypes(include="number")
plt.figure(figsize=(15, 8))
sns.boxplot(data=numeric_df, orient="h")
plt.title("Boxplot of Numerical Features")
plt.show()


In [None]:
# Handling missing values
# For simplicity, fill missing numerical values with the column mean and
# missing categorical values with the column mode.
# numeric_only=True is required: the frame has string columns, and taking the
# mean of those raises a TypeError on current pandas versions.
df = df.fillna(df.mean(numeric_only=True))
# mode() can return several rows when there are ties; the first row is used.
df = df.fillna(df.mode().iloc[0])

# Checking again for missing values after handling them
print("Missing Values:\n", df.isnull().sum())

# Removing duplicates (reassigned rather than mutated in place so re-running
# the cell does not depend on hidden in-place state)
df = df.drop_duplicates()

# Checking again for duplicates after removal
print("Duplicate Rows:", df.duplicated().sum())

# Encoding categorical variables
# Using one-hot encoding for simplicity. You might choose a different encoding method based on your analysis.
# Note: the original categorical columns (e.g. "Attrition", "JobRole") are
# replaced by indicator columns such as "Attrition_Yes" in df_encoded; the
# un-encoded values remain available in df.
df_encoded = pd.get_dummies(df, drop_first=True)

# Display the first few rows of the encoded dataset
print(df_encoded.head())


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Set the style for seaborn plots
sns.set_theme(style="whitegrid")

# --- Distribution of key variables -----------------------------------------
fig_dist, (ax_age, ax_income) = plt.subplots(1, 2, figsize=(15, 5))

# Example: Age distribution
sns.histplot(df_encoded['Age'], bins=30, kde=True, color='skyblue', ax=ax_age)
ax_age.set_title('Distribution of Age')

# Example: MonthlyIncome distribution
sns.histplot(df_encoded['MonthlyIncome'], bins=30, kde=True, color='salmon', ax=ax_income)
ax_income.set_title('Distribution of MonthlyIncome')

fig_dist.tight_layout()

# --- Relationship between features and attrition ---------------------------
fig_rel, (ax_box, ax_count) = plt.subplots(1, 2, figsize=(15, 6))

# Example: Boxplot of MonthlyIncome vs. Attrition
sns.boxplot(x='Attrition_Yes', y='MonthlyIncome', data=df_encoded, palette='viridis', ax=ax_box)
ax_box.set_title('MonthlyIncome vs. Attrition')

# Example: Countplot of JobRole with hue as Attrition.
# One-hot encoding replaced the "JobRole" column with indicator columns, so
# it no longer exists in df_encoded; the un-encoded frame df is used here.
sns.countplot(x='JobRole', hue='Attrition', data=df, palette='pastel', ax=ax_count)
ax_count.set_title('JobRole vs. Attrition')
# Job role names are long; rotate them so they do not overlap.
ax_count.tick_params(axis='x', rotation=45)

fig_rel.tight_layout()

# --- Patterns or trends related to attrition -------------------------------
# pairplot is a figure-level function that creates its own figure, so no
# plt.figure() call precedes it (that would only leave an empty figure).
pair_columns = ['Age', 'MonthlyIncome', 'JobSatisfaction', 'WorkLifeBalance', 'Attrition_Yes']
pair_grid = sns.pairplot(df_encoded[pair_columns], hue='Attrition_Yes', palette='husl')
# A figure-level title is needed; plt.title() would only label the last sub-axes.
pair_grid.fig.suptitle('Pairplot of Key Variables with Attrition', y=1.02)

plt.show()


In [None]:
# Imports used by the feature-importance step of this cell
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Correlation analysis
correlation_matrix = df_encoded.corr()

# Visualize the correlation matrix
plt.figure(figsize=(18, 15))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix')
plt.show()

# Identify features with high correlation to Attrition
corr_with_target = correlation_matrix['Attrition_Yes'].sort_values(ascending=False)
print("Correlation with Attrition:\n", corr_with_target)

# Selecting features with absolute correlation greater than a threshold (e.g., 0.1).
# The target is removed first: its correlation with itself is 1.0, so it would
# otherwise always be "selected" as a feature.
is_strongly_correlated = corr_with_target.drop('Attrition_Yes').abs() > 0.1
selected_features_corr = is_strongly_correlated[is_strongly_correlated].index.tolist()
print("Selected Features based on Correlation:\n", selected_features_corr)

# Feature importance using a machine learning model (e.g., Random Forest)

# Split data into features (X) and target variable (y)
X = df_encoded.drop("Attrition_Yes", axis=1)
y = df_encoded["Attrition_Yes"]

# Fit the importance model on training rows only. Ranking features on rows
# that are later used for testing would leak test information into feature
# selection. The split uses the same test_size/random_state as the modelling
# split further down the notebook, so the held-out rows coincide.
X_fs_train, X_fs_holdout, y_fs_train, y_fs_holdout = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train a Random Forest model
model_rf = RandomForestClassifier(random_state=42)
model_rf.fit(X_fs_train, y_fs_train)

# Get feature importances
feature_importances = model_rf.feature_importances_

# Create a DataFrame with feature names and their importances
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})

# Sort features by importance in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Visualize feature importances
plt.figure(figsize=(12, 8))
sns.barplot(x='Importance', y='Feature', data=feature_importance_df, palette='viridis')
plt.title('Feature Importances (Random Forest)')
plt.show()

# Selecting features with importance greater than a threshold (e.g., 0.02)
selected_features_rf = feature_importance_df[feature_importance_df['Importance'] > 0.02]['Feature'].tolist()
print("Selected Features based on Random Forest Feature Importance:\n", selected_features_rf)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Use the feature list produced by the Random Forest importance filter
selected_features = selected_features_rf

# Target vector and the design matrix restricted to the selected columns
y = df_encoded["Attrition_Yes"]
X = df_encoded[selected_features]

# Hold out 20% of the rows for testing (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Build the Random Forest and fit it on the training portion;
# fit() returns the estimator itself, so the two steps are chained.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predict labels for the held-out rows
y_pred = model.predict(X_test)

# Score the predictions against the true labels
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Report each metric under its heading
for heading, result in (
    ("Accuracy:", accuracy),
    ("\nClassification Report:\n", classification_rep),
    ("\nConfusion Matrix:\n", conf_matrix),
):
    print(heading, result)


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Score the held-out predictions with each scalar metric in turn
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)
conf_matrix = confusion_matrix(y_test, y_pred)

# Report the scalar metrics, then the confusion matrix
for metric_label, metric_value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(metric_label, metric_value)
print("\nConfusion Matrix:\n", conf_matrix)

# Draw the confusion matrix as an annotated heatmap on its own axes
fig, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", cbar=False, ax=ax)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()
