<a href="https://colab.research.google.com/github/NUELBUNDI/Machine-Learning-Projects/blob/main/ML_Logistic_Regression_Employee_Attrition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import matplotlib
import matplotlib.pyplot as plt


In [None]:
# Raw CSV served from the project's GitHub repository.
url="https://raw.githubusercontent.com/NUELBUNDI/Machine-Learning-Projects/main/Employee-Attrition.csv"

# Read the CSV straight from the URL and report the frame's (rows, columns) shape.
df=pd.read_csv(url)
print(f'Rows x Columns is :{df.shape}')

In [None]:
print(f' The Column Names are :{df.columns}')

In [None]:
# Preview the first two rows of the dataset
df.head(2)

In [None]:
df.info()

In [None]:
df.columns.tolist()

In [None]:
df.nunique()

In [None]:
df.isna().sum()

In [None]:
df.describe()

## Exploratory Data Analysis and Visualization


In [None]:
%matplotlib inline

sns.set_style('darkgrid')
matplotlib.rcParams['font.size']=14
matplotlib.rcParams['figure.figsize']=(10,6)
matplotlib.rcParams['figure.facecolor']='#00000000'

In [None]:
def HistogramDistributionInData(name, color):
  """Show a histogram of one column of the module-level `df`.

  The plot carries a marginal box plot and is split by the `color` column.

  Args:
    name: Column of `df` plotted on the x-axis; also used in the title.
    color: Column of `df` passed to Plotly's `color` argument.

  Returns:
    Whatever `fig.show()` returns; the figure is rendered as a side effect.
  """
  histogram = px.histogram(
      df,
      x=name,
      color=color,
      marginal='box',
      title=f'Distribution of {name}',
  )
  # Leave a small gap between neighbouring bars.
  histogram.update_layout(bargap=0.1)

  return histogram.show()


In [None]:
HistogramDistributionInData('Age','Attrition')

Age is approximately normally distributed.
Staff aged 20-35 account for a high number of attrition cases.

In [None]:
HistogramDistributionInData('PerformanceRating','Attrition')

Attrition counts are higher among staff with a performance rating of 3 than among those with a performance rating of 4.


In [None]:
df.YearsInCurrentRole.unique()

In [None]:
HistogramDistributionInData('YearsInCurrentRole','Attrition')

In [None]:
# Count employees per tenure value and order the result by tenure rather than
# by frequency, then pass the tenure values explicitly as x so each bar is
# labelled with the number of years it represents.
years_in_role_counts = df.YearsInCurrentRole.value_counts().sort_index()

px.bar(x=years_in_role_counts.index,
       y=years_in_role_counts.values,
       labels={'x': 'Years in Current Role', 'y': 'Number of Employees'},
       title='Count by the No of Years in the Current Role',
       )


In [None]:
HistogramDistributionInData('MonthlyIncome','Attrition')

In [None]:
df.columns

In [None]:
# Drop EmployeeNumber before modelling (presumably a per-row identifier with no
# predictive meaning - confirm against the data dictionary).
# Reassigning with errors='ignore' instead of inplace=True keeps this cell
# idempotent: re-running it once the column is gone no longer raises KeyError.
df = df.drop(columns=['EmployeeNumber'], errors='ignore')

In [None]:
from sklearn.model_selection import train_test_split

# Two-stage split: hold out 20% of the rows for testing, then carve the
# validation set out of the remainder. test_size=0.25 is scikit-learn's default
# when no size is given, spelled out here; overall this is roughly a 60/20/20
# train/validation/test split.
train_val_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_val_df, test_size=0.25, random_state=42)

# Report the size of the full frame and of every partition.
print(f'df :{df.shape}')
print(f'train_df :{train_df.shape}')
print(f'test_df :{test_df.shape}')
print(f'val_df :{val_df.shape}')

In [None]:
# The label column; every other column of df is treated as a model input.
target_col = 'Attrition'
input_cols = [name for name in df.columns if name != target_col]

In [None]:
print(input_cols)

print(target_col)

In [None]:
# Separate each partition into its feature frame and its label series.
# Every piece is an explicit copy (train_targets included, which previously was
# the only one taken without .copy()), so later edits to these objects never
# write through to train_df / val_df / test_df.
train_inputs = train_df[input_cols].copy()
train_targets = train_df[target_col].copy()

val_inputs = val_df[input_cols].copy()
val_targets = val_df[target_col].copy()

test_inputs = test_df[input_cols].copy()
test_targets = test_df[target_col].copy()

In [None]:
# Names of the feature columns with a NumPy numeric dtype.
numeric_cols = train_inputs.select_dtypes(include=np.number).columns.tolist()
# Names of the object-dtype feature columns; these are one-hot encoded further down.
categorical_cols=train_inputs.select_dtypes('object').columns.tolist()

In [None]:
train_inputs[numeric_cols].describe()

# Scaling Numerical Features



In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
scaler= MinMaxScaler()

In [None]:
# Learn each column's min/max from the training split only, so that no
# statistics from the validation or test rows leak into preprocessing.
# train_df (rather than train_inputs) is used because train_inputs is rescaled
# in place later, which would make re-running this cell fit on scaled data.
scaler.fit(train_df[numeric_cols])

In [None]:
print(f'Minimum : {list(scaler.data_min_)}')

print(f'Max : {list(scaler.data_max_)}')

In [None]:
# Rescale the numeric columns of every split in place with the fitted scaler.
for split_inputs in (train_inputs, val_inputs, test_inputs):
    split_inputs[numeric_cols] = scaler.transform(split_inputs[numeric_cols])

In [None]:
df.nunique()

In [None]:
df[numeric_cols].nunique()

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed in
# 1.4, so try the current keyword first and fall back for older installations.
try:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

In [None]:
# Learn the category vocabulary from the training split only, mirroring how a
# deployed model would see data. The encoder is created with
# handle_unknown='ignore', so a category that appears only in the validation or
# test rows is encoded as all zeros instead of raising.
encoder.fit(train_df[categorical_cols])

In [None]:
encoder.categories_

In [None]:
encoded_cols= list(encoder.get_feature_names_out(categorical_cols))
print(encoded_cols)
print(len(encoded_cols))

In [None]:
# Append the one-hot columns to every split, using the fitted encoder.
for split_inputs in (train_inputs, val_inputs, test_inputs):
    split_inputs[encoded_cols] = encoder.transform(split_inputs[categorical_cols])


In [None]:
pd.set_option('display.max_columns', None)

In [None]:
test_inputs

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
model=LogisticRegression(solver='liblinear')

In [None]:
model.fit(train_inputs[numeric_cols + encoded_cols],train_targets)

In [None]:
print(numeric_cols + encoded_cols)

In [None]:
print(model.coef_.tolist())

In [None]:
print(model.intercept_)

### Making Predictions and Evaluating the Model

In [None]:
# Model features: the scaled numeric columns followed by the one-hot columns,
# in the same order that was used to fit the model.
feature_cols = numeric_cols + encoded_cols

X_train = train_inputs[feature_cols]
X_val = val_inputs[feature_cols]
X_test = test_inputs[feature_cols]

In [None]:
train_preds= model.predict(X_train)

train_probs=model.predict_proba(X_train)
train_probs

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
accuracy_score(train_targets,train_preds)

In [None]:
from sklearn.metrics import confusion_matrix

confusion_matrix(train_targets, train_preds, normalize='true')

In [None]:
def predict_and_plot(inputs, targets, name=''):
  """Predict with the module-level `model`, print accuracy, plot the confusion matrix.

  Args:
    inputs: Feature matrix accepted by `model.predict`.
    targets: True labels aligned row-for-row with `inputs`.
    name: Optional split name used as a prefix in the plot title.

  Returns:
    The predictions produced by `model.predict(inputs)`.
  """
  preds = model.predict(inputs)

  accuracy = accuracy_score(targets, preds)
  print(f'Accuracy : {accuracy*100:.2f}')

  # Pin the class order explicitly so the matrix rows/columns always line up
  # with the tick labels, even when a class is missing from this split.
  class_labels = list(model.classes_)
  cf = confusion_matrix(targets, preds, labels=class_labels, normalize='true')

  plt.figure()
  # Label the ticks with the class names instead of positional indices.
  sns.heatmap(cf, annot=True, xticklabels=class_labels, yticklabels=class_labels)
  plt.xlabel('Prediction')
  plt.ylabel('Target')
  plt.title(f'{name} Confusion Matrix')

  return preds

In [None]:
train_preds = predict_and_plot(X_train, train_targets, 'Training')

In [None]:
# Evaluate on the validation split (title typo 'Validatiaon' corrected).
val_preds = predict_and_plot(X_val, val_targets, 'Validation')

In [None]:
test_preds = predict_and_plot(X_test, test_targets, 'Test')

In [None]:
def random_guess(inputs, seed=None):
    """Baseline that labels every row "No" or "Yes" uniformly at random.

    Args:
        inputs: Any sized collection; only `len(inputs)` is used.
        seed: Optional seed. When given, a dedicated generator is created so
            the baseline is reproducible; when omitted, NumPy's global random
            state is used, exactly as before.

    Returns:
        A NumPy array of `len(inputs)` labels drawn from {"No", "Yes"}.
    """
    rng = np.random if seed is None else np.random.default_rng(seed)
    return rng.choice(["No", "Yes"], len(inputs))

def all_no(inputs):
  """Baseline that predicts "No" for every row of `inputs`.

  Only `len(inputs)` is used; the result is a NumPy string array of that length.
  """
  n_rows = len(inputs)
  return np.repeat("No", n_rows)


In [None]:
accuracy_score(test_targets, random_guess(X_test))

In [None]:
accuracy_score(test_targets, all_no(X_test))

In [None]:
def predict_input(single_input):
    """Run the fitted scaler, encoder and model on a single raw record.

    Args:
        single_input: Mapping of raw column name -> value for one record. It
            must provide every name in `numeric_cols` and `categorical_cols`;
            other keys are not read.

    Returns:
        Tuple `(pred, prob)`: the predicted class label and the probability
        the model assigns to that label.
    """
    record = pd.DataFrame([single_input])

    # Apply the same scaling and one-hot encoding used for the training data.
    record[numeric_cols] = scaler.transform(record[numeric_cols])
    record[encoded_cols] = encoder.transform(record[categorical_cols])
    features = record[numeric_cols + encoded_cols]

    pred = model.predict(features)[0]

    # Pick the probability whose position in model.classes_ matches the
    # predicted label.
    class_probs = model.predict_proba(features)[0]
    prob = class_probs[list(model.classes_).index(pred)]
    return pred, prob

In [None]:
# df.head(1).to_dict('records')

In [None]:
# One hand-written employee record for predict_input().
# 'Attrition' is the target column: it is excluded from input_cols, so it is in
# neither numeric_cols nor categorical_cols and predict_input() never reads it.
new_input = {
    'Age': 20,
    'Attrition': 'Yes',
    'BusinessTravel': 'Travel_Rarely',
    'DailyRate': 102,
    'Department': 'Sales',
    'DistanceFromHome': 2,
    'Education': 2,
    'EducationField': 'Life Sciences',
    'EmployeeCount': 1,
    'EnvironmentSatisfaction': 2,
    'Gender': 'Female',
    'HourlyRate': 94,
    'JobInvolvement': 3,
    'JobLevel': 2,
    'JobRole': 'Sales Executive',
    'JobSatisfaction': 4,
    'MaritalStatus': 'Single',
    'MonthlyIncome': 100000,
    'MonthlyRate': 19479,
    'NumCompaniesWorked': 8,
    'Over18': 'Y',
    'OverTime': 'Yes',
    'PercentSalaryHike': 11,
    'PerformanceRating': 3,
    'RelationshipSatisfaction': 1,
    'StandardHours': 100,
    'StockOptionLevel': 0,
    'TotalWorkingYears': 15,
    'TrainingTimesLastYear': 2,
    'WorkLifeBalance': 1,
    'YearsAtCompany': 5,
    'YearsInCurrentRole': 4,
    'YearsSinceLastPromotion': 5,
    'YearsWithCurrManager': 5,
}


In [None]:

predict_input(new_input)