## NAME:RISHABH JAIN
## E-mail: jainrishabhj68@gmail.com

In [2]:
import pandas as pd
df = pd.read_csv('Employee.csv')

In [3]:
df.isnull().sum()

MonthlyIncome      19
Age                20
JobSatisfaction    18
YearsAtCompany     18
OverTime            0
Attrition          19
dtype: int64

In [4]:
df=df.dropna(subset=['Attrition'])

In [5]:
df.fillna({
    'Age': df['Age'].median(),
    'YearsAtCompany': df['YearsAtCompany'].median(),
    'MonthlyIncome': df['MonthlyIncome'].median(),
    'JobSatisfaction': df['JobSatisfaction'].mode()[0]
}, inplace=True)

In [6]:
df.isnull().sum()

MonthlyIncome      0
Age                0
JobSatisfaction    0
YearsAtCompany     0
OverTime           0
Attrition          0
dtype: int64

In [7]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.utils import resample

In [8]:
le = LabelEncoder()
df['OverTime'] = le.fit_transform(df['OverTime'])
df = df[df['Attrition'].isin([0, 1])]
df['Attrition'] = df['Attrition'].astype(int)
X = df[['MonthlyIncome', 'Age', 'JobSatisfaction', 'YearsAtCompany', 'OverTime']]
y = df['Attrition']
print(X.head())
print(y.head())

   MonthlyIncome   Age  JobSatisfaction  YearsAtCompany  OverTime
0        13191.0  41.0              4.0        1.000000         1
1        15859.0  44.0              4.0       17.000000         1
4         3885.0  27.0              4.0        0.000000         1
6        12511.5  41.0              2.0       38.528277         1
8         4670.0  53.0              4.0        9.000000         1
0    1
1    0
4    1
6    1
8    0
Name: Attrition, dtype: int32


In [9]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)
y_pred_logreg = logreg.predict(X_test)
print("\nLogistic Regression Results")
print("Accuracy:", accuracy_score(y_test, y_pred_logreg))
print(classification_report(y_test, y_pred_logreg))


Logistic Regression Results
Accuracy: 0.34285714285714286
              precision    recall  f1-score   support

           0       0.29      0.10      0.15        20
           1       0.36      0.67      0.47        15

    accuracy                           0.34        35
   macro avg       0.32      0.38      0.31        35
weighted avg       0.32      0.34      0.28        35



In [11]:
dtree = DecisionTreeClassifier(random_state=42)
dtree.fit(X_train, y_train)
y_pred_dtree = dtree.predict(X_test)
print("\nDecision Tree Results")
print("Accuracy:", accuracy_score(y_test, y_pred_dtree))
print(classification_report(y_test, y_pred_dtree))


Decision Tree Results
Accuracy: 0.37142857142857144
              precision    recall  f1-score   support

           0       0.42      0.25      0.31        20
           1       0.35      0.53      0.42        15

    accuracy                           0.37        35
   macro avg       0.38      0.39      0.37        35
weighted avg       0.39      0.37      0.36        35



In [12]:
df['OverTime'] = LabelEncoder().fit_transform(df['OverTime'])
df = df[df['Attrition'].isin([0, 1])]
df['Attrition'] = df['Attrition'].astype(int)
df_majority = df[df['Attrition'] == 0]
df_minority = df[df['Attrition'] == 1]
df_minority_upsampled = resample(df_minority, 
                                 replace=True,
                                 n_samples=len(df_majority),
                                 random_state=42)
df_balanced = pd.concat([df_majority, df_minority_upsampled])
X = df_balanced[['MonthlyIncome', 'Age', 'JobSatisfaction', 'YearsAtCompany', 'OverTime']]
y = df_balanced['Attrition']
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)
y_pred_logreg = logreg.predict(X_test)
dtree = DecisionTreeClassifier(max_depth=5, min_samples_split=10, random_state=42)
dtree.fit(X_train, y_train)
y_pred_dtree = dtree.predict(X_test)
print("\nLogistic Regression")
print("Accuracy:", accuracy_score(y_test, y_pred_logreg))
print(classification_report(y_test, y_pred_logreg))
print("\nDecision Tree")
print("Accuracy:", accuracy_score(y_test, y_pred_dtree))
print(classification_report(y_test, y_pred_dtree))


Logistic Regression
Accuracy: 0.5
              precision    recall  f1-score   support

           0       0.47      0.47      0.47        15
           1       0.53      0.53      0.53        17

    accuracy                           0.50        32
   macro avg       0.50      0.50      0.50        32
weighted avg       0.50      0.50      0.50        32


Decision Tree
Accuracy: 0.5
              precision    recall  f1-score   support

           0       0.48      0.67      0.56        15
           1       0.55      0.35      0.43        17

    accuracy                           0.50        32
   macro avg       0.51      0.51      0.49        32
weighted avg       0.51      0.50      0.49        32

