In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Read the training table and the test table from CSV files in the working
# directory.
traindf, testdf = (pd.read_csv(path) for path in ('train_data.csv', 'test_data.csv'))

In [None]:
# Show the first five rows of the training frame.
traindf.head(n=5)


In [None]:
# Show the first five rows of the test frame.
testdf.head(n=5)


In [None]:
# Number of missing values in each traindf column.
traindf.isnull().sum()


In [None]:
# Heatmap of the boolean missing-value mask of testdf (row labels hidden).
sns.heatmap(testdf.isna(), yticklabels=False)


In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode only the non-numeric columns. Numeric columns (ints as well as
# floats) are left untouched so their values mean the same thing in traindf and
# testdf; remapping integer columns on the training frame alone would put the
# two frames on different scales.
for column in traindf.columns:
    if pd.api.types.is_numeric_dtype(traindf[column]):
        continue
    encoder = LabelEncoder()
    traindf[column] = encoder.fit_transform(traindf[column])
    if column in testdf.columns:
        # Reuse the mapping learned on the training data so a category gets the
        # same integer code in both frames. Values that never occurred in
        # traindf (including missing values) are encoded as -1.
        code_of = {label: code for code, label in enumerate(encoder.classes_)}
        testdf[column] = testdf[column].map(code_of).fillna(-1).astype(int)

In [None]:
# (Removed an abandoned experiment that was parked here inside a string
# literal: a per-column pd.get_dummies loop over testdf. The string was never
# executed; it only echoed itself as this cell's output.)

In [None]:
# Columns removed from both frames before modelling. Each one is dropped only
# if the frame actually has it, so re-running this cell does not raise.
cols = ['Id', 'EmployeeNumber', 'Behaviour']

existing_cols = [name for name in cols if name in traindf]
traindf.drop(columns=existing_cols, inplace=True)

existing_cols_test = [name for name in cols if name in testdf]
testdf.drop(columns=existing_cols_test, inplace=True)


In [None]:
# Pairwise correlations between traindf columns, drawn on a 14x14 inch figure
# with each cell annotated as a whole-number percentage.
fig, ax = plt.subplots(figsize=(14, 14))
sns.heatmap(traindf.corr(), annot=True, fmt='.0%', ax=ax)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the labelled rows for validation. random_state pins the
# shuffle so the split (and every metric computed from it) is identical on each
# Restart & Run All; 79 is the same seed the random forest uses.
X_train, X_test, y_train, y_test = train_test_split(
    traindf.drop('Attrition', axis=1),
    traindf['Attrition'],
    test_size=0.30,
    random_state=79,
)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 250 trees; the fixed random_state makes the forest's own
# randomness repeatable.
forest_params = dict(n_estimators=250, random_state=79)
model = RandomForestClassifier(**forest_params)

In [None]:
# fit() returns the estimator itself, so training on the training split and
# predicting the hold-out split can be chained.
preds = model.fit(X_train, y_train).predict(X_test)

In [None]:
# The model was fitted on traindf without 'Attrition', so make sure testdf does
# not carry that column either.
if 'Attrition' in testdf:
    testdf.drop(columns=['Attrition'], inplace=True)

# Feature columns, in the order they appear in traindf.
train_columns = traindf.columns.drop('Attrition')

# One-hot encode whatever object/category columns testdf has, then force the
# result into the training layout: columns the model never saw are discarded
# and training columns absent from the encoded frame are created filled with 0.
# NOTE(review): any column that is still object/category dtype at this point is
# renamed by get_dummies ('<col>_<value>'), so the reindex discards its dummies
# and the model receives that feature as all zeros -- confirm that testdf is
# already numerically encoded the same way as traindf before this cell runs.
categorical_columns = testdf.select_dtypes(include=['object', 'category']).columns
testdf_encoded = (
    pd.get_dummies(testdf, columns=categorical_columns)
    .reindex(columns=train_columns, fill_value=0)
)

test_preds = model.predict(testdf_encoded)


In [None]:
from sklearn.metrics import accuracy_score

# Fraction of hold-out rows whose predicted class equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=preds)
accuracy

In [None]:
from sklearn.metrics import accuracy_score,roc_auc_score,roc_curve

# Continuous scores for the hold-out rows: column 1 of predict_proba is the
# probability of model.classes_[1].
holdout_scores = model.predict_proba(X_test)[:, 1]

# The AUC must be computed from the same continuous scores as the curve.
# Feeding hard predict() labels to roc_auc_score yields the area of a
# single-threshold curve, which is not the area under the line plotted below.
rf_roc_auc = roc_auc_score(y_test, holdout_scores)
rf_fpr, rf_tpr, rf_thresholds = roc_curve(y_test, holdout_scores)

plt.figure()
plt.plot(rf_fpr, rf_tpr, label='Random Forest (area = %0.2f)' % rf_roc_auc)
plt.plot([0, 1], [0, 1],'r--')  # chance diagonal for reference
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('TMM')
plt.show()

In [None]:
# Column 1 of predict_proba is the probability of model.classes_[1]. Despite
# the variable's name, these scores are for the hold-out rows in X_test.
holdout_proba = model.predict_proba(X_test)
train_prob = holdout_proba[:, 1]
train_prob

In [None]:
# test_preds are predictions for the rows of test_data.csv, whereas y_test
# holds the labels of hold-out rows split off from train_data.csv. These are
# unrelated sets of rows, so accuracy_score(y_test[:-19], test_preds) did not
# measure anything (the [:-19] slice presumably only made the lengths match).
# No ground truth for testdf is available at this point -- any 'Attrition'
# column was dropped before predicting -- so report the predicted class counts
# instead. Model quality is the hold-out accuracy computed from y_test/preds.
test_pred_counts = pd.Series(test_preds, name='predicted_Attrition').value_counts()
test_pred_counts

In [None]:
# testdf_encoded was already built and aligned to the training feature layout
# in the cell that produced test_preds, so it is reused here rather than
# re-encoded from scratch. (The previous copy also looped over "missing"
# columns after reindex(columns=train_columns, ...), which can never find any:
# reindex has already created every training column.)
# Column 1 of predict_proba is the probability of model.classes_[1].
test_prob = model.predict_proba(testdf_encoded)[:, 1]
test_prob


In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve

# An ROC curve needs the true label of every scored row. The previous version
# of this cell paired the scores of the test_data.csv rows with y_test[:-19],
# i.e. labels of unrelated hold-out rows from train_data.csv, so the curve and
# its AUC were meaningless -- and savefig('TMM') overwrote the valid hold-out
# ROC figure with it. With no labels for testdf available here, plot how the
# model's scores are distributed over the test rows instead, and save to a
# separate file.
# testdf_encoded is the aligned test matrix built in the cell that produced
# test_preds; column 1 of predict_proba is the probability of model.classes_[1].
test_scores = model.predict_proba(testdf_encoded)[:, 1]

plt.figure()
plt.hist(test_scores, bins=20, range=(0.0, 1.0))
plt.xlim([0.0, 1.0])
plt.xlabel('Predicted probability of model.classes_[1]')
plt.ylabel('Number of test rows')
plt.title('Random Forest score distribution on the test set')
plt.savefig('TMM_test_scores')
plt.show()
