In [1]:
from __future__ import print_function
import os
from time import time

import numpy as np
import pandas as pd

# Wall-clock reference point; the elapsed time is printed at the end of the cell.
start = time()

# Directory that holds the CSV. It defaults to the original author's local
# folder; set the HR_DATA_DIR environment variable to run the notebook on
# another machine without editing this cell. Kept as a one-element list, as
# in the original.
data_path = [os.environ.get('HR_DATA_DIR', r'C:\Users\wpghk\ai_data')]

# os.path.join inserts the platform separator and does not double it when
# the directory already ends with one.
filepath = os.path.join(*(data_path + ['Human_Resources_Employee_Attrition.csv']))
data = pd.read_csv(filepath, sep=',')

from sklearn.preprocessing import LabelEncoder, LabelBinarizer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier 

# Integer-encode the `salary` and `department` columns in place.
# Each column gets its own encoder: refitting one shared encoder would
# overwrite the salary mapping, leaving no way to translate the predicted
# class ids back with inverse_transform / classes_.
salary_encoder = LabelEncoder()
data['salary'] = salary_encoder.fit_transform(data['salary'])

# `le` keeps its original name and, exactly as before, ends up fitted on
# the `department` column.
le = LabelEncoder()
data['department'] = le.fit_transform(data['department'])

# The encoded salary is the prediction target; every other column is a feature.
target = 'salary'
feature_cols = [col for col in data.columns if col != target]

# One stratified shuffle split that holds out 30% of the rows for testing
# (test_size=0.3); random_state pins the shuffle so the split is repeatable.
strat_shuff_split = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)

# split() returns a generator of (train, test) index arrays; with n_splits=1
# the first pair is the only one. The indices are row *positions*.
train_idx, test_idx = next(strat_shuff_split.split(data[feature_cols], data[target]))

# Create the data sets. Rows are selected by position with .iloc: the
# label-based .loc only gives the same rows while `data` still carries the
# default 0..n-1 index, and would pick wrong rows (or raise) after any
# filtering or re-indexing.
X_train = data[feature_cols].iloc[train_idx]
y_train = data[target].iloc[train_idx]

X_test = data[feature_cols].iloc[test_idx]
y_test = data[target].iloc[test_idx]

def print_score(classifier, X_train, y_train, X_test, y_test, train = True, cv = 10):
    """Print an evaluation report for an already-fitted classifier.

    Parameters
    ----------
    classifier : estimator exposing ``predict`` (and usable with
        ``cross_val_score`` when ``train`` is truthy).
    X_train, y_train : training features and labels.
    X_test, y_test : held-out features and labels.
    train : if truthy, report on the training data and append the mean and
        standard deviation of the cross-validated accuracy; otherwise
        report on the test data. Any falsy value (not only ``False``)
        selects the test report, so the call never silently prints nothing.
    cv : number of cross-validation folds for the training report.
        Defaults to 10, the value that used to be hard-coded.

    Returns
    -------
    None. The report is written to stdout.
    """
    print("-"*100)

    # Pick the heading and the data once; the three metrics below are the
    # same for both reports.
    if train:
        heading, X, y = "Training result:\n", X_train, y_train
    else:
        heading, X, y = "Test result:\n", X_test, y_test

    y_pred = classifier.predict(X)
    print(heading)
    print("Accuracy Score: {0:.4f}\n".format(accuracy_score(y, y_pred)))
    print("Classification Report:\n{}\n".format(classification_report(y, y_pred)))
    print("Confusion Matrix:\n{}\n".format(confusion_matrix(y, y_pred)))

    if train:
        # Cross-validated accuracy on the training data, using all cores.
        res = cross_val_score(classifier, X_train, y_train, cv = cv, n_jobs= -1, scoring ="accuracy")
        print("Average Accuracy:\t{0:.4f}\n".format(res.mean()))
        print("Standard Deviation:\t{0:.4f}".format(res.std()))
        
# Random forest using the entropy criterion: 140 trees, depth capped at 32,
# fixed seed, n_jobs=-1 for parallel fitting.
random_forest = RandomForestClassifier(
    n_estimators=140,
    criterion='entropy',
    max_depth=32,
    random_state=42,
    n_jobs=-1
)
random_forest.fit(X_train, y_train)

# Print the training report first, then the test report.
for on_training_data in (True, False):
    print_score(random_forest, X_train, y_train, X_test, y_test, train=on_training_data)

# Seconds elapsed since `start` was recorded.
elapsed = time() - start
print("time: %.2f" % elapsed)

  from numpy.core.umath_tests import inner1d


----------------------------------------------------------------------------------------------------
Training result:

Accuracy Score: 0.9992

Classification Report:
             precision    recall  f1-score   support

          0       1.00      1.00      1.00       866
          1       1.00      1.00      1.00      5121
          2       1.00      1.00      1.00      4512

avg / total       1.00      1.00      1.00     10499


Confusion Matrix:
[[ 865    0    1]
 [   0 5120    1]
 [   0    6 4506]]

Average Accuracy:	0.6019

Standard Deviation:	0.0105
----------------------------------------------------------------------------------------------------
Test result:

Accuracy Score: 0.6220

Classification Report:
             precision    recall  f1-score   support

          0       0.73      0.32      0.45       371
          1       0.62      0.71      0.67      2195
          2       0.61      0.58      0.59      1934

avg / total       0.63      0.62      0.62      4500


Confusi

In [None]:
k_range = list(range(3200,3205))
scores = []
for k in k_range:
    knn = RandomForestClassifier(max_leaf_nodes = k, random_state = 42, criterion='entropy', n_estimators = 140, max_depth = 32, n_jobs= -1)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    scores.append(accuracy_score(y_test, y_pred))
    print("%d : %.5f" % (k, accuracy_score(y_test, y_pred)))

    
import matplotlib.pyplot as plt
%matplotlib inline    
plt.plot(k_range, scores)
plt.xlabel('Value of k for KNN')
plt.ylabel('Accuracy Score')
plt.title('Accuracy Scores for Values of k of k-Nearest-Neighbors')
plt.show()
print(scores)
