In [1]:
from collections import Counter
import numpy as np
import pandas as pd

In [2]:
# Load the HR dataset; the relative path is resolved against the kernel's working directory.
hr = pd.read_csv('HR.csv')

In [3]:
# Show the column labels of the frame as loaded.
hr.columns

Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident', 'left',
       'promotion_last_5years', 'sales', 'salary'],
      dtype='object')

In [4]:
# Rename the 'sales' column to 'department'; rename() returns a new frame, which is rebound to hr.
hr = hr.rename(columns={'sales':'department'})

In [5]:
# Fold the 'support' and 'IT' departments into a single 'technical' category;
# every other department value is left unchanged.
hr['department'] = np.where(
    hr['department'].isin(['support', 'IT']),
    'technical',
    hr['department'],
)

In [6]:
# One-hot encode each categorical column and append the indicator columns to hr.
# The original categorical columns are still present after this cell.
# NOTE: re-running this cell on an already-encoded frame makes join() fail,
# because the dummy column names would overlap with existing ones.
cat_vars=['department','salary']
for var in cat_vars:
    # Indicator columns are named '<var>_<value>'.
    cat_list = pd.get_dummies(hr[var], prefix=var)
    # hr1 is kept as a module-level name in case later cells refer to it.
    hr1=hr.join(cat_list)
    hr=hr1

In [7]:
# Remove the original categorical columns now that their dummy columns exist.
# They are dropped by label rather than by position (they sat at indices 8 and 9):
# a positional drop silently removes whatever occupies those slots if the frame
# layout changes or the cell is re-run, whereas a label drop raises KeyError.
hr.drop(columns=['department', 'salary'], inplace=True)

In [8]:
from decision_tree import DecisionTree

In [9]:
def bootstrap_sample(X, y):
    """Draw a bootstrap resample of ``(X, y)``.

    Row indices are sampled with replacement from NumPy's global RNG, as many
    as ``X`` has rows, and the same indices are applied to both ``X`` and
    ``y`` so the pairs stay aligned.

    Parameters
    ----------
    X : array supporting integer-array indexing along axis 0
    y : array indexable with the same index array

    Returns
    -------
    tuple
        ``(X[idxs], y[idxs])``.
    """
    row_count = X.shape[0]
    picked = np.random.choice(row_count, size=row_count, replace=True)
    X_resampled = X[picked]
    y_resampled = y[picked]
    return X_resampled, y_resampled

In [10]:
def most_common_label(y):
    """Return the value that occurs most often in the iterable ``y``.

    Counting and tie-breaking are delegated to ``collections.Counter``.
    An empty ``y`` raises ``IndexError``.
    """
    ranked = Counter(y).most_common(1)
    label, _count = ranked[0]
    return label

In [11]:
class RandomForest:
    """Ensemble of ``DecisionTree`` models combined by majority vote.

    Parameters
    ----------
    n_trees : int, default 20
        Number of trees trained by ``fit``.
    min_samples_split, max_depth, n_feats
        Passed through unchanged to each ``DecisionTree`` constructor; their
        exact meaning is defined by that class.
    """

    def __init__(self, n_trees=20, min_samples_split=2, max_depth=100, n_feats=None):
        # Fitted trees; replaced on every call to fit().
        self.trees = []
        self.n_feats = n_feats
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.n_trees = n_trees

    def fit(self, X, y):
        """Train ``n_trees`` trees, each on its own bootstrap resample of ``(X, y)``.

        Any previously fitted trees are discarded first. Returns ``None``.
        """
        self.trees = []
        tree_kwargs = dict(
            min_samples_split=self.min_samples_split,
            max_depth=self.max_depth,
            n_feats=self.n_feats,
        )
        for _ in range(self.n_trees):
            tree = DecisionTree(**tree_kwargs)
            # A fresh resample per tree is drawn after the tree is constructed.
            tree.fit(*bootstrap_sample(X, y))
            self.trees.append(tree)

    def predict(self, X):
        """Return, for each row of ``X``, the label predicted by most trees."""
        # Stacked as (tree, sample); assumes every tree returns one label per row.
        per_tree = np.array([member.predict(X) for member in self.trees])
        # Swap to (sample, tree) so each row holds the votes for one sample.
        per_sample = per_tree.swapaxes(0, 1)
        return np.array([most_common_label(votes) for votes in per_sample])



In [12]:
def accuracy(y_true, y_pred):
    """Return the fraction of positions where ``y_pred`` equals ``y_true``.

    Both inputs are converted with ``np.asarray`` so that plain Python lists
    are compared element by element. Without the conversion, ``list == list``
    yields a single bool and the result would be ``0`` or ``1 / len``.

    Parameters
    ----------
    y_true, y_pred : array-like of identical shape (1-D label vectors)

    Returns
    -------
    numpy.float64
        Matches divided by ``len(y_true)``. Empty input gives ``nan`` with a
        NumPy runtime warning.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        # Broadcasting mismatched shapes would produce a meaningless score.
        raise ValueError(
            "y_true and y_pred must have the same shape, got "
            f"{y_true.shape} and {y_pred.shape}"
        )
    return np.sum(y_true == y_pred) / len(y_true)

In [13]:
# Columns used as model inputs; 'left' is the prediction target.
cols = [
    'satisfaction_level',
    'last_evaluation',
    'time_spend_company',
    'Work_accident',
    'promotion_last_5years',
    'department_RandD',
    'department_hr',
    'department_management',
    'salary_high',
    'salary_low',
]
# Convert to plain NumPy arrays for the array-based helpers used afterwards.
X = hr[cols].to_numpy()
y = hr['left'].to_numpy()

In [14]:
# Sanity check: (number of rows, number of selected feature columns).
X.shape

(14999, 10)

In [15]:
  def trainData_testData_split(X, y, testPercentage, seed=1121):
    """Randomly partition ``X`` and ``y`` into training and test subsets.

    Each row receives a uniform random score. Rows whose score is strictly
    below the ``(1 - testPercentage) * 100`` percentile of all scores form the
    training set and the remaining rows form the test set, so the split sizes
    are approximately, not exactly, the requested proportion.

    Parameters
    ----------
    X, y : NumPy arrays with the same number of rows (boolean-mask indexable)
    testPercentage : float
        Fraction of rows to hold out, e.g. ``0.2``.
    seed : int, default 1121
        Passed to ``np.random.seed``. This reseeds NumPy's *global* RNG, so
        later ``np.random`` calls are affected as well.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    np.random.seed(seed)
    randomArray = np.random.rand(X.shape[0])
    # Round instead of truncating with int(): float noise such as
    # (1 - 0.56) * 100 == 43.99999999999999 would otherwise select the 43rd
    # percentile, and fractional percentages would be floored.
    trainPercentile = round((1 - testPercentage) * 100, 10)
    splittedData = randomArray < np.percentile(randomArray, trainPercentile)

    X_train = X[splittedData]
    y_train = y[splittedData]
    X_test  = X[~splittedData]
    y_test  = y[~splittedData]

    return X_train, X_test, y_train, y_test

In [16]:
# Hold out roughly 20% of the rows for testing; seed 1234 makes the split repeatable.
X_train, X_test, y_train, y_test = trainData_testData_split(X, y,0.2,1234)

In [17]:
# Forest of 10 trees; max_depth=10 is forwarded to each DecisionTree.
clf = RandomForest(n_trees=10, max_depth=10)

In [18]:
# Train on the training split, predict the held-out rows, and score the predictions.
# bootstrap_sample draws from NumPy's global RNG, so the result depends on the RNG
# state left by earlier cells (trainData_testData_split reseeds it).
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
acc = accuracy(y_test, y_pred)

In [19]:
# Report the fraction of test rows that were predicted correctly.
print("Accuracy:", acc)

Accuracy: 0.97


In [20]:
def cm(y_test, y_pred):
    """Build a 2x2 confusion matrix for binary labels where 1 is the positive class.

    A prediction counts as positive only when it equals 1; any other value is
    treated as negative. Iteration runs over ``len(y_test)`` positions.

    Returns
    -------
    list of list of int
        ``[[tn, fp], [fn, tp]]``.
    """
    tally = {"tn": 0, "fp": 0, "fn": 0, "tp": 0}

    for idx in range(len(y_test)):
        predicted = y_pred[idx]
        if predicted == y_test[idx]:
            # Correct prediction: true positive or true negative.
            outcome = "tp" if predicted == 1 else "tn"
        else:
            # Wrong prediction: false positive or false negative.
            outcome = "fp" if predicted == 1 else "fn"
        tally[outcome] += 1

    return [[tally["tn"], tally["fp"]],
            [tally["fn"], tally["tp"]]]

# Wrap the nested list in a NumPy array so it prints as an aligned 2x2 matrix:
# [[tn, fp], [fn, tp]].
print(np.array(cm(y_test, y_pred)))

[[2234   40]
 [  50  676]]
