### Imports

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [3]:
data_df = pd.read_csv("./Data/train.csv")

In [4]:
data_df.head()

Unnamed: 0,id,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,...,feat_85,feat_86,feat_87,feat_88,feat_89,feat_90,feat_91,feat_92,feat_93,target
0,1,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,Class_1
1,2,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Class_1
2,3,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Class_1
3,4,1,0,0,1,6,1,5,0,0,...,0,1,2,0,0,0,0,0,0,Class_1
4,5,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,Class_1


In [5]:
data_df.shape

(61878, 95)

### Null Counts Check

In [6]:
null_df = pd.DataFrame(data_df.isnull().sum())

In [7]:
null_df

Unnamed: 0,0
id,0
feat_1,0
feat_2,0
feat_3,0
feat_4,0
...,...
feat_90,0
feat_91,0
feat_92,0
feat_93,0


In [8]:
null_df.to_csv("./Output CSVs/null_df.csv")

### Class Distribution

In [8]:
data_df['target'].value_counts()

Class_2    16122
Class_6    14135
Class_8     8464
Class_3     8004
Class_9     4955
Class_7     2839
Class_5     2739
Class_4     2691
Class_1     1929
Name: target, dtype: int64

Imbalanced dataset with some major occupancy class

### Raw data input to model training

#### Encode target

In [9]:
le = preprocessing.LabelEncoder()

In [10]:
data_df['target'] = le.fit_transform(data_df['target'])

In [11]:
data_df.head()

Unnamed: 0,id,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,...,feat_85,feat_86,feat_87,feat_88,feat_89,feat_90,feat_91,feat_92,feat_93,target
0,1,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,4,1,0,0,1,6,1,5,0,0,...,0,1,2,0,0,0,0,0,0,0
4,5,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0


#### Target and feature split

In [12]:
X = data_df.drop(['target'],axis=1)
y = data_df['target']

In [13]:
X.shape

(61878, 94)

In [14]:
X.head()

Unnamed: 0,id,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,...,feat_84,feat_85,feat_86,feat_87,feat_88,feat_89,feat_90,feat_91,feat_92,feat_93
0,1,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,4,1,0,0,1,6,1,5,0,0,...,22,0,1,2,0,0,0,0,0,0
4,5,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0


In [15]:
y

0        0
1        0
2        0
3        0
4        0
        ..
61873    8
61874    8
61875    8
61876    8
61877    8
Name: target, Length: 61878, dtype: int32

#### Train Test Split

In [16]:
X_train, X_test, y_train, y_test = train_test_split(
...     X, y, test_size=0.30, random_state=42)

In [18]:
X_train.shape

(43314, 94)

In [19]:
X_test.shape

(18564, 94)

#### Random Forest

In [27]:
from sklearn.ensemble import RandomForestClassifier

In [28]:
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 42)
classifier.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=42)

In [29]:
from sklearn.metrics import classification_report

In [30]:
y_train_pred = classifier.predict(X_train)

In [33]:
y_train_pred

array([7, 1, 6, ..., 0, 1, 7])

In [34]:
from sklearn.metrics import accuracy_score

In [35]:
print("Training Metrics")
print(classification_report(y_train, y_train_pred))
print(accuracy_score(y_train, y_train_pred))

Training Metrics
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1342
           1       1.00      1.00      1.00     11322
           2       1.00      1.00      1.00      5643
           3       1.00      1.00      1.00      1904
           4       1.00      1.00      1.00      1894
           5       1.00      1.00      1.00      9845
           6       1.00      1.00      1.00      1993
           7       1.00      1.00      1.00      5974
           8       1.00      1.00      1.00      3397

    accuracy                           1.00     43314
   macro avg       1.00      1.00      1.00     43314
weighted avg       1.00      1.00      1.00     43314

0.9996075171999815


In [36]:
y_test_pred = classifier.predict(X_test)

In [37]:
print("Test Metrics")
print(classification_report(y_test, y_test_pred))
print(accuracy_score(y_test, y_test_pred))

Test Metrics
              precision    recall  f1-score   support

           0       0.97      0.95      0.96       587
           1       0.98      1.00      0.99      4800
           2       0.97      0.98      0.97      2361
           3       0.95      0.91      0.93       787
           4       0.99      0.99      0.99       845
           5       0.98      0.99      0.98      4290
           6       0.94      0.86      0.90       846
           7       0.97      0.98      0.97      2490
           8       0.98      0.96      0.97      1558

    accuracy                           0.98     18564
   macro avg       0.97      0.96      0.96     18564
weighted avg       0.98      0.98      0.98     18564

0.9753285929756518
