In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import preprocessing
from sklearn.metrics import accuracy_score

1. Data preprocessing: feature selection / data normalization
2. Categorize labels
3. Training models

#### Import csv data as dataframe

In [5]:
# read_csv already returns a DataFrame, so wrapping it in pd.DataFrame(...) is redundant.
data = pd.read_csv("basicFeatureX.csv")

In [6]:
# Preview the first five rows (head() defaults to n=5).
data.head()

Unnamed: 0.1,Unnamed: 0,v_5057,v_5060,v_5072,v_5069,v_5041,v_5070,v_5058,v_5043,v_5078,...,['AGBP: Fysiotherapeuten'],v_959 NULL,['BIGP: Fysiotherapeuten'],['KWAL_P: Fysiotherapie'],v_5192 NULL,v_993 NULL,v_5193 NULL,v_994 NULL,v_5195 NULL,['AGBP: Verpleegkundigen']
0,0,4.0,4.0,4.0,4.0,0.0,1.5,1.5,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0
1,1,8067.0,8446.0,9580.5,10055.0,1987.0,6941.08,5830.56,2745.0,100.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,437.0,430.0,430.5,438.0,126.0,162.67,162.0,112.0,100.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,28.0,28.0,28.0,28.0,8.0,13.08,13.08,8.0,100.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,47.0,46.0,46.0,47.0,5.0,40.11,40.11,3.0,0.0,...,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0


#### Check the features of this dataset

In [7]:
# List every column label of the loaded frame.
data.columns

Index(['Unnamed: 0', 'v_5057', 'v_5060', 'v_5072', 'v_5069', 'v_5041',
       'v_5070', 'v_5058', 'v_5043', 'v_5078', 'v_5076', 'v_5055', 'v_5053',
       'v_958', 'v_5042', 'v_5044', 'v_959', 'v_5056', 'v_4395', 'v_4385',
       'v_4386', 'v_4396', 'v_5054',
       '['AGBin: Gecombineerde Verpleeginrichtingen']', 'v_993', 'v_4397',
       'v_4398', '['AGBin: Koepels en Beheerstichtingen WLZ']', 'v_5135',
       'v_5085', '['SBI: 87']', 'v_5081', '['AGBin: Ziekenhuizen']', 'v_994',
       'v_5076 NULL', 'v_5078 NULL', 'v_958 NULL', 'v_5077 NULL',
       'v_5196 NULL', 'v_5194 NULL', '['AGBP: Fysiotherapeuten']',
       'v_959 NULL', '['BIGP: Fysiotherapeuten']', '['KWAL_P: Fysiotherapie']',
       'v_5192 NULL', 'v_993 NULL', 'v_5193 NULL', 'v_994 NULL', 'v_5195 NULL',
       '['AGBP: Verpleegkundigen']'],
      dtype='object')

#### Check if there are any null values

In [8]:
# Count missing (NaN) entries per column.
data.isna().sum()

Unnamed: 0                                       0
v_5057                                           0
v_5060                                           0
v_5072                                           0
v_5069                                           0
v_5041                                           0
v_5070                                           0
v_5058                                           0
v_5043                                           0
v_5078                                           0
v_5076                                           0
v_5055                                           0
v_5053                                           0
v_958                                            0
v_5042                                           0
v_5044                                           0
v_959                                            0
v_5056                                           0
v_4395                                           0
v_4385                         

We can see that there are no NaN values in the data; therefore, no further processing is needed in this regard.

#### Remove redundant features that are not necessary for this task

In [9]:
# Columns excluded from modelling, grouped by their naming pattern.

# Unlabelled column from the CSV — presumably a saved row index; confirm.
unnamed_cols = ["Unnamed: 0"]

# Columns whose labels are bracketed, list-like strings.
bracket_named_cols = [
    "['AGBin: Gecombineerde Verpleeginrichtingen']",
    "['AGBin: Koepels en Beheerstichtingen WLZ']",
    "['SBI: 87']",
    "['AGBin: Ziekenhuizen']",
    "['AGBP: Fysiotherapeuten']",
    "['BIGP: Fysiotherapeuten']",
    "['KWAL_P: Fysiotherapie']",
    "['AGBP: Verpleegkundigen']",
]

# Columns whose labels carry the " NULL" suffix.
null_suffixed_cols = [
    "v_5076 NULL",
    "v_5078 NULL",
    "v_958 NULL",
    "v_5077 NULL",
    "v_5196 NULL",
    "v_5194 NULL",
    "v_959 NULL",
    "v_5192 NULL",
    "v_993 NULL",
    "v_5193 NULL",
    "v_994 NULL",
    "v_5195 NULL",
]

data = data.drop(columns=unnamed_cols + bracket_named_cols + null_suffixed_cols)

#### Split data into features (X) and labels (y)

In [10]:
# Separate the target column from the remaining feature columns.
# NOTE(review): in the previewed rows v_5057 / v_5060 / v_5072 are almost equal to
# v_5069 — confirm they are legitimate predictors and not near-copies of the target.
target_column = "v_5069"
X = data.drop(columns=target_column)
y = data[target_column]

In [11]:
# Preview the first five rows of the feature matrix.
X.head()

Unnamed: 0,v_5057,v_5060,v_5072,v_5041,v_5070,v_5058,v_5043,v_5078,v_5076,v_5055,...,v_4386,v_4396,v_5054,v_993,v_4397,v_4398,v_5135,v_5085,v_5081,v_994
0,4.0,4.0,4.0,0.0,1.5,1.5,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0
1,8067.0,8446.0,9580.5,1987.0,6941.08,5830.56,2745.0,100.0,85.5,4227.0,...,226200000.0,295265000.0,2995.55,93262000.0,8093000.0,8831000.0,77.0,0.0,19.84,58885000.0
2,437.0,430.0,430.5,126.0,162.67,162.0,112.0,100.0,82.0,112.0,...,6460035.0,8456535.0,29.67,1146546.0,160788.0,182700.0,2.0,10.0,43.0,757566.0
3,28.0,28.0,28.0,8.0,13.08,13.08,8.0,100.0,90.0,8.0,...,526588.0,740820.0,3.11,73841.0,259287.0,153338.0,3.0,10.0,15.0,265314.0
4,47.0,46.0,46.0,5.0,40.11,40.11,3.0,0.0,0.0,3.0,...,0.0,0.0,4.55,0.0,0.0,0.0,0.0,99.9,0.0,0.0


#### Get the maximum and minimum number of employees in order to decide the output categories

In [12]:
# Largest value in y (upper end of the label range).
y.max()

15958.0

In [13]:
# Smallest value in y (lower end of the label range).
y.min()

0.0

#### Categorize labels: 0-10 / 10-100 / 100-1000 / 1000-10000 / 10000-20000 (each bin includes its upper bound; the first bin also includes 0). Since the maximum number of employees is 15958, we set the cap to 20000.


In [14]:
# Bin edges for the target; pd.cut uses right-closed intervals by default, and
# include_lowest=True additionally closes the first interval on the left so that
# a value of exactly 0 lands in category 1 instead of becoming NaN.
bins = [0.0, 10.0, 100.0, 1000.0, 10000.0, 20000.0]
# One integer label per interval, in ascending order.
groups = list(range(1, 6))
catgory_label = pd.cut(
    y,
    bins=bins,
    labels=groups,
    include_lowest=True,
)

In [15]:
# Number of rows that fall into each category.
catgory_label.value_counts()

1    1103
2     660
3     354
4     211
5       6
Name: v_5069, dtype: int64

From the frequencies, we can see that only 6 companies have over 10000 employees, so it is reasonable to use a single large interval for values above 10000.

#### Normalize features

In [16]:
# Rescale every feature column to the [0, 1] range with min-max scaling.
# NOTE(review): the scaler is fitted on all rows before the train/test split, so the
# test rows' minima/maxima influence the transform; fitting on X_train only (and
# applying transform() to X_test) would avoid that leakage.
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(X.values)
# Rebuild the frame with the original column labels and index, so features stay
# identifiable downstream instead of becoming anonymous 0..n-1 columns.
X = pd.DataFrame(x_scaled, columns=X.columns, index=X.index)

#### Split data into train and test set

In [17]:
# A fixed seed makes the split (and every score computed from it) reproducible.
# Stratifying on the label keeps the rare top category — only a handful of rows —
# represented in both the training and the test partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    catgory_label,
    test_size=0.2,
    random_state=42,
    stratify=catgory_label,
)

#### Build a classifier for the ordinal size categories using the KNN algorithm

In [18]:
# k-nearest-neighbours classifier with k = 3.
clf_knn = KNeighborsClassifier(n_neighbors=3)

In [19]:
# Fit the KNN model on the training partition.
clf_knn.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=3)

In [20]:
# Predict categories for the held-out test rows.
# Note: `y_pred` is reused by the other models' cells further down.
y_pred = clf_knn.predict(X_test)

In [21]:
# Score the KNN model from its own predictions, so the result cannot be mixed up
# with a `y_pred` left over from another model's cell when cells run out of order.
accuracy_score(y_test, clf_knn.predict(X_test))

0.7944325481798715

#### Build a classifier for the ordinal size categories using Naive Bayes

In [24]:
# Gaussian Naive Bayes classifier with default settings.
clf_gnb = GaussianNB()

In [25]:
# Fit the Naive Bayes model on the training partition.
clf_gnb.fit(X_train, y_train)

GaussianNB()

In [26]:
# Predict categories for the held-out test rows (overwrites the earlier `y_pred`).
y_pred = clf_gnb.predict(X_test)

In [27]:
# Score the Naive Bayes model from its own predictions, so the result cannot be
# mixed up with a `y_pred` left over from another model's cell when cells run out of order.
accuracy_score(y_test, clf_gnb.predict(X_test))

0.9014989293361885

#### Build a classifier for the ordinal size categories using Random Forest

In [327]:
# Random forest of 100 trees; the fixed seed makes bootstrapping and feature
# sampling repeatable, so the reported accuracy can be reproduced on re-run.
clf_rf = RandomForestClassifier(n_estimators=100, random_state=42)

In [328]:
# Fit the random forest on the training partition.
clf_rf.fit(X_train, y_train)

RandomForestClassifier()

In [329]:
# Predict categories for the held-out test rows (overwrites the earlier `y_pred`).
y_pred = clf_rf.predict(X_test)

In [330]:
# Score the random forest from its own predictions, so the result cannot be mixed
# up with a `y_pred` left over from another model's cell when cells run out of order.
accuracy_score(y_test, clf_rf.predict(X_test))

0.9764453961456103

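From the results above, we can see that random forest performs best, with accuracy above 97% on this train/test split. Therefore, random forest can be used to construct a model that predicts the (ordinal) employee-count category. Note that the classifiers used here treat the categories as unordered classes, and that the score comes from a single split.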