# Final Project

In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
from sklearn import preprocessing as prep
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix as cm
from tkinter import *
from tkinter import filedialog

## Data preprocessing

In [3]:
# load and the dataset
file_name=filedialog.askopenfilename()
data=pd.read_csv(file_name,skiprows=0,encoding='utf-8')

# Using MinMax to precroess the data
df=data.iloc[:,:-1]
df=(df-df.min(axis=0))/(df.max(axis=0)-df.min(axis=0))
data.iloc[:,:-1]=df

Shows the features of this dataset

In [11]:
print('The features of this dataset are: ')
print(data.iloc[:,:-1].columns.tolist())
print()
n_class=len(np.unique(data.iloc[:,-1].values))
print('The number of different types of forrest is: '+str(n_class))
print()
# initialize an empty list to store info
n_samples=[]
for i in range(n_class):
    df_sub=data[data['Cover_Type']==i+1]
    n_sample=df_sub.shape[0]
    n_samples.append(n_sample)
    
print('The number of samples in each category is: '+str(n_samples))

The features of this dataset are: 
['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm', 'Horizontal_Distance_To_Fire_Points', 'Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4', 'Soil_Type1', 'Soil_Type2', 'Soil_Type3', 'Soil_Type4', 'Soil_Type5', 'Soil_Type6', 'Soil_Type7', 'Soil_Type8', 'Soil_Type9', 'Soil_Type10', 'Soil_Type11', 'Soil_Type12', 'Soil_Type13', 'Soil_Type14', 'Soil_Type15', 'Soil_Type16', 'Soil_Type17', 'Soil_Type18', 'Soil_Type19', 'Soil_Type20', 'Soil_Type21', 'Soil_Type22', 'Soil_Type23', 'Soil_Type24', 'Soil_Type25', 'Soil_Type26', 'Soil_Type27', 'Soil_Type28', 'Soil_Type29', 'Soil_Type30', 'Soil_Type31', 'Soil_Type32', 'Soil_Type33', 'Soil_Type34', 'Soil_Type35', 'Soil_Type36', 'Soil_Type37', 'Soil_Type38', 'Soil_Type39', 'Soil_Type40']

The number of different types of forrest is: 7

The number of sam

This dataset is highly skwed and thus we need to sample from each category to construct the new dataset for both training and testing purpose. We'll run the sampling policy 5 times and use the average score as the final result

In [20]:
d=[]
# in each category, we sample 2000 samples
num_samples=2000
for i in range(n_class):
    df_sub=data[data['Cover_Type']==i+1]
    # sample this subset
    df_sub=df_sub.sample(num_samples)
    # reset the index
    df_sub=df_sub.reset_index(drop=True,inplace=False)
    # append it into list
    d.append(df_sub)   

# concat sublist together
d_init=d[0]
for i in range(1,n_class):
    # update the initial dataset
    d_init=pd.concat(objs=[d_init,d[i]],axis=0)
    # reset index
    d_init=d_init.reset_index(drop=True,inplace=False)
    
# update the dataset
d=d_init
print('The sampling from original dataset is finished')

The sampling from original dataset is finished


The next thing is to seperate the dataset into training and test set. 80% of training set and 20% of test set

In [27]:
# split the dataset into input X and outcome Y
X=d.iloc[:,:-1]
Y=d.iloc[:,-1]
# apply one-hot encoding on Y
y=Y.values
Y=np.zeros((len(y),n_class))
for i in range(Y.shape[0]):
    ind=int(y[i])
    Y[i,ind-1]=1
    
# apply the train test split
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,shuffle=True)

print('The shape of training set input is: '+str(X_train.shape))
print('The shape of training set outcome is: '+str(Y_train.shape))
print('The shape of test set input is: '+str(X_test.shape))
print('The shape of test set outcome is: '+str(Y_test.shape))

The shape of training set input is: (11200, 54)
The shape of training set outcome is: (11200, 7)
The shape of test set input is: (2800, 54)
The shape of test set outcome is: (2800, 7)


## Apply classification algorithm

In this section, we'll implement three different algorithms

1. KNN
2. SVM
3. Logistic regression

### KNN

In [52]:
# apply knn algorithm
from sklearn.neighbors import KNeighborsClassifier

# try with 10 different values of k to find the best one
Ks=10
mean_acc=np.zeros((Ks-1))
std_acc=np.zeros((Ks-1))
CM=[];
mean_acc=np.zeros((Ks,))
for n in range(1,Ks+1):
    #Train Model and Predict  
    neigh=KNeighborsClassifier(n_neighbors=n).fit(X_train,Y_train)
    y_hat=neigh.predict(X_test)
    # get the accuracy rate
    y_pred=np.argmax(y_hat,axis=1)
    y_true=np.argmax(Y_test,axis=1)
    acc=np.sum((y_true==y_pred)*1)/Y_test.shape[0]
    mean_acc[n-1]=acc
    # get the confusion matrix
    d=cm(y_true,y_pred)
    CM.append(d)

In [60]:
# test result 
print('The best number of neighbor is: '+str(np.argmax(mean_acc)+1))
print('And the correspond accuracy rate is: '+str(mean_acc[np.argmax(mean_acc)]))
print()
print('The best confusion matrxi is: ')
print(CM[np.argmax(mean_acc)])

The best number of neighbor is: 1
And the correspond accuracy rate is: 0.8167857142857143

The best confusion matrxi is: 
[[293  74   0   0  11   3  13]
 [ 87 260  11   0  39  10   0]
 [  0   6 291  35   7  78   0]
 [  0   0  19 413   0  14   0]
 [  5  19   4   0 355   2   0]
 [  0   3  42   4   5 315   0]
 [ 17   5   0   0   0   0 360]]


### SVM

In [76]:
# apply SVM algorithm
from sklearn import svm
clf=svm.LinearSVC(penalty='l1',dual=False)
clf.fit(X_train,np.argmax(Y_train,axis=1))
y_hat=clf.predict(X_test)

# get the accuracy rate
y_pred=y_hat
y_true=np.argmax(Y_test,axis=1)
acc=np.sum((y_true==y_pred)*1)/Y_test.shape[0]
# get the confusion matrix
d=cm(y_true,y_pred)

In [77]:
# test result
print('The test accuracy is: '+str(acc))
print()
print('The confusion matrix is: ')
print(d)

The test accuracy is: 0.6778571428571428

The confusion matrix is: 
[[248  64   0   0  34   4  44]
 [ 73 213  10   0  98  10   3]
 [  0   2 206  57  18 134   0]
 [  0   0  27 393   0  26   0]
 [ 12  55  27   0 274  17   0]
 [  0  15  60  26  35 233   0]
 [ 46   2   1   0   2   0 331]]


### Logistic regression

In [85]:
# apply logistic regression
from sklearn.linear_model import LogisticRegression
LogR=LogisticRegression(penalty='l2',C=0.01,solver='lbfgs',multi_class='multinomial').fit(X_train,np.argmax(Y_train,axis=1))
y_hat=LogR.predict(X_test)

# get the accuracy rate
y_pred=y_hat
y_true=np.argmax(Y_test,axis=1)
acc=np.sum((y_true==y_pred)*1)/Y_test.shape[0]
# get the confusion matrix
d=cm(y_true,y_pred)

In [86]:
# test result
print('The test accuracy is: '+str(acc))
print()
print('The confusion matrix is: ')
print(d)

The test accuracy is: 0.6075

The confusion matrix is: 
[[174 142   0   0  36   2  40]
 [ 53 248   6   2  84  10   4]
 [  0   2 153  89  28 145   0]
 [  0   0  24 381   0  41   0]
 [ 32  78  30   0 234  11   0]
 [  2  15  45  61  51 195   0]
 [ 32  31   1   0   2   0 316]]
