## Classifiers by Scikit-Learn

### Data Preparation 

In [1]:
import csv
import random
import pandas as pd
from sklearn import tree
from sklearn import ensemble
from sklearn import metrics
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

In [2]:
def loadDataset(split, trainingSet=[], testSet=[]):
    with open("Dataset.csv", "r") as f:
        reader = csv.reader(f)
        dataset = [row for row in reader]
        dataset.pop(0)
    
        for x in range(len(dataset)-1):
            for y in range(9):
                dataset[x][y] = float(dataset[x][y])
                if random.random() < split:
                    trainingSet.append(dataset[x])
                else:
                    testSet.append(dataset[x])
    
    return trainingSet, testSet        

In [3]:
trainingSet = []
testSet = []
loadDataset(0.67, trainingSet, testSet)
pd.DataFrame(testSet).head()

x_train = []
for i in trainingSet:
    x_temp = i[:-1]
    x_train.append(x_temp)
    
y_train = []
for i in trainingSet:
    y_temp = i[-1]
    y_train.append(y_temp)
    
x_test = []
for i in testSet:
    x_temp = i[:-1]
    x_test.append(x_temp)
    
y_test = []
for i in testSet:
    y_temp = i[-1]
    y_test.append(y_temp)

### Decision Tree Classifiers

In [4]:
def dt_cls():
    clf = tree.DecisionTreeClassifier()
    data_clf = clf.fit(x_train, y_train)

    y_test_predictions = data_clf.predict(x_test)
    accuracy = metrics.accuracy_score(y_test, y_test_predictions)
    
    print('Accuracy: '+ '{0:.2f}'.format(accuracy * 100) + '% ')

In [5]:
dt_cls()

Accuracy: 95.80% 


### Random Forest Classifier

In [6]:
def rf_cls():
    forest = ensemble.RandomForestClassifier(n_estimators = 100)
    forest_fit = forest.fit(x_train, y_train)

    y_test_predicted = forest.predict(x_test)
    accuracy = metrics.accuracy_score(y_test, y_test_predicted)
    print('Accuracy: '+ '{0:.2f}'.format(accuracy * 100) + '% ')

In [7]:
rf_cls()

Accuracy: 95.83% 


### Ada Boost Classifier

In [8]:
def adaBoost_cls():
    clf = ensemble.AdaBoostClassifier(n_estimators = 100)
    boost_fit = clf.fit(x_train, y_train)
    
    y_test_predicted = boost_fit.predict(x_test)
    accuracy = metrics.accuracy_score(y_test, y_test_predicted)
    print('Accuracy: '+ '{0:.2f}'.format(accuracy * 100) + '% ')

In [9]:
adaBoost_cls()

Accuracy: 85.21% 


### Bagging Classifier

In [10]:
def bag_cls():
    bag = ensemble.BaggingClassifier(n_estimators = 100)
    bag_fit = bag.fit(x_train, y_train)
    
    y_test_predicted = bag_fit.predict(x_test)
    accuracy = metrics.accuracy_score(y_test, y_test_predicted)
    print('Accuracy: '+ '{0:.2f}'.format(accuracy * 100) + '% ')

In [11]:
bag_cls()

Accuracy: 95.80% 


### Linear Discriminant Analysis

In [12]:
def linearDisc_cls():
    lda = LDA()
    lda_fit = lda.fit(x_train, y_train)
    
    y_test_predicted = lda_fit.predict(x_test)
    accuracy = metrics.accuracy_score(y_test, y_test_predicted)
    print('Accuracy: '+ '{0:.2f}'.format(accuracy * 100) + '% ')

In [13]:
linearDisc_cls()

Accuracy: 82.83% 
