In [1]:
from decision_trees import DecisionTree
from random_forest import RandomForest
from svm import SupportVectorMachines
from knn import KNearestNeighbors
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import time

In [2]:
df = pd.read_csv('newhousing.txt', delimiter=" ")
df_newhousing = pd.read_csv('newhousing.txt', delimiter=" ")
df_spam = pd.read_csv('spam.txt', delimiter=" ")

In [3]:
X, y = df.loc[:, df.columns != 'pricelevel'], df.loc[:, df.columns == 'pricelevel']
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=400/df.shape[0], random_state=1, shuffle = True, 
                                                    stratify = y)
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = np.array(X_train), np.array(X_test), np.array(y_train).reshape(len(y_train,)), np.array(y_test).reshape(len(y_test,))

### Decision Trees

In [33]:
dt = DecisionTree(X_train, y_train)

Fitting 5 folds for each of 4788 candidates, totalling 23940 fits


In [34]:
dt.best_params

{'criterion': 'entropy',
 'max_depth': 5,
 'min_samples_leaf': 4,
 'min_samples_split': 14,
 'random_state': 0}

In [35]:
dt.model

DecisionTreeClassifier(criterion='entropy', max_depth=5, min_samples_leaf=4,
                       min_samples_split=14, random_state=0)

In [36]:
dt.get_accuracy(X_test, y_test)

0.7830188679245284

### Random Forest

In [4]:
rf = RandomForest(X_train, y_train)

Fitting 5 folds for each of 28 candidates, totalling 140 fits


In [5]:
rf.best_params

{'criterion': 'gini', 'n_estimators': 100, 'random_state': 0}

In [6]:
rf.model

RandomForestClassifier(random_state=0)

In [7]:
rf.get_accuracy(X_test, y_test)

0.8301886792452831

### SVM

In [3]:
X, y = df_spam.loc[:, df_spam.columns != 'type'], df_spam.loc[:, df_spam.columns == 'type']
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=500/df_spam.shape[0], random_state=1, shuffle = True, 
                                                    stratify = y)
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = np.array(X_train), np.array(X_test), np.array(y_train).reshape(len(y_train,)), np.array(y_test).reshape(len(y_test,))

In [4]:
svm = SupportVectorMachines(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [5]:
svm.best_params

{'C': 1, 'kernel': 'linear', 'random_state': 0}

In [6]:
svm.model

SVC(C=1, kernel='linear', random_state=0)

In [7]:
svm.get_accuracy(X_test, y_test)

0.9083150451109485

### KNN

In [9]:
X, y = df_newhousing.loc[:, df_newhousing.columns != 'pricelevel'], df_newhousing.loc[:, df_newhousing.columns == 'pricelevel']
# scaling explanatory variables
scaler = StandardScaler()
X = scaler.fit_transform(X)
# splitting dataset into train- and testdata
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=400/df_newhousing.shape[0], random_state=0, shuffle = True, 
                                                    stratify = y)
# reshaping such that it matches the requirements of sklearn
X_train, X_test, y_train, y_test = np.array(X_train), np.array(X_test), np.array(y_train).reshape(len(y_train,)), np.array(y_test).reshape(len(y_test,))

In [10]:
knn = KNearestNeighbors(X_train, y_train)

Fitting 10 folds for each of 4263 candidates, totalling 42630 fits


In [11]:
knn.best_params

{'leaf_size': 1, 'n_neighbors': 8, 'p': 2}

In [12]:
knn.model

KNeighborsClassifier(leaf_size=1, n_neighbors=8)

In [13]:
knn.get_accuracy(X_test, y_test)

0.7735849056603774