# Machine Learning Exercise 1 - Classification

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import cross_val_score

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

## Mushroom Edibility

In [2]:
mushrooms = pd.read_csv('./mushrooms/mushrooms.csv')
# encode labels
mushrooms = mushrooms.apply(LabelEncoder().fit_transform)

mushrooms_X = mushrooms.drop('edibility', axis=1)
mushrooms_y = mushrooms['edibility']

mushrooms_X_train, mushrooms_X_test, mushrooms_y_train, mushrooms_y_test = train_test_split(mushrooms_X, mushrooms_y)

In [3]:
# model 1: KNN
mushrooms_KNN_model = KNeighborsClassifier(n_neighbors=5)
mushrooms_KNN_model.fit(mushrooms_X_train, mushrooms_y_train)
mushrooms_KNN_prediction = mushrooms_KNN_model.predict(mushrooms_X_test)
mushrooms_KNN_accuracy = accuracy_score(mushrooms_KNN_prediction, mushrooms_y_test)
print('KNN accuracy:', mushrooms_KNN_accuracy)

KNN accuracy: 1.0


In [4]:
# model 2: Decision Tree
mushrooms_tree_model = DecisionTreeClassifier(random_state=0)
mushrooms_tree_model.fit(mushrooms_X_train, mushrooms_y_train)
mushrooms_tree_prediction = mushrooms_tree_model.predict(mushrooms_X_test)
mushrooms_tree_accuracy = accuracy_score(mushrooms_tree_prediction, mushrooms_y_test)
print('DecisionTree accuracy:', mushrooms_tree_accuracy)

DecisionTree accuracy: 1.0


In [5]:
# model 3: Multi-Layer Perceptron
mushrooms_mlp_model = MLPClassifier(random_state=0)
mushrooms_mlp_model.fit(mushrooms_X_train, mushrooms_y_train)
mushrooms_mlp_prediction = mushrooms_mlp_model.predict(mushrooms_X_test)
mushrooms_mlp_accuracy = accuracy_score(mushrooms_mlp_prediction, mushrooms_y_test)
print('MultiLayerPerceptron accuracy:', mushrooms_mlp_accuracy)

MultiLayerPerceptron accuracy: 1.0


## Soybeans

In [6]:
soybeans = pd.read_csv('./soybeans/soybean.csv')

# encode labels
soybeans = soybeans.apply(LabelEncoder().fit_transform)

# handle missing values by dropping, see pdf for more info
#soybeans[soybeans.precip != "?"]
#soybeans.precip == "?"

soybeans_X = soybeans.drop('class', axis=1)
soybeans_y = soybeans['class']

soybeans_X_train, soybeans_X_test, soybeans_y_train, soybeans_y_test = train_test_split(soybeans_X, soybeans_y)

In [7]:
# model 1: KNN
soybeans_KNN_model = KNeighborsClassifier(n_neighbors=5)
soybeans_KNN_model.fit(soybeans_X_train, soybeans_y_train)
soybeans_KNN_prediction = soybeans_KNN_model.predict(soybeans_X_test)
soybeans_KNN_accuracy = accuracy_score(soybeans_KNN_prediction, soybeans_y_test)
print('KNN accuracy:', soybeans_KNN_accuracy)

KNN accuracy: 0.8187134502923976


In [8]:
# model 2: Decision Tree
soybeans_tree_model = DecisionTreeClassifier()
soybeans_tree_model.fit(soybeans_X_train, soybeans_y_train)
soybeans_tree_prediction = soybeans_tree_model.predict(soybeans_X_test)
soybeans_tree_accuracy = accuracy_score(soybeans_tree_prediction, soybeans_y_test)
print('DecisionTree accuracy:', soybeans_tree_accuracy)

DecisionTree accuracy: 0.9064327485380117


In [9]:
# model 3: Multi-Layer Perceptron
soybeans_mlp_model = MLPClassifier()
soybeans_mlp_model.fit(soybeans_X_train, soybeans_y_train)
soybeans_mlp_prediction = soybeans_mlp_model.predict(soybeans_X_test)
soybeans_mlp_accuracy = accuracy_score(soybeans_mlp_prediction, soybeans_y_test)
print('MultiLayerPerceptron accuracy:', soybeans_mlp_accuracy)

MultiLayerPerceptron accuracy: 0.9181286549707602




## Breast Cancer Data

In [10]:
breastcancer_train = pd.read_csv('./breastcancer/breast-cancer-diagnostic.shuf.lrn.csv')
breastcancer_sol_input = pd.read_csv('./breastcancer/breast-cancer-diagnostic.shuf.tes.csv')


breastcancer_train = breastcancer_train.drop('ID', axis=1)
breastcancer_sol_input = breastcancer_sol_input.drop('ID', axis=1)

breastcancer_X = breastcancer_train.drop('class', axis=1)
breastcancer_y = breastcancer_train['class']

breastcancer_X_train, breastcancer_X_test, breastcancer_y_train, breastcancer_y_test  = train_test_split(breastcancer_X, breastcancer_y)

In [11]:
# model 1: KNN
breastcancer_KNN_model = KNeighborsClassifier(n_neighbors=5)
breastcancer_KNN_model.fit(breastcancer_X_train, breastcancer_y_train)
breastcancer_KNN_prediction = breastcancer_KNN_model.predict(breastcancer_X_test)
breastcancer_KNN_accuracy = accuracy_score(breastcancer_KNN_prediction, breastcancer_y_test)
print('KNN accuracy:', breastcancer_KNN_accuracy)

KNN accuracy: 0.9444444444444444


In [12]:
# model 2: Decision Tree
breastcancer_tree_model = DecisionTreeClassifier(random_state=0)
breastcancer_tree_model.fit(breastcancer_X_train, breastcancer_y_train)
breastcancer_tree_prediction = breastcancer_tree_model.predict(breastcancer_X_test)
breastcancer_tree_accuracy = accuracy_score(breastcancer_tree_prediction, breastcancer_y_test)
print('DecisionTree accuracy:', breastcancer_tree_accuracy)

DecisionTree accuracy: 0.9166666666666666


In [13]:
# model 3: Multi-Layer Perceptron
breastcancer_mlp_model = MLPClassifier(random_state=0)
breastcancer_mlp_model.fit(breastcancer_X_train, breastcancer_y_train)
breastcancer_mlp_prediction = breastcancer_mlp_model.predict(breastcancer_X_test)
breastcancer_mlp_accuracy = accuracy_score(breastcancer_mlp_prediction, breastcancer_y_test)
print('MultiLayerPerceptron accuracy:', breastcancer_mlp_accuracy)

MultiLayerPerceptron accuracy: 0.9305555555555556


In [14]:
# use the solution input
breastcancer_KNN_prediction = breastcancer_KNN_model.predict(breastcancer_sol_input)
#breastcancer_KNN_accuracy = accuracy_score(breastcancer_KNN_prediction, breastcancer_y_test)
print('Solution prediction:', breastcancer_KNN_prediction)

Solution prediction: [ True False False False False  True  True False False False False False
  True  True  True False False False False False  True  True False False
 False False False False False  True False False  True  True False False
  True False False  True False False  True False  True  True False False
 False  True  True False False False  True  True False False  True False
 False  True  True False False False False  True False False False  True
 False  True  True  True False False  True False False  True False  True
  True False False False False False  True False False False False False
 False False  True False  True False False  True False  True  True False
 False False False False  True False False  True False False False False
 False False  True False False False  True  True  True False False  True
  True False False False  True False  True False  True  True  True False
 False  True False False False False False  True  True False  True False
 False  True  True False  True

## Purchase Data

In [15]:
purchase_train = pd.read_csv('./purchase/purchase600-100cls-15k.lrn.csv')
purchase_sol_input = pd.read_csv('./purchase/purchase600-100cls-15k.tes.csv')

# labels do not need to be encoded, inputs are numeric
#purchase_train = purchase_train.apply(LabelEncoder().fit_transform)

purchase_train = purchase_train.drop('ID', axis=1)
purchase_sol_input = purchase_sol_input.drop('ID', axis=1)

purchase_X = purchase_train.drop('class', axis=1)
purchase_y = purchase_train['class']

purchase_X_train, purchase_X_test, purchase_y_train, purchase_y_test  = train_test_split(purchase_X, purchase_y)

In [16]:
# model 1: KNN
purchase_KNN_model = KNeighborsClassifier(n_neighbors=5)
purchase_KNN_model.fit(purchase_X_train, purchase_y_train)
purchase_KNN_prediction = purchase_KNN_model.predict(purchase_X_test)
purchase_KNN_accuracy = accuracy_score(purchase_KNN_prediction, purchase_y_test)
print('KNN accuracy:', purchase_KNN_accuracy)

KNN accuracy: 0.2512


In [17]:
# model 2: Decision Tree
purchase_tree_model = DecisionTreeClassifier(random_state=0)
purchase_tree_model.fit(purchase_X_train, purchase_y_train)
purchase_tree_prediction = purchase_tree_model.predict(purchase_X_test)
purchase_tree_accuracy = accuracy_score(purchase_tree_prediction, purchase_y_test)
print('DecisionTree accuracy:', purchase_tree_accuracy)

DecisionTree accuracy: 0.078


In [18]:
# model 3: Multi-Layer Perceptron
purchase_mlp_model = MLPClassifier(random_state=0)
purchase_mlp_model.fit(purchase_X_train, purchase_y_train)
purchase_mlp_prediction = purchase_mlp_model.predict(purchase_X_test)
purchase_mlp_accuracy = accuracy_score(purchase_mlp_prediction, purchase_y_test)
print('MultiLayerPerceptron accuracy:', purchase_mlp_accuracy)

MultiLayerPerceptron accuracy: 0.658


In [19]:
# use the solution input
purchase_KNN_prediction = purchase_KNN_model.predict(purchase_sol_input)
#breastcancer_KNN_accuracy = accuracy_score(breastcancer_KNN_prediction, breastcancer_y_test)
print('Solution prediction:', purchase_KNN_prediction)

Solution prediction: [82 62 36 ... 77 54 81]
