# Machine Learning
## Assignment 1
### Morgan Reilly -- 20235398

## References:
* https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html#tutorial-setup
* https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html
* https://scikit-learn.org/stable/modules/model_evaluation.html#classification-metrics

## Imports

In [1]:
import pandas as pd
import sklearn as sk
import numpy as np
import csv

from sklearn.model_selection import cross_validate
from sklearn.metrics import confusion_matrix
from sklearn import tree

## Text Processing

* Create csv file from text file
* Read in text file and transpose to csv
* Do this for both test and train data

#### CSV Generation

In [2]:
"""
Read CSV
    Load csv from disk, pass in output location
    Convert to CSV and return
"""
def read_csv(file_in, file_out):
    data = pd.read_csv(file_in, sep="\t", header=None)
    data.columns = ['calorific_value', 'nitrogen', 'turbidity',
                        'style', 'alcohol', 'sugars', 'bitterness',
                        'beer_id', 'colour', 'degree_of_fermentation']
    data.to_csv(file_out, index = None)
    data = pd.read_csv(file_out)
    return data

#### Training Data

In [3]:
# Read in and store training data as datafram
training_data = read_csv("data/beer_training.txt", "data/beer_training.csv")
training_data.describe()

Unnamed: 0,calorific_value,nitrogen,turbidity,alcohol,sugars,bitterness,beer_id,colour,degree_of_fermentation
count,124.0,124.0,124.0,124.0,124.0,124.0,124.0,124.0,124.0
mean,41.638096,0.322277,1.858922,3.998486,17.365645,8.935387,90.032258,11.321226,67.143733
std,2.623812,0.135035,0.891977,0.2505,1.113639,3.419934,50.83337,2.842828,5.862946
min,37.075221,0.107013,0.290909,3.393846,15.74,2.730211,1.0,5.76,53.668571
25%,38.966814,0.215891,1.192727,3.801538,16.56,6.140829,44.5,9.0,63.336071
50%,41.809735,0.29015,1.72,4.015385,16.9,9.436842,94.5,11.52,66.663571
75%,44.022124,0.405223,2.373409,4.219231,18.15,11.476408,133.5,13.23,70.733929
max,45.836283,0.742774,4.255455,4.427692,20.65,20.063789,177.0,20.52,87.238571


#### Test Data

In [4]:
# Read in and store test data as data-frame
test_data = read_csv("data/beer_test.txt", "data/beer_test.csv")
test_data.describe()

Unnamed: 0,calorific_value,nitrogen,turbidity,alcohol,sugars,bitterness,beer_id,colour,degree_of_fermentation
count,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0
mean,41.722714,0.315067,1.993182,4.037641,17.213,8.463911,91.733333,11.408,61.087095
std,2.616999,0.106278,1.063155,0.246711,1.140551,3.035596,51.831247,2.599807,11.134063
min,37.29646,0.156837,0.689091,3.526154,15.9,2.924895,4.0,6.84,37.034286
25%,39.973451,0.237648,1.345227,3.879231,16.6475,6.341276,54.5,9.39,53.237143
50%,41.522124,0.271882,1.771818,4.033846,16.73,7.855421,87.0,11.76,59.157143
75%,43.977876,0.395544,2.2525,4.173846,17.7525,10.727566,131.75,13.35,73.33
max,45.880531,0.525512,5.217273,4.563077,20.51,13.456368,178.0,15.36,79.134286


In [5]:
training_data.count()

calorific_value           124
nitrogen                  124
turbidity                 124
style                     124
alcohol                   124
sugars                    124
bitterness                124
beer_id                   124
colour                    124
degree_of_fermentation    124
dtype: int64

In [6]:
test_data.count()

calorific_value           30
nitrogen                  30
turbidity                 30
style                     30
alcohol                   30
sugars                    30
bitterness                30
beer_id                   30
colour                    30
degree_of_fermentation    30
dtype: int64

## Classification

In [7]:
"""
Generate Samples
    Takes a pandas data frame
    Strips the labels
    Returns list
"""
def gen_samples(df):
    n_samples = []
    for col, row in df.iterrows():
        rows = [row[0], row[1], row[2], row[4], row[5], 
            row[6], row[7], row[8], row[9]]
        n_samples.append(rows)
    return n_samples

"""
Feature Generation
    Takes pandas data frame
    Isolates clasifing label (in this case: style)
    returns list
"""
def gen_features(df):
    n_features = []
    for col in df['style']:
        n_features.append(col)
    return n_features

### Generate features set and sample set of:
    * Training Data
    * Test Data

In [8]:
# Training Data
n_train_samples = gen_samples(training_data)
n_train_features = gen_features(training_data)

# Test Data
n_test_samples = gen_samples(test_data)
n_test_features = gen_features(test_data)

#### Set X, y
    X: Samples
    y: Features

In [9]:
# Training Set
X_train = n_train_samples
y_train = n_train_features

# Testing Set
X_test = n_test_samples
y_test = n_test_features

### Model - Training & Testing

* Here I fit the model to a decision tree classifier
* I train the model on the training data, predicit the output and then evaluate the score

In [10]:
# Fit model to training data
model = tree.DecisionTreeClassifier()
model = model.fit(X_train, y_train)

In [11]:
# Generate prediction score from training data 
train_predictions = model.predict(X_train)
# print(train_predictions)
train_score = model.score(X_train, y_train)
print(f"Training Score: {train_score}%")

Training Score: 1.0%


In [12]:
# Generate prediction score from testing data
test_predictions = model.predict(X_test)
# print(test_predictions)
test_score = model.score(X_test, y_test)
print(f"Test Score: {test_score}%")

Test Score: 0.8333333333333334%


In [44]:
# Accuracy Score
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_train[:30], y_test)
print(f"Accuracy: {acc}%")

Accuracy: 0.3333333333333333%


In [43]:
# Cross Validation - Train
cross_validate(model, X_train, y_train)

{'fit_time': array([0.00066519, 0.00053811, 0.00051332, 0.0005393 , 0.0005374 ]),
 'score_time': array([0.00025368, 0.00023007, 0.00023079, 0.00023198, 0.00023174]),
 'test_score': array([1.  , 0.92, 0.8 , 0.92, 0.75])}

In [41]:
# Cross Validation - Test
cross_validate(model, X_test, y_test)

{'fit_time': array([0.0017004 , 0.00038791, 0.00028849, 0.00026083, 0.00026608]),
 'score_time': array([0.00101233, 0.00020933, 0.00020099, 0.00026131, 0.00030065]),
 'test_score': array([0.83333333, 1.        , 0.83333333, 0.83333333, 0.83333333])}

In [None]:
tree.plot_tree(model)