# Introduction to Data Science - Week 5 Tasks

In [None]:
#Data and plotting imports
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

#Machine learning imports
from sklearn.datasets import make_blobs
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import plot_tree
!git clone https://github.com/Louismac/NLP-Public
%cd NLP-Public

In [None]:
#https://hackernoon.com/how-to-plot-a-decision-boundary-for-machine-learning-algorithms-in-python-3o1n3w07
#A function to plot decision boundaries
def plot_decision(X, y, model, n_classes=2, step=0.1, fit=True):
    """Draw a classifier's decision regions with the data points on top.

    A regular grid is laid over the bounding box of ``X`` (padded by 1 on
    every side), the model predicts a class for every grid point, and the
    predictions are drawn as filled contours on the current matplotlib axes.
    The samples of each class ``0 .. n_classes - 1`` are then scattered on top.

    Parameters
    ----------
    X : array-like of shape (n_samples, 2)
        The two features to plot (first column on x, second on y).
    y : array-like of shape (n_samples,)
        Class label per sample; labels are compared against
        ``range(n_classes)``.
    model : object with ``fit(X, y)`` and ``predict(X)``
        The classifier to visualise.
    n_classes : int, default 2
        Number of classes to scatter.
    step : float, default 0.1
        Grid spacing. Use a smaller value for features with a narrow range,
        otherwise the boundary is drawn from very few grid columns/rows.
    fit : bool, default True
        When True the model is fitted on ``X, y`` before predicting, which
        overwrites any previous fit of ``model``. Pass False to plot an
        already-fitted model unchanged.

    Raises
    ------
    ValueError
        If ``X`` is not two-dimensional with exactly two columns, or if
        ``step`` is not positive.
    """
    X = np.asarray(X)
    if X.ndim != 2 or X.shape[1] != 2:
        raise ValueError("plot_decision needs X with exactly two feature columns")
    if step <= 0:
        raise ValueError("step must be positive")
    labels = np.asarray(y)

    # Bounding box of the data, padded so points do not sit on the plot edge
    min1, max1 = X[:, 0].min() - 1, X[:, 0].max() + 1
    min2, max2 = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(min1, max1, step), np.arange(min2, max2, step))
    # One row per grid point: (feature 1, feature 2)
    grid = np.column_stack((xx.ravel(), yy.ravel()))

    if fit:
        model.fit(X, y)
    zz = np.asarray(model.predict(grid)).reshape(xx.shape)
    plt.contourf(xx, yy, zz, cmap='binary_r')

    for class_value in range(n_classes):
        members = labels == class_value
        # No cmap here: without `c` matplotlib ignores it and emits a warning;
        # each scatter call gets the next colour of the default cycle instead
        plt.scatter(X[members, 0], X[members, 1])

# KNN

## Experimenting

The code below allows you to generate some data classes and plot a decision boundary 

1. Try different values of **k**


2. Try moving around the ``centres`` and changing the ``standard deviation`` values. How does changing k work with different datasets?


3. Add some more classes so it's not a binary classification problem. You can do this by adding more sets of coordinates to the ``centres`` array, e.g.


```
centres = [
    #x and y for class 1
    [1,1],
    #x and y for class 2
    [3,3],
    #x and y for class 3
    [4,5]
]
```


In [None]:
# Cluster centres: one [x, y] pair per class
centres = [
    [1, 1],  # class 1
    [3, 3],  # class 2
]
# Spread of each cluster around its centre (used as the cluster standard deviation)
standard_deviation = 1.4
# How many nearest neighbours the KNN classifier consults
k = 5

In [None]:
# Sample 500 points around the chosen centres (fixed seed, so reruns match)
x, y = make_blobs(
    n_samples=500,
    centers=centres,
    random_state=1,
    cluster_std=standard_deviation,
)
# Draw the KNN decision regions with the samples on top
plt.figure(figsize=(12, 8))
plot_decision(x, y, KNeighborsClassifier(n_neighbors=k), len(centres))

# Decision Trees 

## Experimenting

The code below allows you to generate some data classes and plot a decision boundary. Compare the behaviour against **KNN**.

1. Try different values of **max_depth**


2. Try moving around the ``centres`` and changing the ``standard deviation`` values. How does changing max_depth work with different datasets?


3. Add some more classes so it's not a binary classification problem. You can do this by adding more sets of coordinates to the ``centres`` array.

In [None]:
# Cluster centres: one [x, y] pair per class
centres = [
    [1, 1],  # class 1
    [3, 3],  # class 2
]
# Spread of each cluster around its centre (used as the cluster standard deviation)
standard_deviation = 1.4
# Maximum depth the decision tree is allowed to grow to
max_depth = 2

In [None]:
# Generate 500 sample points around the chosen centres (fixed seed, so reruns match)
x, y = make_blobs(n_samples=500, centers=centres, random_state=1, cluster_std=standard_deviation)
# Plot the decision boundary of a tree limited to the configured depth.
# Uses the `max_depth` chosen in the configuration cell rather than a hard-coded
# value, so changing that setting actually changes the plot.
plt.figure(figsize=(12,8))
plot_decision(x, y, DecisionTreeClassifier(max_depth=max_depth), len(centres))

# Wine Dataset 

Now we're going to load in the wine dataset (from https://www.kaggle.com/rajyellow46/wine-quality).


In [None]:
# Load the wine table, discarding every row that has a missing value
df = pd.read_csv("data/winequalityN.csv").dropna()
# Encode the wine type numerically: white -> 0, red -> 1
df.loc[df["type"] == "white", "type"] = 0
df.loc[df["type"] == "red", "type"] = 1
# Balance the classes: once sorted by type the whites (0) come first and the
# reds (1) last, so take 1500 rows from each end of the sorted table.
# (Assumes there are at least 1500 wines of each type; not checked here.)
by_type = df.sort_values("type")
w = by_type[0:1500]
r = by_type[-1500:]
df = pd.concat([w, r])
df.columns

### Try different features 

1. Try combinations of two features from the 12 above. Which is the most accurate? How does that affect the decision boundary?


2. Try different max_depths

In [None]:
#Pick two features
# The two columns used as model inputs
feature1, feature2 = "chlorides", "fixed acidity"
# Maximum depth allowed for the decision tree
max_depth = 3

In [None]:
# Build the inputs (two chosen features) and the numeric class labels
x = df[[feature1, feature2]].values
y = pd.to_numeric(df["type"])
# Hold out 30% of the rows for testing (fixed seed, so the split repeats)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=0
)
# Fit a depth-limited tree on the training rows only
model = DecisionTreeClassifier(max_depth=max_depth)
model.fit(x_train, y_train)
# Score on the held-out rows: percentage of test labels predicted correctly
y_pred = model.predict(x_test)
total = y_test.shape[0]
num_incorrect = (y_test != y_pred).sum()
acc = (total - num_incorrect) / total * 100
print("Accuracy:", acc)

In [None]:
plt.figure(figsize=(12, 8))
# Clip both axes to the observed range of the corresponding feature
plt.xlim(x[:, 0].min(), x[:, 0].max())
plt.ylim(x[:, 1].min(), x[:, 1].max())
# Note: plot_decision fits `model` on the x, y passed in before drawing
plot_decision(x, y, model, 2)

# Inspecting the tree

See what the decision tree for your best feature pair looks like

1. Look through the choices the tree makes. You may have to alter the `figsize` and `fontsize` for deeper trees.


2. Look at the splits the tree decided to make. Can you see how it has balanced both large splits and low Gini impurity scores?

In [None]:
# Maximum tree depth to experiment with.
# NOTE(review): the next cell plots the `model` that was fitted earlier and does
# not refit it, so changing this value alone does not change the plotted tree.
# Confirm whether a refit was intended here.
max_depth = 3

In [None]:
# Draw the fitted tree, labelling splits with the two chosen feature names
plt.figure(figsize=(15, 8))
my_tree = plot_tree(
    model,
    feature_names=[feature1, feature2],
    class_names=["white", "red"],
    fontsize=11,
)

## Using more than 2 features 

Try using the whole feature set, inspect the tree and see how it favours some features over others 


In [None]:
# Maximum depth for the decision tree trained on the full feature set in the next cell
max_depth = 3

In [None]:
# Use every column except the class label as a model input
x_labels = list(df.columns)
x_labels.remove("type")
x = df[x_labels]
# Hold out 30% of the rows for testing, then fit the depth-limited tree
# (`y` is the label series built in the earlier training cell)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=0
)
model = DecisionTreeClassifier(max_depth=max_depth)
model.fit(x_train, y_train)
# Score on the held-out rows: percentage of test labels predicted correctly
y_pred = model.predict(x_test)
total = y_test.shape[0]
num_incorrect = (y_test != y_pred).sum()
acc = (total - num_incorrect) / total * 100
print("Accuracy:", acc)
# Draw the fitted tree with the full list of feature names
plt.figure(figsize=(15, 8))
my_plot = plot_tree(
    model,
    feature_names=x_labels,
    class_names=["white", "red"],
    fontsize=11,
)

# Try with your own data!

1. Pick some data you have found, or find some more! A place you might want to try is [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/index.php) or [Kaggle](https://www.kaggle.com/datasets)


2. Pick at least one categorical variable to be the class (`y`). 

    * You can make a continuous variable categorical using a threshold like below.  For example, here we make a new column called **college** which is **0 if less than 10 years in education** and **1 if 10 or more years in education**
    
    ```
    split = 10
    df.loc[(df["education.num"]<split), 'college'] = 0
    df.loc[(df["education.num"]>=split), 'college'] = 1
    ```


3. Try some different models (you can even try a [RandomForestClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html)).