# MATH2319/ MATH2387 Machine Learning
## Take-Home Assessment
### Galen Ralph Herten-Crabb 3955778 
### Question 1

In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import io
import requests
import math
import statistics


pd.set_option('display.max_columns', None) 

### Part A
#### Prepare dataset for KNN modeling
#### Step 1: One Hot Encoding

After loading the data the test observation is inserted at the bottom of the data frame to encode and scale with the rest of the data, it will be removed again later.

The code that is run is designed to change the categorical values in the dataset into numerical values, accross an array where each value has it's own column and records are distinguished by either a 1 or a 0 where relevant.

Once transformed in this way the old columns of categorical values are dropped and the new columns are labeled and then rearranged to be easier to manage.  

In [2]:
df_THA = pd.read_csv('THA_diamonds.csv')

In [3]:
my_observaton = pd.Series(['Good', 'D', 60, 'premium', ''], index=df_THA.columns)

In [4]:
df_THA = df_THA.append(my_observaton,ignore_index=True)


In [5]:
from sklearn.preprocessing import OneHotEncoder #Perform one-hot encoding of the categorical descriptive features in the input dataset.

encoder = OneHotEncoder(handle_unknown='ignore')
encoder_df = pd.DataFrame(encoder.fit_transform(df_THA[['cut', "color", "price"]]).toarray())
one_hot_df = df_THA.join(encoder_df)
one_hot_df = one_hot_df.drop(["cut", "color", "price"], axis = 1)
one_hot_df.columns = ["depth", "carat", "cut_fair", "cut_good", "color_D", "color_F", "color_I", "price_high", "price_low", "price_medium", "price_premium"]
one_hot_df = one_hot_df[["depth", "cut_fair", "cut_good", "color_D", "color_F", "color_I", "price_high", "price_low", "price_medium", "price_premium", "carat"]]


#### Step 2: scale descriptive features to be between 0 and 1

In this step, now that all the descriptive features are numeric, the code scales the values down to numbers between 0 and 1 using the MinMaxScaler. This has little effect on the categorical values that are already 1s and 0s but it makes the 'Depth' column more manageable. Obviously the 'Carat' column remains untouched.

In [6]:
from sklearn import preprocessing 

x = one_hot_df.iloc[:, 0:10].values 
min_max_scaler = preprocessing.MinMaxScaler(feature_range =(0, 1)) 
one_hot_df[one_hot_df.columns[0:9]] = min_max_scaler.fit_transform(one_hot_df[one_hot_df.columns[0:9]])
df = one_hot_df

#### Step 3: Display the last 10 rows after one-hot encoding and scaling.

In [7]:
df.tail(10).style.set_precision(3)

Unnamed: 0,depth,cut_fair,cut_good,color_D,color_F,color_I,price_high,price_low,price_medium,price_premium,carat
203,0.201,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.91
204,0.776,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.9
205,0.321,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.92
206,0.858,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.91
207,0.627,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.96
208,0.164,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.9
209,0.701,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.9
210,0.216,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.93
211,0.269,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.9
212,0.351,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,


### Part B
#### Solution

A note on the strategy employed. This solution has three major steps
1. Calculate the Eucidian distances between all data points in the dataset and the test observation
2. Find the nearest k neighbor by sorting these distances, in this case in a data frame, from nearest to farthest.
3. Calculate the mean of k neighbors to predict target ('Carat')

Firstly the target row is extracted from the dataset and saved as a pandas data frame, that row is then dropped from the training frame.


In [8]:
target = df.iloc[212] #save target row 
test = pd.DataFrame(target[['depth', 'cut_fair', 'cut_good', 'color_D', 'color_F', 'color_I', 'price_high', 'price_low', 'price_medium', 'price_premium']])

In [9]:
df_train = df.drop(212)#drop target row from df

A function is created to calculate the Euclidian distance from any variable, or point, to our target (now labeled 'test'). This fuction is designed to apply the Euclidian distance formula to each value in a given subset.

In [10]:
def euc_distance(x):   # calculating euclidian distance   
    a = x.to_numpy()
    b = test.to_numpy()    
    distance = math.sqrt(sum([(a - b) ** 2 for a, b in zip(a, b[0])]))
    return distance

Here the above function is applied to each record and stored in the data frame as a new column labeled 'distance', this completes the first step in the strategy.

The dataframe is then sorted by ascending distance and the index is reset for ease of row location. This completes the second step, the data frame will be displayed below as 'df_distances'

In [11]:
df_train['distance'] = df_train[['depth', 'cut_fair', 'cut_good', 'color_D', 'color_F', 'color_I', 'price_high', 'price_low', 'price_medium', 'price_premium']].apply(euc_distance, axis=1)
df_sorted = df_train.sort_values('distance',ascending=True)
df_distances = df_sorted.reset_index()
df_distances.style.set_precision(3)


Unnamed: 0,index,depth,cut_fair,cut_good,color_D,color_F,color_I,price_high,price_low,price_medium,price_premium,carat,distance
0,74,0.351,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.57,0.0
1,105,0.358,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.72,0.007
2,28,0.358,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.51,0.007
3,198,0.343,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.7,0.007
4,135,0.358,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.7,0.007
5,164,0.373,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.77,0.022
6,22,0.328,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.5,0.022
7,79,0.321,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.32,0.03
8,2,0.381,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.5,0.03
9,205,0.321,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.92,0.03


Now the prediction can be extracted by calculating the mean of k neighbours, which in this case is the mean of values contained within however many coloumns are determined. k = the number of columns ascending from 0, each column in this order representing a progressively further neighbour. 

To answer k=1 the code below is used (no mean is calculated at this stage as it is a single number, but for higher values of k the mean must be calculated for the prediction to be accurate)

In [12]:
from statistics import mean
k1 = (df_distances["carat"].iloc[0])
k1

0.57

### Part C

Now that k = 5 the mean is calculated

In [13]:
k5 = statistics.mean(df_distances["carat"].iloc[0:4])
k5

0.625

### Part D

k = 10

In [14]:
k10 = statistics.mean(df_distances["carat"].iloc[0:9])
k10

0.5877777777777777

### Part E

To compare the manual calculations above to the library knn regressor tool found in scikit-learn it most be first imported and the data prepared into two groups, namely train_x (the descriptive features) and train_y (the target feature). Then the test observation is converted into a numpy array and reshaped to match the training data to facilitate fitting the regressor function.
The regressor function is defined for each prediction with the value of k and the value p (p=2 referencing the euclidiean distance). The training data is fitted to the regressor and the predict function called to provide the prediction. The results for each value of k are below. 

In [15]:
from sklearn.neighbors import KNeighborsRegressor
train_x = df_train[['depth', 'cut_fair', 'cut_good', 'color_D', 'color_F', 'color_I', 'price_high', 'price_low', 'price_medium', 'price_premium']]
train_y = df_train[['carat']]


In [16]:
point = np.array(test)
test_point = point.reshape(1,-1)

#### Result for k = 1

In [17]:
KNR1 = KNeighborsRegressor(n_neighbors = 1, p = 2)
KNR1.fit(train_x, train_y)
KNR1.predict(test_point)

array([[0.7]], dtype=object)

#### Result for k=5

In [18]:
KNR5 = KNeighborsRegressor(n_neighbors = 5, p = 2)
KNR5.fit(train_x, train_y)
KNR5.predict(test_point)

array([[0.6940000000000001]], dtype=object)

#### Result for k=10

In [19]:
KNR10 = KNeighborsRegressor(n_neighbors = 10, p = 2)
KNR10.fit(train_x, train_y)
KNR10.predict(test_point)

array([[0.666]], dtype=object)

There is a descrepancy between the two methods, they did not produce the same results. The KNeighborsRegressor() was more accurate than the manual calculations performed, although at k=5 they were very close. The only reason this could be the case is if the manual calculations are wrong or perhaps a stage in the data prep e.g. the ordering process was different to how the KNNregressor function determines or records distance, by default the KNNregressor utilises minkowski distance although the code clearly requests that Euclidian is used, perhaps it was entered incorrectly.  

### Part F: Wrap-up

In [20]:
data = {'method': ['KNN1', 'KNN5', 'KNN10'],
        'prediction': [0.57, 0.625, 0.587],
        'is_best': ['False', 'False', 'False']}
df_summary_manual = pd.DataFrame(data)

df_summary_manual

Unnamed: 0,method,prediction,is_best
0,KNN1,0.57,False
1,KNN5,0.625,False
2,KNN10,0.587,False


In [21]:
sk_data = {'method': ['KNN1', 'KNN5', 'KNN10'],
        'prediction': [0.700, 0.694, 0.666],
        'is_best': ['True', 'True', 'True']}
df_summary_sklearn = pd.DataFrame(sk_data)

df_summary_sklearn

Unnamed: 0,method,prediction,is_best
0,KNN1,0.7,True
1,KNN5,0.694,True
2,KNN10,0.666,True
