# T-Shirt Sizing Classification

Suppose we have the heights, weights and T-shirt sizes of our customers. Using the K-Nearest Neighbours algorithm, let's see if we can predict the T-shirt size of a new customer, given only their height and weight.

Data Source: https://www.listendata.com/2017/12/k-nearest-neighbor-step-by-step-tutorial.html

# Importing the libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Importing the dataset

In [None]:
# Load the customer records from the CSV file into a DataFrame.
ds = pd.read_csv(filepath_or_buffer='Data.csv')

In [None]:
# Preview the first five rows of the dataset.
ds.head(n=5)

In [None]:
# Number of rows (customers) in the dataset.
ds.shape[0]

# Taking care of missing data

In [None]:
# Draw the missing-value mask as a heatmap: one cell per (row, column) entry.
# We observe no missing data.

sns.heatmap(data=ds.isna(), cmap='Blues', cbar=False, yticklabels=False)

In [None]:
# Every column except the last is a feature; the last column is the target.
X = ds.iloc[:, :-1].to_numpy()
y = ds.iloc[:, -1].to_numpy()

# Visualising the dataset

In [None]:
# Pairwise plots of the two measurements, coloured by T-shirt size.
sns.pairplot(
    ds,
    vars=['Height (in cms)', 'Weight (in kgs)'],
    hue='T Shirt Size',
)

In [None]:
# Height against weight, one point per customer, coloured by T-shirt size.
sns.scatterplot(
    data=ds,
    x='Height (in cms)',
    y='Weight (in kgs)',
    hue='T Shirt Size',
)
plt.show()

# Encoding Categorical Variables

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the set of size labels first, then replace each label by its integer code.
le = LabelEncoder().fit(y)
y = le.transform(y)

In [None]:
# Display the encoded target vector produced by the LabelEncoder above.
# We observe 'Small' being assigned the number 1 and 'Large' being assigned the number 0.
# NOTE(review): the label-to-code mapping depends on the labels present in the data;
# `le.classes_` lists the original labels in code order - confirm against it.

y

# Splitting the dataset into the training set and test set.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training features only, then apply the
# same transformation to both the training and the test features.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# Fitting the K-Nearest Neighbours classifier to the training set

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 5 nearest neighbours under the Minkowski metric with p = 2 (Euclidean distance).
knn = KNeighborsClassifier(metric='minkowski', p=2, n_neighbors=5)
knn.fit(X=X_train, y=y_train)

In [None]:
# Predict the encoded T-shirt size for every row of the (scaled) test set.

y_pred = knn.predict(X=X_test)

# Model Evaluation - Confusion Matrix and K-Fold Cross validation

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Compare the test-set predictions with the true labels
# (rows: true class, columns: predicted class).
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

In [None]:
# Per-class precision, recall and F1 on the test set.
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Estimate accuracy with cross-validation on the training set.
#
# X_train was standardised earlier using statistics from *all* training rows,
# so cross-validating `knn` on it directly would let each validation fold
# influence the scaling of its own training fold (data leakage). Putting a
# scaler in front of the classifier makes every fold re-fit the scaler on its
# training part only. Re-standardising already-standardised features is
# harmless: the result does not depend on an earlier per-feature shift/scale.
#
# cross_val_score works on clones of the estimator, so the already fitted
# `knn` used for the test-set predictions is left untouched.
cv_model = make_pipeline(StandardScaler(), knn)

# cv = 2 is kept from the original analysis.
accuracies = cross_val_score(estimator=cv_model, X=X_train, y=y_train, cv=2)
mean_accuracy = accuracies.mean()
std_accuracy = accuracies.std()

In [None]:
# Mean and standard deviation of the cross-validation accuracies, one per line.
print(mean_accuracy, std_accuracy, sep='\n')

# Visualising the Training Set

In [None]:
from matplotlib.colors import ListedColormap

# Decision regions of the fitted classifier with the training points on top.
# Both axes are in standardised units because X_train has been scaled.
X_set, y_set = X_train, y_train
class_cmap = ListedColormap(('red', 'green'))

# Dense grid spanning the data range plus a margin of 1 on every side.
X1, X2 = np.meshgrid(
    np.arange(start=X_set[:, 0].min() - 1, stop=X_set[:, 0].max() + 1, step=0.01),
    np.arange(start=X_set[:, 1].min() - 1, stop=X_set[:, 1].max() + 1, step=0.01),
)
grid_points = np.column_stack([X1.ravel(), X2.ravel()])
grid_pred = knn.predict(grid_points).reshape(X1.shape)

plt.contourf(X1, X2, grid_pred, alpha=0.75, cmap=class_cmap)
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())

for j in np.unique(y_set):
    in_class = y_set == j
    plt.scatter(
        X_set[in_class, 0],
        X_set[in_class, 1],
        # `color=` rather than `c=`: a bare RGBA tuple passed as `c` is
        # ambiguous and is colour-mapped as four values when the class happens
        # to contain exactly four points.
        # Index the colour by the class code itself (not by loop position) so
        # the points keep the colour of their class even if a class is absent.
        color=class_cmap(int(j)),
        # Show the original size label in the legend instead of its code.
        label=le.inverse_transform([j])[0],
    )

plt.title('Training dataset')
plt.xlabel('X1')
plt.ylabel('X2')
plt.legend()
plt.show()

# Visualising the Test Set

In [None]:
# Imported here as well so this cell does not depend on the training-set plot
# cell having been run first.
from matplotlib.colors import ListedColormap

# Decision regions of the fitted classifier with the test points on top.
# Both axes are in standardised units because X_test has been scaled.
X_set, y_set = X_test, y_test
class_cmap = ListedColormap(('red', 'green'))

# Dense grid spanning the data range plus a margin of 1 on every side.
X1, X2 = np.meshgrid(
    np.arange(start=X_set[:, 0].min() - 1, stop=X_set[:, 0].max() + 1, step=0.01),
    np.arange(start=X_set[:, 1].min() - 1, stop=X_set[:, 1].max() + 1, step=0.01),
)
grid_points = np.column_stack([X1.ravel(), X2.ravel()])
grid_pred = knn.predict(grid_points).reshape(X1.shape)

plt.contourf(X1, X2, grid_pred, alpha=0.75, cmap=class_cmap)
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())

for j in np.unique(y_set):
    in_class = y_set == j
    plt.scatter(
        X_set[in_class, 0],
        X_set[in_class, 1],
        # `color=` rather than `c=`: a bare RGBA tuple passed as `c` is
        # ambiguous and is colour-mapped as four values when the class happens
        # to contain exactly four points - quite possible in a small test split.
        # Index the colour by the class code itself (not by loop position) so
        # the points keep the colour of their class even if the test split
        # contains only some of the classes.
        color=class_cmap(int(j)),
        # Show the original size label in the legend instead of its code.
        label=le.inverse_transform([j])[0],
    )

plt.title('Test dataset')
plt.xlabel('X1')
plt.ylabel('X2')
plt.legend()
plt.show()