In [2]:
# Importing the necessary packages
import numpy as np                                  # "Scientific computing"


import pandas as pd                                 # Data Frame

import matplotlib.pyplot as plt                     # Basic visualisation

from numpy import mean
from numpy import std
from numpy import absolute
from pandas import read_csv
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

In [3]:
# Read the clothes-size dataset straight from the course repository.
# The path uses a single slash between "datasets" and the file name; the
# previous "datasets//" contained an empty path segment and only worked
# because the server tolerated it.
df = pd.read_csv('https://raw.githubusercontent.com/HOGENT-ML/course/main/datasets/clothes_size_prediction.csv')
df.head()

Unnamed: 0,weight,age,height,size
0,62,28.0,172.72,XL
1,59,36.0,167.64,L
2,61,34.0,165.1,M
3,65,27.0,175.26,L
4,62,45.0,172.72,M


## Take a look at the dataset

We'll try to predict the size based on the weight, age and height.   
  
Show some general info about the dataset

In [4]:
# Summarise dtypes and non-null counts per column to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119734 entries, 0 to 119733
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   weight  119734 non-null  int64  
 1   age     119477 non-null  float64
 2   height  119404 non-null  float64
 3   size    119734 non-null  object 
dtypes: float64(2), int64(1), object(1)
memory usage: 3.7+ MB


What is the number of records for each size?  

M: 29712  
S: 21924  
XXXL: 21359  
XL: 19119  
L: 17587  
XXS: 9964  
XXL: 69

In [5]:
# Number of records per clothing size, most frequent first.
df['size'].value_counts()

size
M       29712
S       21924
XXXL    21359
XL      19119
L       17587
XXS      9964
XXL        69
Name: count, dtype: int64

Because there are only very few records for XXL, remove those records from the dataset

In [6]:
# Keep every row whose size is anything other than the rare 'XXL' class.
df = df.loc[df['size'].ne('XXL')]

Train a transformer to fill in the median value of the corresponding attribute for all missing values.

In [7]:
from sklearn.impute import SimpleImputer

# Numeric feature columns that may contain missing values.
df_num = df[['weight','age','height']]

# Learn the per-column medians; fit() returns the fitted imputer itself.
# NOTE(review): the medians are learned on the full dataset, i.e. before the
# train/test split further down, so test rows influence the imputed values.
imputer = SimpleImputer(strategy="median").fit(df_num)
print(imputer.statistics_)

[ 61.   32.  165.1]


Apply the imputer to the dataset and check the results.

In [8]:
# Fill the missing numeric values with the medians learned by the imputer.
df_num_tr = imputer.transform(df_num)

# Wrap the transformed values in a DataFrame again, keeping the original
# row labels and column names so it lines up with the rest of df.
df_num = pd.DataFrame(df_num_tr, index=df_num.index, columns=df_num.columns)

# Put the imputed numeric columns in front of the remaining columns.
df = pd.concat([df_num, df.drop(columns=['weight','age','height'])], axis=1)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 119665 entries, 0 to 119733
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   weight  119665 non-null  float64
 1   age     119665 non-null  float64
 2   height  119665 non-null  float64
 3   size    119665 non-null  object 
dtypes: float64(3), object(1)
memory usage: 4.6+ MB


At first sight this seems quite a large dataset, but is this actually true?  
First we are going to change the datatype of height from float to integer.


In [9]:
# Cast height to integer. astype(int) drops the fractional part rather than
# rounding (e.g. 172.72 becomes 172 in the outputs below).
df['height'] = df['height'].astype(int)

It seems reasonable to round the ages to the nearest multiple of five.

In [10]:
# Bucket ages: divide by 5, round to a whole number, scale back up by 5.
df['age'] = (df['age'] / 5).round() * 5

Change the datatype of age from float to integer.

In [11]:
# The bucketed ages are whole numbers stored as floats; cast them to int.
df['age'] = df['age'].astype(int)

We drop duplicate rows in the dataset.

In [12]:
# Remove rows that are identical in every column; the first occurrence of
# each duplicated row is the one that is kept (the default behaviour).
df = df.drop_duplicates()

How many records are left?

In [13]:
# Check how many rows remain after removing exact duplicates.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11330 entries, 0 to 119721
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   weight  11330 non-null  float64
 1   age     11330 non-null  int64  
 2   height  11330 non-null  int64  
 3   size    11330 non-null  object 
dtypes: float64(1), int64(2), object(1)
memory usage: 442.6+ KB


We want to know if there are any 'wrong duplicates' in the dataset, i.e. records with the same values for weight, age and height, but a different size. So we count the number of unique sizes for each combination of weight, age and height.

In [14]:
# For every (weight, age, height) combination, count how many distinct
# sizes occur. NOTE(review): the name `help` shadows Python's builtin
# help(); it is kept because the following cell reads it.
help = (
    df.groupby(['weight', 'age', 'height'])
      .nunique()
      .reset_index()
)
help.head()

Unnamed: 0,weight,age,height,size
0,22.0,30,167,2
1,22.0,45,152,1
2,26.0,45,172,1
3,31.0,35,175,1
4,35.0,20,182,1


We want to know how many combinations of weight, age and height occur with more than one value for size.

In [15]:
# Count the combinations whose number of distinct sizes is not exactly 1.
help.loc[help['size'].ne(1)].count()

weight    2726
age       2726
height    2726
size      2726
dtype: int64

We decide to keep only the first record for each combination of weight, age and height, and to remove the others.

In [16]:
# Keep a single row per (weight, age, height) combination; the first
# occurrence wins (the default), later rows with another size are dropped.
feature_cols = ['weight', 'age', 'height']
df = df.drop_duplicates(subset=feature_cols)

How many records are left?

In [17]:
# Check how many rows remain after keeping one row per feature combination.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5159 entries, 0 to 119682
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   weight  5159 non-null   float64
 1   age     5159 non-null   int64  
 2   height  5159 non-null   int64  
 3   size    5159 non-null   object 
dtypes: float64(1), int64(2), object(1)
memory usage: 201.5+ KB


Check if the dataset is heavily skewed.

In [18]:
# Class distribution after cleaning, to judge how imbalanced the target is.
df['size'].value_counts()

size
XXXL    2326
M        689
XL       659
S        624
L        473
XXS      388
Name: count, dtype: int64

Because we want to apply regression first, map the sizes to numbers as follows:  
'XXS' : 0, 'S' : 1, 'M': 2, 'L': 3,'XL':4,'XXXL': 5

In [19]:
# Sizes listed from smallest to largest; the position in this list is the
# numeric code used as regression target ('XXS' -> 0, ..., 'XXXL' -> 5).
size_order = ['XXS', 'S', 'M', 'L', 'XL', 'XXXL']
mapping_dict = {label: code for code, label in enumerate(size_order)}

# NOTE: values that are not keys of mapping_dict become NaN, so this cell
# must only be run once on the original string labels.
df['size'] = df['size'].map(mapping_dict)
df.head()

Unnamed: 0,weight,age,height,size
0,62.0,30,172,4
1,59.0,35,167,3
2,61.0,35,165,2
3,65.0,25,175,3
4,62.0,45,172,2


What is X and what is y?

In [20]:
# Target: the numerically encoded size. Features: every other column.
y = df['size']
X = df.drop(columns=['size'])

What is X_train, y_train, X_test, y_test?

In [21]:
from sklearn.model_selection import train_test_split

# Hold out part of the data for testing; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

What is the shape of X_train, y_train, X_test and y_test?

In [22]:
# Print the shapes in the order: X_train, X_test, y_train, y_test.
split_parts = (X_train, X_test, y_train, y_test)
print(*(part.shape for part in split_parts))

(3869, 3) (1290, 3) (3869,) (1290,)


Which columns of X contain text?

In [23]:
# Names of the feature columns stored with the generic object dtype.
categorical_ix = X.select_dtypes(include='object').columns
print(categorical_ix)

Index([], dtype='object')


Which columns of X contain numbers?

In [24]:
# Names of the feature columns with a numeric dtype ('number' selects the
# same columns as np.number).
numerical_ix = X.select_dtypes(include='number').columns
print(numerical_ix)

Index(['weight', 'age', 'height'], dtype='object')


Define the ColumnTransformer for applying Standard Scaling on all numeric columns.  

In [25]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Standard-scale the numeric feature columns only. The column list is the
# numerical_ix index computed in the previous cell rather than X.columns, so
# the transformer keeps matching its description ("all numeric columns")
# even if X ever gains a non-numeric column. For the current X both
# selections contain the same three columns.
col_transform = ColumnTransformer(
    transformers=[('std_scaler', StandardScaler(), numerical_ix)]
)

## Regression

Define the model LinearRegression  

In [26]:
from sklearn.linear_model import LinearRegression
# Ordinary least-squares regressor with default settings; it is fitted later
# as the final step of the pipeline.
lin_reg = LinearRegression()

Define the data preparation (= ColumnTransformer for standard scaling) and modeling pipeline

In [27]:
# Chain the preprocessing (scaling) and the linear model so that both are
# fitted and applied together.
pipeline = Pipeline(
    steps=[
        ('prep', col_transform),
        ('lin_reg', lin_reg),
    ]
)

Train the model

In [28]:
# Fit the scaler and the linear regression on the training data only.
pipeline.fit(X_train, y_train)

How well does the model perform (what is its prediction error)?  
Use K-fold cross-validation with k = 3.  
Find an appropriate value for the `scoring` parameter in [metrics and scoring](https://scikit-learn.org/stable/modules/model_evaluation.html).

In [29]:
# Display the training features for a quick visual inspection.
X_train

Unnamed: 0,weight,age,height
31668,61.0,65,167
8667,71.0,45,157
3943,48.0,25,162
408,61.0,35,170
967,53.0,35,157
...,...,...,...
69182,68.0,70,172
659,78.0,40,172
22279,72.0,60,165
40070,95.0,35,154


In [30]:
# The scorer reports the *negated* mean absolute error for each of the
# 3 folds; flip the sign back and average over the folds.
neg_mae_per_fold = cross_val_score(
    pipeline,
    X_train,
    y_train,
    scoring='neg_mean_absolute_error',
    cv=3,
)
np.mean(-neg_mae_per_fold)

np.float64(0.9103762789910395)

What are the values of the intercept and the coefficients?  
Why are there 3 coefficients?  
What is the most important coefficient?

In [31]:
# lin_reg is the estimator object that was placed in the pipeline, so these
# are the intercept and coefficients learned by pipeline.fit above.
lin_reg.intercept_, lin_reg.coef_

(np.float64(3.4166451279400363), array([1.33419981, 0.25076335, 0.0072536 ]))

Apply the model to the test set.  

In [32]:
# Predict the (numeric) size for the held-out test features.
y_test_predict = pipeline.predict(X_test)

Calculate the Mean Absolute Error and the Root Mean Squared Error

In [35]:
import math
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import mean_absolute_error

# scikit-learn metrics take their arguments as (y_true, y_pred). Both errors
# are symmetric, so the printed values do not change, but passing them in the
# documented order keeps the call correct if an asymmetric metric or
# sample weights are added later.
print(f"The root mean squared error is {root_mean_squared_error(y_test, y_test_predict)}")
print(f"The mean absolute error is {mean_absolute_error(y_test, y_test_predict)}")

The root mean squared error is 1.1079904464895924
The mean absolute error is 0.9251567530335617


Interpret the results. 

## Classification

Use the softmax classifier to try to predict the class (0, 1, 2, 3, 4, 5).  
What is the accuracy score?

In [36]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression classifier (referred to as the softmax classifier in
# the text) on top of the same scaling step used for the regression model.
softmax_reg = LogisticRegression(C=30, random_state=42)
pipeline = Pipeline(
    steps=[
        ('prep', col_transform),
        ('softmax_reg', softmax_reg),
    ]
)
pipeline.fit(X_train, y_train)

# Mean accuracy over 3 cross-validation folds of the training set.
cross_val_score(pipeline, X_train, y_train, scoring='accuracy', cv=3).mean()




np.float64(0.650301196969788)

Create and show the confusion matrix for the test set.

In [37]:
from sklearn.metrics import confusion_matrix

# Predicted classes for the held-out test set.
y_test_predict = pipeline.predict(X_test)

# Rows are the actual classes, columns the predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_test_predict)
print(cm)



[[ 46  35   5   0   0   1]
 [ 21  98  41   0   2   5]
 [  0  25 122   0  12   8]
 [  1   4  57   0  35  14]
 [  0   3  33   0  50  70]
 [  1   1   8   0  28 564]]


The accuracy of the classifier is low, but we see that it is often off by only one size (one size too large or one size too small). 
Calculate how many times 
* the classifier was correct
* the classifier predicted the size to be one size higher than the actual size
* the classifier predicted the size to be one size smaller than the actual size


In [38]:
# Main diagonal: predicted class equals the actual class.
print(np.trace(cm))
# First diagonal above the main one: predicted one class higher than actual.
print(np.trace(cm, offset=1))
# First diagonal below the main one: predicted one class lower than actual.
print(np.trace(cm, offset=-1))
# Total number of test samples, for reference.
print(len(y_test))


880
181
131
1290
