In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np

# Handling Categorical Data

In [3]:
T_shirts = [
    ['green', 'M', 99.9, 'polo'],
    ['red', 'L', 199.9, 'sleeve'],
    ['blue', 'XL', 599.9, 'high_neck']
]
df = pd.DataFrame(T_shirts)
df.columns = ['color', 'size', 'price', 'tshirt_type']
df

Unnamed: 0,color,size,price,tshirt_type
0,green,M,99.9,polo
1,red,L,199.9,sleeve
2,blue,XL,599.9,high_neck


## handling labels

In [4]:
df.loc[df.tshirt_type == 'polo'] = 0
df.loc[df.tshirt_type == 'sleeve'] = 1
df.loc[df.tshirt_type == 'high_neck'] = 2

In [5]:
np.unique(df['tshirt_type'])

array([0, 1, 2], dtype=object)

In [6]:
for label in np.unique(df['tshirt_type']):
  print(label)

0
1
2


In [7]:
{label for label in np.unique(df['tshirt_type'])}

{0, 1, 2}

In [8]:
label_maping = {label:idx for idx,label in enumerate(np.unique(df['tshirt_type']))}

In [9]:
df['tshirt_type'] = df['tshirt_type'].map(label_maping)
df

Unnamed: 0,color,size,price,tshirt_type
0,0,0,0.0,0
1,1,1,1.0,1
2,2,2,2.0,2


In [10]:
df

Unnamed: 0,color,size,price,tshirt_type
0,0,0,0.0,0
1,1,1,1.0,1
2,2,2,2.0,2


In [11]:
df['tshirt_type'].values.shape

(3,)

In [12]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['tshirt_type'].values) #1-dim array
y

array([0, 1, 2])

## handling categorical features

### ordinal encoding

In [13]:
size_mapping = {feat:idx for idx,feat in enumerate(np.unique(df['size']))}
size_mapping

{0: 0, 1: 1, 2: 2}

In [14]:
df['size'] = df['size'].map(size_mapping)
df

Unnamed: 0,color,size,price,tshirt_type
0,0,0,0.0,0
1,1,1,1.0,1
2,2,2,2.0,2


In [15]:
df['size'].values

array([0, 1, 2])

In [16]:
df['class'] = ['classA', 'classB', 'classC']

In [17]:
df

Unnamed: 0,color,size,price,tshirt_type,class
0,0,0,0.0,0,classA
1,1,1,1.0,1,classB
2,2,2,2.0,2,classC


In [18]:
df[['size', 'class']].values.reshape(-1, 2)

array([[0, 'classA'],
       [1, 'classB'],
       [2, 'classC']], dtype=object)

In [19]:
from sklearn.preprocessing import OrdinalEncoder
ord_encoder = OrdinalEncoder()
df[['size', 'class']] = ord_encoder.fit_transform(df[['size', 'class']].values.reshape(-1, 2)) #error

In [20]:
df['size'].values.reshape(1, -1)

array([[0., 1., 2.]])

### nominal encoding

In [21]:
from sklearn.preprocessing import OneHotEncoder
one_hot_enc = OneHotEncoder(sparse=False)
one_hot_enc.fit_transform(df['color'].values.reshape(-1,1))

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

## column transformer

In [22]:
df

Unnamed: 0,color,size,price,tshirt_type,class
0,0,0.0,0.0,0,0.0
1,1,1.0,1.0,1,1.0
2,2,2.0,2.0,2,2.0


label encoding

In [23]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['tshirt_type'].values)
y

array([0, 1, 2])

encoding features

In [24]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
X = df[['color', 'size', 'price', 'class']].values

one_hot_encoder = OneHotEncoder(sparse=False)
ord_encoder = OrdinalEncoder()

cl_trns = ColumnTransformer([
    ('onehot', one_hot_encoder, [0]),
    ('ordenc', ord_encoder, [1,3]),
    ('nothing', 'passthrough', [2])
])
X_trns = cl_trns.fit_transform(X)
X_trns


array([[1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
       [0.0, 1.0, 0.0, 1.0, 1.0, 1.0],
       [0.0, 0.0, 1.0, 2.0, 2.0, 2.0]], dtype=object)

# Data Scaling
If you standardize them separately afterwards, your model will receive a slightly different range of values than what it was trained on, so it would output biased predictions.

In [25]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [26]:
df_wine = pd.read_csv('/content/drive/MyDrive/Introduction to Machine Learning/Notebooks/ch04/wine.data', header=None)
df_wine.columns = ['Class label', 'Alcohol', 'Malic acid', 'Ash',
                   'Alcalinity of ash', 'Magnesium', 'Total phenols',
                   'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',
                   'Color intensity', 'Hue', 'OD280/OD315 of diluted wines',
                   'Proline']
df_wine

Unnamed: 0,Class label,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.20,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050
2,1,13.16,2.36,2.67,18.6,101,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.50,16.8,113,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,3,13.71,5.65,2.45,20.5,95,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740
174,3,13.40,3.91,2.48,23.0,102,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750
175,3,13.27,4.28,2.26,20.0,120,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835
176,3,13.17,2.59,2.37,20.0,120,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840


In [27]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
X = df_wine.iloc[:, 1:]
y = df_wine.iloc[:, 0]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, stratify=y, random_state=0)

lr = LogisticRegression()
lr = lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
accuracy_score(y_pred, y_test)

0.9550561797752809

In [28]:
from sklearn.neighbors import KNeighborsClassifier

In [29]:
knn = KNeighborsClassifier()
knn = knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
accuracy_score(y_pred, y_test)

0.7640449438202247

In [30]:
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler() 
X_train_norm = mms.fit_transform(X_train) 
X_test_norm = mms.transform(X_test) 


knn = KNeighborsClassifier()
knn = knn.fit(X_train_norm, y_train)
y_pred = knn.predict(X_test_norm)
accuracy_score(y_pred, y_test)

0.9662921348314607

In [31]:
from sklearn.preprocessing import StandardScaler
stdsc = StandardScaler()
X_std = stdsc.fit_transform(X_train) 
X_test_std = stdsc.transform(X_test)

knn = KNeighborsClassifier()
knn = knn.fit(X_std, y_train)
y_pred = knn.predict(X_test_std)
accuracy_score(y_pred, y_test)

0.9662921348314607

# Handling missing data

In [32]:
#..

In [33]:
from io import StringIO
import sys

csv_data = \
'''
A,B,C,D
1, 2, 3, 4
5, 6,, 4
10, 11, 12,
'''

df = pd.read_csv(StringIO(csv_data))
df

Unnamed: 0,A,B,C,D
0,1,2,3.0,4.0
1,5,6,,4.0
2,10,11,12.0,


In [34]:
df.isnull()

Unnamed: 0,A,B,C,D
0,False,False,False,False
1,False,False,True,False
2,False,False,False,True


In [35]:
df.isnull().sum()

A    0
B    0
C    1
D    1
dtype: int64

In [36]:
df.dropna(axis=0)

Unnamed: 0,A,B,C,D
0,1,2,3.0,4.0


In [37]:
df

Unnamed: 0,A,B,C,D
0,1,2,3.0,4.0
1,5,6,,4.0
2,10,11,12.0,


In [38]:
df.dropna(axis=1)

Unnamed: 0,A,B
0,1,2
1,5,6
2,10,11


In [39]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [None]:
df.dropna(subset=[' C', ' D'], inplace=True)

In [None]:
df.drop(columns=['C'])

# Imputing missing values

In [41]:
from sklearn.impute import SimpleImputer
simple_imp = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=50) # 'mean', 'median', 'most_frequent', 'constant'
simple_imp = simple_imp.fit(df.values)
X_imputed = simple_imp.transform(df.values)
X_imputed

array([[ 1.,  2.,  3.,  4.],
       [ 5.,  6., 50.,  4.],
       [10., 11., 12., 50.]])

In [None]:
df.fillna(0)

In [None]:
mean_cols = df.mean()
df.fillna(mean_cols)

In [None]:
median_cols = df.mode()
df.fillna(median_cols)

In [None]:
max_cols = df.max()
df.fillna(max_cols)

In [None]:
min_cols = df.max()
df.fillna(min_cols)

# Principal Component Analysis

In [42]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_std, y_train)
y_pred = knn.predict(X_test_std)
accuracy_score(y_pred, y_test)

0.9662921348314607

In [47]:
X_std.shape

(89, 13)

In [59]:
from sklearn.decomposition import PCA
pca = PCA(n_components=7)
X_train_pca = pca.fit_transform(X_std)
X_test_pca = pca.transform(X_test_std)

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_pca, y_train)
y_pred = knn.predict(X_test_pca)
accuracy_score(y_pred, y_test)

0.9887640449438202

# Feature Selection