In [1]:
%config IPCompleter.greedy=True
import pandas as pd
from io import StringIO

In [2]:
# Small in-memory CSV with two gaps: column C is empty in the second data
# row and column D is empty in the third.
csv_data = '\n'.join([
    'A,B,C,D',
    '1.0,2.0,3.0,4.0',
    '5.0,6.0,,8.0',
    '10.0,11.0,12.0,',
])
# Wrap the text in a file-like object so read_csv can parse it directly
df = pd.read_csv(StringIO(csv_data))
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [3]:
# Count the missing (NaN) entries in each column of the dataframe
df.isna().sum()

A    0
B    0
C    1
D    1
dtype: int64

In [4]:
# Discard every row that contains at least one NaN
df.dropna(axis='index')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [5]:
# Discard every column that contains at least one NaN
df.dropna(axis='columns')

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,10.0,11.0


In [6]:
# Discard only the rows in which every single value is NaN
df.dropna(axis='index', how='all')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [7]:
# Keep only the rows that have at least 4 non-NaN values
df.dropna(axis='index', thresh=4)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [8]:
# Discard rows only when NaN appears in the listed columns (here: column C)
df.dropna(axis='index', subset=['C'])

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,10.0,11.0,12.0,


In [9]:
# Rather than discarding incomplete data, replace each NaN with the mean of
# its column (mean imputation)
from sklearn.impute import SimpleImputer
import numpy as np
# fit() returns the estimator itself, so construction and fitting are chained
imr = SimpleImputer(strategy='mean', missing_values=np.nan).fit(df.values)
imputed_data = imr.transform(df.values)
print(imputed_data)

[[ 1.   2.   3.   4. ]
 [ 5.   6.   7.5  8. ]
 [10.  11.  12.   6. ]]


In [10]:
# pandas-only alternative: fill each NaN with the mean of its own column
df.fillna(value=df.mean())

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.5,8.0
2,10.0,11.0,12.0,6.0


In [11]:
# Toy dataset mixing a nominal feature (color), an ordinal feature (size),
# a numeric feature (price) and a string class label
df2 = pd.DataFrame(
    [['green', 'M', 10.1, 'class2'],
     ['red', 'L', 13.5, 'class1'],
     ['blue', 'XL', 15.3, 'class2']],
    columns=['color', 'size', 'price', 'classlabel'],
)
df2

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2


In [12]:
# Ordinal features can be encoded as integers that preserve their order
# (M < L < XL)
size_mapping = {'XL': 3, 'L': 2, 'M': 1}

In [13]:
# Replace the size labels in df2 with their integer ranks from size_mapping.
# NOTE: this overwrites the column in place. Series.map with a dict yields NaN
# for values that are not keys of the dict, so running this cell a second time
# (when the column already holds 1/2/3 rather than 'M'/'L'/'XL') would turn
# the whole column into NaN.
df2['size'] = df2['size'].map(size_mapping)
df2

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class2
1,red,2,13.5,class1
2,blue,3,15.3,class2


In [14]:
# Flip the mapping (rank -> label) to recover the original size strings
# from their integer codes; df2 itself is not modified here
inv_size_mapping = dict(zip(size_mapping.values(), size_mapping.keys()))
df2['size'].map(inv_size_mapping)

0     M
1     L
2    XL
Name: size, dtype: object

In [15]:
# Assign every distinct class label an integer code. np.unique returns the
# labels sorted, so the codes follow that sorted order.
class_mapping = dict(
    (label, idx)
    for idx, label in enumerate(np.unique(df2['classlabel']))
)
class_mapping

{'class1': 0, 'class2': 1}

In [16]:
# Replace the string class labels with their integer codes from class_mapping
# (this overwrites the 'classlabel' column of df2 in place)
df2['classlabel'] = df2['classlabel'].map(class_mapping)
df2

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,1
1,red,2,13.5,0
2,blue,3,15.3,1


In [17]:
# Undo the encoding: flip class_mapping (code -> label) and map the
# 'classlabel' column back to its original strings
inv_class_mapping = dict(zip(class_mapping.values(), class_mapping.keys()))
df2['classlabel'] = df2['classlabel'].map(inv_class_mapping)
df2

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class2
1,red,2,13.5,class1
2,blue,3,15.3,class2


In [18]:
# scikit-learn's LabelEncoder performs the same label -> integer encoding
from sklearn.preprocessing import LabelEncoder
# fit() learns the distinct labels and returns the encoder itself
class_1e = LabelEncoder().fit(df2['classlabel'].values)
y = class_1e.transform(df2['classlabel'].values)
y

array([1, 0, 1])

In [19]:
# Map the integer codes in y back to the original class label strings
class_1e.inverse_transform(y)

array(['class2', 'class1', 'class2'], dtype=object)

In [20]:
# Integer-encode the color strings held in the first column of the
# feature array
X = df2.loc[:, ['color', 'size', 'price']].values
color_1e = LabelEncoder()
X[:, 0] = color_1e.fit(X[:, 0]).transform(X[:, 0])
X

array([[1, 1, 10.1],
       [2, 2, 13.5],
       [0, 3, 15.3]], dtype=object)

In [21]:
# One-hot encode the color column: each color (blue, green, red) gets its own
# binary column, so no artificial ordering between the colors is implied
from sklearn.preprocessing import OneHotEncoder
X = df2.loc[:, ['color', 'size', 'price']].values
color_one = OneHotEncoder()
# Indexing with [0] keeps the selection two-dimensional (n_rows x 1)
color_one.fit_transform(X[:, [0]]).toarray()

array([[0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])

In [22]:
# Apply the one-hot encoder to the color column only and pass the remaining
# features (size and price) through unchanged, all in a single transformer
from sklearn.compose import ColumnTransformer
X = df2.loc[:, ['color', 'size', 'price']].values
c_transf = ColumnTransformer(transformers=[
    ('onehot', OneHotEncoder(), [0]),      # column 0: color
    ('nothing', 'passthrough', [1, 2]),    # columns 1-2: size, price
])
c_transf.fit_transform(X).astype(float)

array([[ 0. ,  1. ,  0. ,  1. , 10.1],
       [ 0. ,  0. ,  1. ,  2. , 13.5],
       [ 1. ,  0. ,  0. ,  3. , 15.3]])

In [23]:
# A much simpler route than wiring the encoders up by hand:
# pandas' get_dummies one-hot encodes the string column for us
pd.get_dummies(data=df2[['color', 'size', 'price']])

Unnamed: 0,size,price,color_blue,color_green,color_red
0,1,10.1,False,True,False
1,2,13.5,False,False,True
2,3,15.3,True,False,False


In [24]:
# Dropping the first dummy column reduces the correlation between the dummy
# columns: when both color_green and color_red are 0, the color must be blue
pd.get_dummies(data=df2[['color', 'size', 'price']], drop_first=True)

Unnamed: 0,size,price,color_green,color_red
0,1,10.1,True,False
1,2,13.5,False,True
2,3,15.3,False,False


In [25]:
# Same idea through a column transformer: drop the first one-hot column.
# That column is redundant because its value is implied by the other
# one-hot columns, so it carries no additional information.
color_ohe = OneHotEncoder(drop='first', categories='auto')
c_transf = ColumnTransformer(transformers=[
    ('onehot', color_ohe, [0]),
    ('nothing', 'passthrough', [1, 2]),
])
c_transf.fit_transform(X).astype(float)

array([[ 1. ,  0. ,  1. , 10.1],
       [ 0. ,  1. ,  2. , 13.5],
       [ 0. ,  0. ,  3. , 15.3]])

In [26]:
# Load the wine dataset from the UCI repository; header=None because the
# column names are assigned separately
df_wine = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data',
    header=None,
)

In [27]:
# Name the columns (the data was read with header=None), then show the
# distinct class labels and the first 5 rows of the dataframe.
# Fix: the fifth column is 'Alcalinity of ash' (previously misspelled as
# 'Alcalinity of ssh').
df_wine.columns = [
    'Class label', 'Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash',
    'Magnesium', 'Total phenols', 'Flavanoids', 'Nonflavanoid phenols',
    'Proanthocyanins', 'Color intensity', 'Hue',
    'OD280/OD315 of diluted wines', 'Proline',
]
print('Class labels', np.unique(df_wine['Class label']))
df_wine.head()

Class labels [1 2 3]


Unnamed: 0,Class label,Alcohol,Malic acid,Ash,Alcalinity of ssh,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [28]:
# Randomly partition the dataset into separate training and test sets
from sklearn.model_selection import train_test_split
# X = descriptive features: all rows, every column from index 1 onwards
# y = target feature: all rows of the first column (the class label)
# .values converts each selection into a NumPy array for use with the
# machine learning libraries
X = df_wine.iloc[:, 1:].values
y = df_wine.iloc[:, 0].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,    # 30% goes to X_test / y_test; the remaining 70% is used for training
    random_state=0,   # fixed seed so the split is reproducible
    stratify=y,       # keep the class proportions of y in both splits
)

In [29]:
# Min-max normalization. The scaler is fitted on the training data only and
# the same fitted scaler is then applied to both the training and test sets.
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler().fit(X_train)
X_train_norm = mms.transform(X_train)
X_test_norm = mms.transform(X_test)

In [31]:
# Standardization with StandardScaler. As with min-max scaling, the scaler is
# fitted on the training data only and then applied to both sets.
from sklearn.preprocessing import StandardScaler
stdsc = StandardScaler().fit(X_train)
X_train_std = stdsc.transform(X_train)
X_test_std = stdsc.transform(X_test)