# Import Libraries

## For Preprocessing and encoding data

In [6]:
import sklearn
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer

from sklearn import set_config

set_config(transform_output = 'pandas') #Returns any data created by sklearn as a pandas compatible data structure such as a DataFrame


## For Ordinal Encoding

In [19]:
from numpy import asarray

## For imputing missing values

In [35]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

## For creating a pipeline

In [60]:
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

# Scaling

It's kind of like normalizing pixel values in Computer Vision. It basically just changes the ranges of the values of the dataset. For example, values in the range 0-255 can be converted to 0-1 by dividing each value by 255.

In [7]:
tiny_data = np.array([[1.,-1.,2],
                     [2., 0., 0.],
                     [ 0., 1., -1.]])

scaler = StandardScaler().fit(tiny_data)
scaler

## Seeing information of the new scaled data

### Mean

In [12]:
scaler.mean_

array([1.        , 0.        , 0.33333333])

### Standard deviation

In [14]:
scaler.scale_

array([0.81649658, 0.81649658, 1.24721913])

## Performing the scaling on the dataset values after doing the scaler operations

There is another way to do the scaling and transforming at the same time in one operation, it uses the `fit_transform()`

In [15]:
X_scaled = scaler.transform(tiny_data)

## Information of dataset after being transformed according to the scalling

### Mean

In [16]:
X_scaled.mean()

x0    0.0
x1    0.0
x2    0.0
dtype: float64

### Standard Deviation

In [17]:
X_scaled.std()

x0    1.224745
x1    1.224745
x2    1.224745
dtype: float64

# Encoding values

Encoding is used for transforming categorical data into numeric data. Is useful in many cases such as machine learning.

## Ordinal Encoding

Basically converts each category into a unique integer. Can be very helpful in saving space. Can use a hashmap or even a normal array of strings to map the encodings back to their original values.

### Using Numpy

In [22]:
data = asarray([['data'],['wrangling'],['rocks']])
print(data)

[['data']
 ['wrangling']
 ['rocks']]


### Using Sklearn

#### Creating the ordinal encoder

In [24]:
encoder = OrdinalEncoder()

#### Transforming and fitting the data

In [25]:
encoder.fit_transform(data)

Unnamed: 0,x0
0,0.0
1,2.0
2,1.0


## One Hot Encoding

One hot encoding transform a categorical data into a list and represents the data sort of like a binary value. There is only a single 1 in that list. If the 1 corresponds to a certain category then that data falls under that category.

### Defining one hot encoder

In [31]:
encoder = OneHotEncoder(sparse_output=False)#sparse_output is used to try to compress the large number of zeros in one hot encoding to save on space

### Fitting and transforming the data

In [32]:
encoder.fit_transform(data)

Unnamed: 0,x0_data,x0_rocks,x0_wrangling
0,1.0,0.0,0.0
1,0.0,0.0,1.0
2,0.0,1.0,0.0


# Imputing missing data

## Importing the titanic dataset from openml

In [36]:
x,y = fetch_openml('titanic', version=1, as_frame=True, return_X_y=True, parser='auto')

x_train, x_test, y_train, y_test = train_test_split(x,y,stratify=y)

## Seeing basic information about the dataset

In [39]:
x.head()

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [40]:
y_train.head()

226     0
358     1
879     0
266     0
1119    0
Name: survived, dtype: category
Categories (2, object): ['0', '1']

In [41]:
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 13 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   pclass     1309 non-null   int64   
 1   name       1309 non-null   object  
 2   sex        1309 non-null   category
 3   age        1046 non-null   float64 
 4   sibsp      1309 non-null   int64   
 5   parch      1309 non-null   int64   
 6   ticket     1309 non-null   object  
 7   fare       1308 non-null   float64 
 8   cabin      295 non-null    object  
 9   embarked   1307 non-null   category
 10  boat       486 non-null    object  
 11  body       121 non-null    float64 
 12  home.dest  745 non-null    object  
dtypes: category(2), float64(3), int64(3), object(5)
memory usage: 115.4+ KB


## Checking for missing data in the test set

In [44]:
missing = x_test.isnull().sum() #Counts all the null values in every column
missing = missing[missing > 0 ].sort_values(ascending=False) #Removes any column that has 0 null values

In [45]:
missing

body         289
cabin        262
boat         206
home.dest    141
age           64
fare           1
dtype: int64

## Imputing any missing datausing `SimpleImputer()` with the mean

Just for the `age` and `body` columns

In [46]:
simple_imp = SimpleImputer(missing_values =np.nan, strategy='mean')
simple_imputed = simple_imp.fit_transform(x_test[['age','body']])

## Replacing the imputed columns with the original columns with missing data

In [47]:
x_test[['age','body']] = simple_imputed

## Confirming the missing data is gone from the two columns

In [48]:
missing = x_test.isnull().sum() #Counts all the null values in every column
missing = missing[missing > 0 ].sort_values(ascending=False) #Removes any column that has 0 null values
missing

cabin        262
boat         206
home.dest    141
fare           1
dtype: int64

The missing data has been removed

## Creating a pipeline for the entire process

In [51]:
x_train

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
226,1,"Pears, Mr. Thomas Clinton",male,29.0,1,0,113776,66.6000,C2,S,,,"Isleworth, England"
358,2,"Bystrom, Mrs. (Karolina)",female,42.0,0,0,236852,13.0000,,S,,,"New York, NY"
879,3,"Ivanoff, Mr. Kanio",male,,0,0,349201,7.8958,,S,,,
266,1,"Smart, Mr. John Montgomery",male,56.0,0,0,113792,26.5500,,S,,,"New York, NY"
1119,3,"Perkin, Mr. John Henry",male,22.0,0,0,A/5 21174,7.2500,,S,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1228,3,"Stranden, Mr. Juho",male,31.0,0,0,STON/O 2. 3101288,7.9250,,S,9,,
1037,3,"Moussa, Mrs. (Mantoura Boulos)",female,,0,0,2626,7.2292,,C,,,
1153,3,"Robins, Mrs. Alexander A (Grace Charity Laury)",female,47.0,1,0,A/5. 3337,14.5000,,S,,7.0,
89,1,"Davidson, Mr. Thornton",male,31.0,1,0,F.C. 12750,52.0000,B71,S,,,"Montreal, PQ"


### Creating the column transformer

* The `age` and `fare` columns' missing values are imputed
* The `embarked`, `sex` and `pclass` categorical columns are encoded with one hot encoding

In [55]:
ct = make_column_transformer((make_pipeline(SimpleImputer(),StandardScaler()),['age','fare']),
                            (OneHotEncoder(sparse_output=False),["embarked","sex","pclass"]),
                            verbose_feature_names_out=False)

## Creating the pipline with logistic regression

In [61]:
clf = make_pipeline(ct, LogisticRegression())
clf

## Fitting the data

In [64]:
clf.fit(x_train,y_train)

## Testing the data

In [65]:
clf.score(x_test,y_test)

0.7774390243902439

## Removing logistic regression from the pipeline and performing the normal transformation operation

In [66]:
clf[:-1].transform(x_test)

Unnamed: 0,age,fare,embarked_C,embarked_Q,embarked_S,embarked_nan,sex_female,sex_male,pclass_1,pclass_2,pclass_3
666,1.163544,-0.366377,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1026,-0.210751,-0.402571,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
566,0.858145,-0.355559,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
652,-0.516150,-0.487072,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
1138,0.036542,-0.498121,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
589,-0.058052,-0.210100,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
748,0.323697,-0.367369,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
891,0.247347,-0.472442,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
1104,-1.050598,0.095064,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
