# Preprocessing

In [4]:
# imports and configs
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.impute import SimpleImputer


PROJECT_ROOT = f"{os.path.expanduser('~')}/project/AIAA5030_Data_Mining_Group13_Visualization"
DATA_ROOT = f"{PROJECT_ROOT}/data"


## Titanic-Dataset

Use Pclass, Sex, Age, SibSp, Parch, Fare, and Embarked features of Titantic dataset:

• Process each feature and make their values in the range 0 to 1

In [6]:
# load Titanic dataset
Titantic_dataset = pd.read_csv(f'{DATA_ROOT}/Titanic-Dataset.csv')
Titantic_dataset

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [7]:
# using min-max scaling to process Pclass
Pclass_min_max_scaler = MinMaxScaler()
Titantic_dataset['Pclass'] = Pclass_min_max_scaler.fit_transform(Titantic_dataset[['Pclass']])

# using label encoding to process Sex
Sex_label_encoder = LabelEncoder()
Titantic_dataset['Sex'] = Sex_label_encoder.fit_transform(Titantic_dataset['Sex'])

# using min-max scaling to process Age, SibSp, Parch, Fare, Embarked
Age_min_max_scaler = MinMaxScaler()
Titantic_dataset['Age'] = Age_min_max_scaler.fit_transform(Titantic_dataset[['Age']])
SibSp_min_max_scaler = MinMaxScaler()
Titantic_dataset['SibSp'] = SibSp_min_max_scaler.fit_transform(Titantic_dataset[['SibSp']])
Parch_min_max_scaler = MinMaxScaler()
Titantic_dataset['Parch'] = Parch_min_max_scaler.fit_transform(Titantic_dataset[['Parch']])
Fare_min_max_scaler = MinMaxScaler()
Titantic_dataset['Fare'] = Fare_min_max_scaler.fit_transform(Titantic_dataset[['Fare']])

# using label encoding + min-max scaling to process Embarked
Embarked_label_encoder = LabelEncoder()
Titantic_dataset['Embarked'] = Embarked_label_encoder.fit_transform(Titantic_dataset['Embarked'])
Embarked_min_max_scaler = MinMaxScaler()
Titantic_dataset['Embarked'] = Embarked_min_max_scaler.fit_transform(Titantic_dataset[['Embarked']])

Titantic_dataset

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,1.0,"Braund, Mr. Owen Harris",1,0.271174,0.125,0.000000,A/5 21171,0.014151,,0.666667
1,2,1,0.0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,0.472229,0.125,0.000000,PC 17599,0.139136,C85,0.000000
2,3,1,1.0,"Heikkinen, Miss. Laina",0,0.321438,0.000,0.000000,STON/O2. 3101282,0.015469,,0.666667
3,4,1,0.0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,0.434531,0.125,0.000000,113803,0.103644,C123,0.666667
4,5,0,1.0,"Allen, Mr. William Henry",1,0.434531,0.000,0.000000,373450,0.015713,,0.666667
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,0.5,"Montvila, Rev. Juozas",1,0.334004,0.000,0.000000,211536,0.025374,,0.666667
887,888,1,0.0,"Graham, Miss. Margaret Edith",0,0.233476,0.000,0.000000,112053,0.058556,B42,0.666667
888,889,0,1.0,"Johnston, Miss. Catherine Helen ""Carrie""",0,,0.125,0.333333,W./C. 6607,0.045771,,0.666667
889,890,1,0.0,"Behr, Mr. Karl Howell",1,0.321438,0.000,0.000000,111369,0.058556,C148,0.000000


• Fill missing value for each feature

In [8]:
from sklearn.impute import SimpleImputer, KNNImputer
filter_NaN_feature_name = Titantic_dataset.columns[Titantic_dataset.isna().any()].tolist()
print("Features with NaN values:", filter_NaN_feature_name)

Features with NaN values: ['Age', 'Cabin']


In [9]:
# fill missing value for Age and Cabin features
# age_imputer = SimpleImputer(strategy='mean')
age_imputer = KNNImputer(n_neighbors=5, weights='uniform')
Titantic_dataset[['Age']] = age_imputer.fit_transform(Titantic_dataset[['Age']])
Cabin_imputer = SimpleImputer(strategy='most_frequent')
Titantic_dataset[['Cabin']] = Cabin_imputer.fit_transform(Titantic_dataset[['Cabin']])
Titantic_dataset

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,1.0,"Braund, Mr. Owen Harris",1,0.271174,0.125,0.000000,A/5 21171,0.014151,B96 B98,0.666667
1,2,1,0.0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,0.472229,0.125,0.000000,PC 17599,0.139136,C85,0.000000
2,3,1,1.0,"Heikkinen, Miss. Laina",0,0.321438,0.000,0.000000,STON/O2. 3101282,0.015469,B96 B98,0.666667
3,4,1,0.0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,0.434531,0.125,0.000000,113803,0.103644,C123,0.666667
4,5,0,1.0,"Allen, Mr. William Henry",1,0.434531,0.000,0.000000,373450,0.015713,B96 B98,0.666667
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,0.5,"Montvila, Rev. Juozas",1,0.334004,0.000,0.000000,211536,0.025374,B96 B98,0.666667
887,888,1,0.0,"Graham, Miss. Margaret Edith",0,0.233476,0.000,0.000000,112053,0.058556,B42,0.666667
888,889,0,1.0,"Johnston, Miss. Catherine Helen ""Carrie""",0,0.367921,0.125,0.333333,W./C. 6607,0.045771,B96 B98,0.666667
889,890,1,0.0,"Behr, Mr. Karl Howell",1,0.321438,0.000,0.000000,111369,0.058556,C148,0.000000


• Save the processed dataset as a new CSV file

In [10]:
# Save the processed dataset as a new CSV file
Titantic_dataset.to_csv(f'{DATA_ROOT}/Titanic-Dataset-Processed.csv', index=False)