# Data Preprocessing Lab
by Wilmer Garzón, last updated: 26-June-2025

In this lab, you will work on preprocessing the Titanic dataset. Follow the instructions and complete the tasks.

## 1. Load the Data

Load the Titanic dataset into a pandas DataFrame.

In [None]:
import pandas as pd

# Load the dataset
url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
data = pd.read_csv(url)

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


## 2. Data Cleaning

### a. Handle Missing Values
- Identify and handle missing values in the dataset.
- Filla some NaN values by: mean and mode

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer

# Check for missing values
# Check the condition of the missing values
missing_values = data.isnull().sum()
print(missing_values)

# Embarked we will fill the missing values with mode of the same column
mode_embarked = data['Embarked'].mode()[0]
data['Embarked'].fillna(mode_embarked, inplace=True)

# Age, lets check what number will be the average before and after if the missing values was adjusted byy the mean too
mean_age = data['Age'].mean()
print(f"Mean age: {mean_age}")
mean_age_replace_2 = data['Age'].fillna(mean_age).mean()
print(f"Mean age after replace: {mean_age_replace_2}")

# After seeing that dont change nothing using the mean in the age, so lets replace the missing values in age for the respective mean
mean_age = data['Age'].mean()
data['Age'].fillna(mean_age, inplace=True)

# Now lets see how much information it have inside of the columns cabin. Show the existing values inside cabin
cabin = data['Cabin']
print(cabin.unique())
# For 981 values possibles Cabin misses 67, its a lot, lets try replace using KNN, trying tho use others parameters as reference too
# data['Has_Cabin'] = data['Cabin'].notnull().astype(int)
# impute_cols = ['Age', 'Fare', 'Pclass', 'SibSp', 'Parch', 'Has_Cabin']

# imputer = KNNImputer(n_neighbors=10)

# data_impute = data[impute_cols]

# data_imputed = imputer.fit_transform(data_impute)

# # Replace the original data
# for i, col in enumerate(impute_cols):
#     data[col] = data_imputed[:, i]

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
Mean age: 29.69911764705882
Mean age after replace: 29.69911764705882
[nan 'C85' 'C123' 'E46' 'G6' 'C103' 'D56' 'A6' 'C23 C25 C27' 'B78' 'D33'
 'B30' 'C52' 'B28' 'C83' 'F33' 'F G73' 'E31' 'A5' 'D10 D12' 'D26' 'C110'
 'B58 B60' 'E101' 'F E69' 'D47' 'B86' 'F2' 'C2' 'E33' 'B19' 'A7' 'C49'
 'F4' 'A32' 'B4' 'B80' 'A31' 'D36' 'D15' 'C93' 'C78' 'D35' 'C87' 'B77'
 'E67' 'B94' 'C125' 'C99' 'C118' 'D7' 'A19' 'B49' 'D' 'C22 C26' 'C106'
 'C65' 'E36' 'C54' 'B57 B59 B63 B66' 'C7' 'E34' 'C32' 'B18' 'C124' 'C91'
 'E40' 'T' 'C128' 'D37' 'B35' 'E50' 'C82' 'B96 B98' 'E10' 'E44' 'A34'
 'C104' 'C111' 'C92' 'E38' 'D21' 'E12' 'E63' 'A14' 'B37' 'C30' 'D20' 'B79'
 'E25' 'D46' 'B73' 'C95' 'B38' 'B39' 'B22' 'C86' 'C70' 'A16' 'C101' 'C68'
 'A10' 'E68' 'B41' 'A20' 'D19' 'D50' '

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Embarked'].fillna(mode_embarked, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Age'].fillna(mean_age, inplace=True)


In [None]:
# check missing values after imputation
missing_values = data.isnull().sum()
print(missing_values)

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
Has_Cabin        0
dtype: int64


In [None]:
# Decide to delete the column cabin
data_without_cabin = data.drop('Cabin', axis=1)
data_without_cabin.info()

data_without_cabin.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    float64
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    float64
 7   Parch        891 non-null    float64
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Embarked     891 non-null    object 
 11  Has_Cabin    891 non-null    float64
dtypes: float64(6), int64(2), object(4)
memory usage: 83.7+ KB


Unnamed: 0,0
PassengerId,0
Survived,0
Pclass,0
Name,0
Sex,0
Age,0
SibSp,0
Parch,0
Ticket,0
Fare,0


### b. Remove Duplicates

Check for and remove any duplicate rows.

In [None]:
# Check for duplicates
# Remove values duplicates considering duplicate only if it happens in all the features
data_without_cabin = data_without_cabin.drop_duplicates()
data_without_cabin.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    float64
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    float64
 7   Parch        891 non-null    float64
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Embarked     891 non-null    object 
 11  Has_Cabin    891 non-null    float64
dtypes: float64(6), int64(2), object(4)
memory usage: 83.7+ KB


## 3. Data Transformation

### a. Encode Categorical Variables

Convert categorical variables into numerical values using one-hot encoding.

In [None]:
# Encode categorical variables
# Convert categorical variables into numerical values using one-hot encoding. How many variables categorial it have
categorical_variables = data_without_cabin.select_dtypes(include=['object'])
print(categorical_variables.columns)

# Let convert all of them into numerial using one-hot encoding
data_encoded = pd.get_dummies(data_without_cabin, columns=categorical_variables.columns)
data_encoded.info()

# Replace the data encoded in their values in data without cabin
data_without_cabin = data_encoded


Index([], dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Columns: 1585 entries, PassengerId to Embarked_S
dtypes: bool(1577), float64(6), int64(2)
memory usage: 1.4 MB


### b. Normalize Numerical Variables

Normalize numerical variables to have a mean of 0 and a standard deviation of 1.

In [None]:
# Normalize numerical variables
# Normalize numerical variables to have a mean of 0 and a standard deviation of 1
from sklearn.preprocessing import MinMaxScaler
data_encoded = data_without_cabin.copy()
numeric_cols = data_encoded.select_dtypes(include=['int64', 'float64']).columns
scaler = MinMaxScaler()
data_encoded[numeric_cols] = scaler.fit_transform(data_encoded[numeric_cols])

print(data_encoded[numeric_cols].head())


   PassengerId  Survived  Pclass       Age  SibSp  Parch      Fare  Has_Cabin
0     0.000000       0.0     1.0  0.271174  0.125    0.0  0.014151        0.0
1     0.001124       1.0     0.0  0.472229  0.125    0.0  0.139136        1.0
2     0.002247       1.0     1.0  0.321438  0.000    0.0  0.015469        0.0
3     0.003371       1.0     0.0  0.434531  0.125    0.0  0.103644        1.0
4     0.004494       0.0     1.0  0.434531  0.000    0.0  0.015713        0.0


## 4. Feature Selection

Select relevant features for the model.

In [None]:
# Select features
# Ho the features are correlated to the Survived
import matplotlib.pyplot as plt

# Correlation
correlation = data_encoded.corr()['Survived'].sort_values(ascending=False)
print(correlation)

Survived           1.000000
Sex_female         0.543351
Has_Cabin          0.316912
Fare               0.257307
Embarked_C         0.168240
                     ...   
Ticket_347082     -0.070234
Ticket_CA. 2343   -0.070234
Embarked_S        -0.149683
Pclass            -0.338481
Sex_male          -0.543351
Name: Survived, Length: 1585, dtype: float64


## 5. Split the Data

Split the dataset into training and testing sets.

In [None]:
# Split the data
from sklearn.model_selection import train_test_split

# Split the data 80% into training and 20% testing sets
x_train, x_test, y_train, y_test = train_test_split(data_encoded.drop('Survived', axis=1), data_encoded['Survived'], test_size=0.2, random_state=42)
