In [17]:
# Data Preprocessing - Titanic Dataset
## Day 02 of 30-Day ML/AI Challenge

import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [18]:
# Load the Titanic passenger data from the working directory.
# read_csv already returns a fresh DataFrame, so no defensive copy is needed.
titanic = pd.read_csv('titanic.csv')

# Inspect the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
titanic.info()
print(titanic.describe())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None
       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008   
std     257.353842    0.48659

In [19]:
# Impute the gaps in 'Age' and 'Embarked'.
# Both fill values are computed up front, before anything is written back.
age_fill = titanic['Age'].median()             # median of the 'Age' column
embarked_fill = titanic['Embarked'].mode()[0]  # first entry of the mode result

titanic['Age'] = titanic['Age'].fillna(age_fill)
titanic['Embarked'] = titanic['Embarked'].fillna(embarked_fill)


In [20]:
# Rescale the 'Age' and 'Fare' columns with a default-configured MinMaxScaler.
# NOTE(review): the scaler is fitted on every row, ahead of the train/test
# split performed in a later cell, so rows that end up in the test set also
# influence the scaling range. Confirm this is acceptable, or fit on the
# training rows only.
scaled_columns = ['Age', 'Fare']
scaler = MinMaxScaler()
titanic[scaled_columns] = scaler.fit_transform(titanic[scaled_columns])


In [21]:
# Encode the categorical columns as numbers.

# 'Sex' becomes a 0/1 flag; any value other than these two keys maps to NaN.
sex_codes = {'male': 0, 'female': 1}
titanic['Sex'] = titanic['Sex'].map(sex_codes)

# 'Embarked' is expanded into indicator columns; drop_first=True omits the
# first category's column, which is implied when all the others are False.
titanic = pd.get_dummies(titanic, drop_first=True, columns=['Embarked'])


In [22]:
# Separate features and target.
# Besides the target, leave out columns an estimator cannot use as-is:
# 'PassengerId' is a row identifier, 'Name' and 'Ticket' are string columns,
# and 'Cabin' is a mostly-missing string column that the earlier cells never
# impute or encode. The `titanic` frame itself is left untouched.
non_feature_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin']
X = titanic.drop(columns=['Survived'] + non_feature_columns)
y = titanic['Survived']

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [23]:
# Preview the first rows under a heading; sep="\n" puts the heading and the
# table on separate lines, exactly as two consecutive print calls would.
print("Preprocessed Data:", titanic.head(), sep="\n")

# Persist the preprocessed frame for further use; index=False keeps the row
# index out of the CSV file.
titanic.to_csv(path_or_buf="preprocessed_titanic.csv", index=False)

Preprocessed Data:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name  Sex       Age  SibSp  \
0                            Braund, Mr. Owen Harris    0  0.271174      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...    1  0.472229      1   
2                             Heikkinen, Miss. Laina    1  0.321438      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)    1  0.434531      1   
4                           Allen, Mr. William Henry    0  0.434531      0   

   Parch            Ticket      Fare Cabin  Embarked_Q  Embarked_S  
0      0         A/5 21171  0.014151   NaN       False        True  
1      0          PC 17599  0.139136   C85       False       False  
2      0  STON/O2. 3101282  0.015469   NaN       False        True  
3      0           