In [1]:
import pandas as pd
import numpy as np

# Loading the dataset

In [2]:
# Read the Titanic CSV (path relative to the working directory) into the working DataFrame.
df = pd.read_csv(filepath_or_buffer='dataset/Titanic-Dataset.csv')

# Handling Null Values

In [3]:
# Check for null values (1 line)
display(df.isnull().sum())

# Fill null values for 'Embarked' with the mode (1 line)
# The result is assigned back instead of calling fillna(inplace=True) on
# df['Embarked']: an in-place call on a selected column is chained assignment,
# which recent pandas versions warn about and which does not update df when
# Copy-on-Write is enabled.
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

# Drop rows with null values in 'Cabin' (or alternatively, you can fill it with a placeholder) (1 line)
# NOTE(review): once these rows are dropped no null 'Cabin' remains, so the
# fill-based alternative later in this cell has nothing left to fill.
df = df.dropna(subset=['Cabin'])

## The alternative way
# For each row whose 'Cabin' is missing, look up the rows that share the same Sex, SibSp, Parch, Embarked, Pclass and Survived values and fill 'Cabin' with the most frequent (mode) cabin among them; if no such rows exist, or none of them has a known cabin, the value stays empty.
def fill_placeholder(row, frame=None):
    """Fill a missing 'Cabin' with the most common cabin of matching rows.

    Rows "match" when they have the same Sex, SibSp, Parch, Embarked, Pclass
    and Survived values as ``row``.

    Parameters
    ----------
    row : pandas.Series
        One row of the data, as passed by ``DataFrame.apply(..., axis=1)``.
    frame : pandas.DataFrame, optional
        Frame to search for matching rows. Defaults to the notebook-level
        ``df`` so the function can still be used as ``df.apply(fill_placeholder, axis=1)``.

    Returns
    -------
    pandas.Series
        ``row`` itself when its 'Cabin' is already known; otherwise a copy of
        ``row`` whose 'Cabin' is the mode of the matching rows' cabins, or
        ``None`` when there is no matching row with a known cabin.
    """
    if not pd.isnull(row['Cabin']):
        return row

    source = df if frame is None else frame

    # Build the "same passenger profile" mask one column at a time.
    matches = pd.Series(True, index=source.index)
    for key in ("Sex", "SibSp", "Parch", "Embarked", "Pclass", "Survived"):
        matches &= source[key] == row[key]

    # mode() skips nulls, so it is empty both when nothing matches and when
    # every matching row is itself missing a cabin.
    cabin_modes = source.loc[matches, "Cabin"].mode()

    # Work on a copy: mutating the object handed over by DataFrame.apply is
    # documented by pandas as unsupported.
    filled = row.copy()
    filled["Cabin"] = None if cabin_modes.empty else cabin_modes.iloc[0]
    return filled

# Run the row-wise filler over every row; apply(axis=1) rebuilds the frame
# from the rows the function returns.
df = df.apply(fill_placeholder, axis=1)

# Whatever is still missing gets the literal "Unknown" placeholder. The result
# is assigned back because fillna(inplace=True) on a selected column is chained
# assignment: recent pandas versions warn about it and it does not update df
# when Copy-on-Write is enabled.
df["Cabin"] = df["Cabin"].fillna("Unknown")


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

# Handling Duplicates

In [4]:
# Check for duplicates (1 line)
display(df.duplicated().sum())

# Drop duplicates if any (1 line)
# drop_duplicates() returns a new frame and leaves df untouched, so the result
# has to be assigned back for the removal to take effect.
df = df.drop_duplicates()
df

0

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7000,G6,S
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.5500,C103,S
...,...,...,...,...,...,...,...,...,...,...,...,...
871,872,1,1,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",female,47.0,1,1,11751,52.5542,D35,S
872,873,0,1,"Carlsson, Mr. Frans Olof",male,33.0,0,0,695,5.0000,B51 B53 B55,S
879,880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",female,56.0,0,1,11767,83.1583,C50,C
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S


# Handling Outliers

In [5]:
import numpy as np

# Function to remove outliers using the IQR method
def remove_outliers(df, column, factor=1.5):
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` (both
    inclusive), where Q1/Q3 are the 25th/75th percentiles of ``column``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    column : str
        Name of the numeric column to test.
    factor : float, default 1.5
        Multiple of the IQR used to place the fences.

    Returns
    -------
    pandas.DataFrame
        The filtered rows. Rows where ``column`` is null are dropped as well,
        because a null never compares as inside the fences.
    """
    first_quartile = df[column].quantile(0.25)
    third_quartile = df[column].quantile(0.75)
    reach = factor * (third_quartile - first_quartile)
    # between() is inclusive on both ends by default, matching >= / <=.
    inside = df[column].between(first_quartile - reach, third_quartile + reach)
    return df[inside]

# Remove outliers from 'Fare' using the remove_outliers function (1 line)
# remove_outliers returns a filtered frame and does not modify its argument, so
# the result is assigned back; .copy() makes df an independent frame so later
# column assignments do not act on a slice of the previous one.
df = remove_outliers(df, "Fare").copy()
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7000,G6,S
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.5500,C103,S
...,...,...,...,...,...,...,...,...,...,...,...,...
871,872,1,1,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",female,47.0,1,1,11751,52.5542,D35,S
872,873,0,1,"Carlsson, Mr. Frans Olof",male,33.0,0,0,695,5.0000,B51 B53 B55,S
879,880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",female,56.0,0,1,11767,83.1583,C50,C
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S


# Scaling and Normalization

In [6]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Standard scaling for 'Fare': fit the scaler on the column, then replace the
# column with its transformed values.
scaler = StandardScaler().fit(df[['Fare']])
df['Fare'] = scaler.transform(df[['Fare']])

# Min-Max scaling for 'Age': same two steps with a MinMaxScaler.
scaler = MinMaxScaler().fit(df[['Age']])
df['Age'] = scaler.transform(df[['Age']])

# Encoding Categorical Variables

In [7]:
# One-hot encode the 'Embarked' and 'Sex' columns into a new frame; df itself is left as is.
OHE_df = pd.get_dummies(data=df, columns=["Embarked", "Sex"])