In [2]:
import pandas as pd

# load the titanic dataset
df=pd.read_csv('titanic/train.csv')

# display the first five rows
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# inspect the data set
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [8]:
# check the specific missing values in the dataset
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [9]:
df.isnull()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,False,False,False,False,False,False,False,False,False,False,True,False
1,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,True,False
3,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...
886,False,False,False,False,False,False,False,False,False,False,True,False
887,False,False,False,False,False,False,False,False,False,False,False,False
888,False,False,False,False,False,True,False,False,False,False,True,False
889,False,False,False,False,False,False,False,False,False,False,False,False


In [10]:
# get summary statistics for the numerical columns
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [11]:
# we want to handle missing values
# we have two options:
# 1. Drop the rows with missing values
# 2. Fill the missing values with a specific value (mean, median, mode, etc.)
# let's drop the rows with missing values
df.drop(columns=['Cabin'],inplace=True)
# check the missing values again
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Embarked         2
dtype: int64

In [12]:
# # we want to fill the missing values in the Age column with the meandian age For Age, we can fill missing values with the median age (since the median is less sensitive to outliers than the mean).
# For Embarked, we can fill missing values with the most frequent value (mode).
df['Age'].fillna(df['Age'].median(),inplace=True)

df['Embarked'].fillna(df['Embarked'].mode()[0],inplace=True)
# check the missing values again
df.isnull().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(),inplace=True)


PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [15]:
# handle duplicates 
# frist we need to check if there are any duplicates in the dataset
duplicates = df.duplicated().sum()
print(f"Number of duplicates: {duplicates}")
# if there are duplicates, we can drop them
if duplicates > 0:
    df.drop_duplicates(inplace=True)

Number of duplicates: 0


In [16]:
# handle outliers
# we can use the IQR method to detect outliers
# Calculate IQR for 'Fare'
Q1 = df['Fare'].quantile(0.25)
Q3 = df['Fare'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Filter out outliers
df = df[(df['Fare'] >= lower_bound) & (df['Fare'] <= upper_bound)]

# Verify
df.describe()


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,775.0,775.0,775.0,775.0,775.0,775.0,775.0
mean,445.806452,0.339355,2.48,28.74871,0.437419,0.340645,17.822091
std,260.116285,0.473796,0.73439,12.782123,0.899838,0.785914,13.578085
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,213.5,0.0,2.0,22.0,0.0,0.0,7.8958
50%,450.0,0.0,3.0,28.0,0.0,0.0,13.0
75%,670.5,1.0,3.0,34.0,1.0,0.0,26.0
max,891.0,1.0,3.0,80.0,5.0,6.0,65.0


In [17]:
# convert categorical variables to numerical variables for easier analysis

df['Sex']=df['Sex'].map({'male':0,'female':1})

# convert the Embarked column to numerical values using one-hot encoding
df=pd.get_dummies(df, columns=['Embarked'], drop_first=True)
# check the first five rows of the dataset
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,False,True
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,False,True
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,False,True
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,False,True
5,6,0,3,"Moran, Mr. James",0,28.0,0,0,330877,8.4583,True,False


In [27]:
import os

# Print the current working directory
print("Current Working Directory:", os.getcwd())

# Print the full output path
output_path = os.path.join(os.getcwd(), "data", "cleaned_titanic.csv")
print("Output Path:", output_path)

Current Working Directory: c:\Users\Jason\Documents\personal work\3month course on ML\data manipulation
Output Path: c:\Users\Jason\Documents\personal work\3month course on ML\data manipulation\data\cleaned_titanic.csv


In [32]:
import os

# Define the output path
output_path = os.path.join(os.getcwd(), "data", "cleaned_titanic.csv")

# Ensure the directory exists
directory = os.path.dirname(output_path)
if directory:  # Check if the directory path is not empty
	os.makedirs(directory, exist_ok=True)

# Save the file using the existing dataframe
df.to_csv(output_path, index=False)

print(f"File saved successfully at: {output_path}")

FileNotFoundError: [WinError 2] The system cannot find the file specified: 'c:\\Users\\Jason\\Documents\\personal work\\3month course on ML\\data manipulation\\data'

In [39]:
import os

# Define the output path
output_path = r'C:\Users\Jason\Documents\personal work\3month course on ML\data manipulation\cleaned_titanic.csv'

# Ensure the directory exists
directory = os.path.dirname(output_path)
if directory:  # Check if the directory path is not empty
	os.makedirs(directory, exist_ok=True)

# Save the file using the existing dataframe
df.to_csv(output_path, index=False)

print(f"File saved successfully at: {output_path}")

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\Jason\\Documents\\personal work\\3month course on ML\\data manipulation\\cleaned_titanic.csv'