In [2]:
import pandas as pd
import seaborn as sns
import numpy as np

# Load Titanic dataset
df = sns.load_dataset('titanic')

In [3]:
# ---------------- 1. Identify and handle missing values ----------------
print("Missing values before:")
print(df.isnull().sum())

Missing values before:
survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64


In [6]:
# Fill missing age with median, embarked with mode, deck with "Unknown"
df['age'].fillna(df['age'].median(), inplace=True)
df['embarked'].fillna(df['embarked'].mode()[0], inplace=True)
df['embark_town'].fillna('Unknown', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['age'].fillna(df['age'].median(), inplace=True)


In [7]:
# ---------------- 2. Remove duplicate rows ----------------
df.drop_duplicates(inplace=True)


In [8]:
# ---------------- 3. Standardize text values ----------------
# Example: make 'sex' and 'embarked' lowercase
df['sex'] = df['sex'].str.lower()
df['embarked'] = df['embarked'].str.upper()

In [9]:
# Standardizing 'embark_town' values
df['embark_town'] = df['embark_town'].replace({
    'Southampton': 'southampton',
    'Cherbourg': 'cherbourg',
    'Queenstown': 'queenstown',
    'Unknown': 'unknown'
})

In [10]:
# ---------------- 4. Convert date formats ----------------
# Simulate a date column for demo purposes
df['fake_dob'] = pd.to_datetime('1900-01-01') + pd.to_timedelta(np.random.randint(0, 30000, size=len(df)), unit='D')

# Convert date format to dd-mm-yyyy string format
df['fake_dob'] = df['fake_dob'].dt.strftime('%d-%m-%Y')

In [11]:
# ---------------- 5. Rename column headers ----------------
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

# ---------------- 6. Check and fix data types ----------------
# Age should be int
df['age'] = df['age'].astype(int)

In [12]:
# Convert fake_dob back to datetime
df['fake_dob'] = pd.to_datetime(df['fake_dob'], format='%d-%m-%Y')

In [13]:
# Final dataset preview
print("\nCleaned DataFrame Head:")
print(df.head())


Cleaned DataFrame Head:
   survived  pclass     sex  age  sibsp  parch     fare embarked  class  \
0         0       3    male   22      1      0   7.2500        S  Third   
1         1       1  female   38      1      0  71.2833        C  First   
2         1       3  female   26      0      0   7.9250        S  Third   
3         1       1  female   35      1      0  53.1000        S  First   
4         0       3    male   35      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone   fake_dob  
0    man        True  NaN  southampton    no  False 1944-10-23  
1  woman       False    C    cherbourg   yes  False 1956-03-05  
2  woman       False  NaN  southampton   yes   True 1946-09-09  
3  woman       False    C  southampton   yes  False 1923-06-10  
4    man        True  NaN  southampton    no   True 1957-05-16  


In [14]:
print("\nData types after cleaning:")
print(df.dtypes)


Data types after cleaning:
survived                int64
pclass                  int64
sex                    object
age                     int32
sibsp                   int64
parch                   int64
fare                  float64
embarked               object
class                category
who                    object
adult_male               bool
deck                 category
embark_town            object
alive                  object
alone                    bool
fake_dob       datetime64[ns]
dtype: object
