In [2]:
# Exercise 1: Identifying and Handling Missing Data
import pandas as pd

# Toy records in which every column has at least one gap
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', None],
    'Age': [24, 30, None, 22, 35],
    'Salary': [48000, None, 57000, None, 60000]
}
df = pd.DataFrame(data)

# Impute the numeric gaps (Age -> column mean, Salary -> column median),
# then discard any row that still has no Name.
# The fill statistics are computed before the drop, so they include the
# values of the row that is removed afterwards.
fill_values = {
    'Age': df['Age'].mean(),
    'Salary': df['Salary'].median(),
}
df = df.fillna(value=fill_values).dropna(subset=['Name'])
print('After cleaning:\n', df)


After cleaning:
       Name    Age   Salary
0    Alice  24.00  48000.0
1      Bob  30.00  57000.0
2  Charlie  27.75  57000.0
3    David  22.00  57000.0


In [3]:
# Exercise 2: Standardizing Categorical Data
# The same category is spelled with different letter case across rows
data = {
    'Product': ['Laptop', 'Laptop', 'Desktop', 'Tablet', 'Tablet'],
    'Category': ['Electronics', 'electronics', 'Electronics', 'Gadgets', 'gadgets']
}

# Build the frame and normalise Category in one expression:
# capitalize() upper-cases the first character and lower-cases the rest,
# so 'electronics' and 'Electronics' collapse to the same label.
df = pd.DataFrame(data).assign(
    Category=lambda frame: frame['Category'].str.capitalize()
)
print('Standardized Data:\n', df)

Standardized Data:
    Product     Category
0   Laptop  Electronics
1   Laptop  Electronics
2  Desktop  Electronics
3   Tablet      Gadgets
4   Tablet      Gadgets


In [10]:
import kagglehub
import pandas as pd
import os

# Download dataset; the returned value is the local folder that holds the files
path = kagglehub.dataset_download("yasserh/titanic-dataset")

print("Path to dataset files:", path)
print("Files in dataset folder:", os.listdir(path))

# Load the dataset. os.path.join inserts the separator that matches the
# current operating system instead of hard-coding "/" into the path string.
csv_path = os.path.join(path, "Titanic-Dataset.csv")
df = pd.read_csv(csv_path)

# Check for missing values (count of nulls per column)
print("Missing values in each column:\n", df.isnull().sum())


Path to dataset files: /home/anaconda/.cache/kagglehub/datasets/yasserh/titanic-dataset/versions/1
Files in dataset folder: ['Titanic-Dataset.csv']
Missing values in each column:
 PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [7]:
!pip install kagglehub

Collecting kagglehub
  Downloading kagglehub-0.3.13-py3-none-any.whl.metadata (38 kB)
Downloading kagglehub-0.3.13-py3-none-any.whl (68 kB)
Installing collected packages: kagglehub
Successfully installed kagglehub-0.3.13


In [11]:
# Min-max scale the numeric columns of df

from sklearn.preprocessing import MinMaxScaler

# Columns stored as int64 or float64
num_cols = df.select_dtypes(include=['int64', 'float64']).columns

# Learn each column's min/max first, then rescale and write the values back
scaler = MinMaxScaler().fit(df[num_cols])
scaled_values = scaler.transform(df[num_cols])
df[num_cols] = scaled_values

# NOTE(review): the dtype-based selection also picks up PassengerId and
# Survived (see the printed columns), so the identifier and the label are
# rescaled as well; confirm that this is intended.
print("Normalized numerical columns:\n", df[num_cols].head())


Normalized numerical columns:
    PassengerId  Survived  Pclass       Age  SibSp  Parch      Fare
0     0.000000       0.0     1.0  0.271174  0.125    0.0  0.014151
1     0.001124       1.0     0.0  0.472229  0.125    0.0  0.139136
2     0.002247       1.0     1.0  0.321438  0.000    0.0  0.015469
3     0.003371       1.0     0.0  0.434531  0.125    0.0  0.103644
4     0.004494       0.0     1.0  0.434531  0.000    0.0  0.015713


In [12]:
# Lowercase every text column, then drop exact duplicate rows

# Text columns are the ones stored with object dtype
cat_cols = df.select_dtypes(include=['object']).columns

# Lowercase column by column through the vectorised .str accessor;
# missing entries stay missing
for col_name in cat_cols:
    df[col_name] = df[col_name].str.lower()

# Rows identical in every column collapse to their first occurrence
df = df.drop_duplicates()

print("Standardized categorical columns and removed duplicates.\n")
print(df.head())


Standardized categorical columns and removed duplicates.

   PassengerId  Survived  Pclass  \
0     0.000000       0.0     1.0   
1     0.001124       1.0     0.0   
2     0.002247       1.0     1.0   
3     0.003371       1.0     0.0   
4     0.004494       0.0     1.0   

                                                Name     Sex       Age  SibSp  \
0                            braund, mr. owen harris    male  0.271174  0.125   
1  cumings, mrs. john bradley (florence briggs th...  female  0.472229  0.125   
2                             heikkinen, miss. laina  female  0.321438  0.000   
3       futrelle, mrs. jacques heath (lily may peel)  female  0.434531  0.125   
4                           allen, mr. william henry    male  0.434531  0.000   

   Parch            Ticket      Fare Cabin Embarked  
0    0.0         a/5 21171  0.014151   NaN        s  
1    0.0          pc 17599  0.139136   c85        c  
2    0.0  ston/o2. 3101282  0.015469   NaN        s  
3    0.0            11