# **Data Handling**

## **1️⃣ Importing Necessary Libraries**


In [1]:
import pandas as pd  
import numpy as np  
import seaborn as sns  

## **2️⃣ Loading the Titanic Dataset**

In [2]:
# Load the Titanic passenger table that ships with seaborn's example datasets.
TITANIC_DATASET = 'titanic'
df = sns.load_dataset(TITANIC_DATASET)

## **3️⃣ Basic Data Exploration**

In [5]:
# Preview the first five rows to see the columns and some sample values.
print(df.head())
print("\n")

# Column names, non-null counts and dtypes. DataFrame.info() writes its report
# straight to stdout and returns None, so it is called bare: wrapping it in
# print() would append a stray "None" line after the report.
df.info()
print("\n")

# Summary statistics for the numeric columns.
print(df.describe())
print("\n")

   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-

## **4️⃣ Handling Missing Data**

In [None]:
# Count the missing values in every column before any cleaning.
print(df.isnull().sum())

# Impute missing ages with the median age. The result is assigned back to the
# column: df['age'].fillna(..., inplace=True) is chained assignment on a
# column selection, which pandas deprecates and which does not update df when
# Copy-on-Write is enabled.
df['age'] = df['age'].fillna(df['age'].median())

# Drop the rows whose embarkation town is missing. Rebinding df (rather than
# inplace=True) keeps the step explicit about which frame later cells see.
df = df.dropna(subset=['embark_town'])

## **5️⃣ Handling Duplicates**

In [7]:
# Report how many rows repeat an earlier row across all columns.
duplicate_count = df.duplicated().sum()
print(duplicate_count)

# Keep the first occurrence of each row and discard the repeats.
df = df.drop_duplicates(keep='first')

110


## **6️⃣ Selecting Specific Columns & Rows**

In [14]:
# One column, returned as a Series.
age_column = df.loc[:, 'age']
print(age_column.head(), "\n")

# Several columns, returned as a DataFrame.
wanted_columns = ['sex', 'age', 'fare']
selected_columns = df.loc[:, wanted_columns]
print(selected_columns.head(), "\n")

# Only the passengers strictly older than 30.
older_than_30 = df['age'].gt(30)
filtered_rows = df.loc[older_than_30]
print(filtered_rows.head())

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
Name: age, dtype: float64 

      sex   age     fare
0    male  22.0   7.2500
1  female  38.0  71.2833
2  female  26.0   7.9250
3  female  35.0  53.1000
4    male  35.0   8.0500 

    survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
1          1       1  female  38.0      1      0  71.2833        C  First   
3          1       1  female  35.0      1      0  53.1000        S  First   
4          0       3    male  35.0      0      0   8.0500        S  Third   
6          0       1    male  54.0      0      0  51.8625        S  First   
11         1       1  female  58.0      0      0  26.5500        S  First   

      who  adult_male deck  embark_town alive  alone  
1   woman       False    C    Cherbourg   yes  False  
3   woman       False    C  Southampton   yes  False  
4     man        True  NaN  Southampton    no   True  
6     man        True    E  Southampton    no   True  
11  woman       False    C  Sout

## **7️⃣ Data Type Conversions**

In [15]:
# Encode the categorical 'sex' column as integers: male -> 0, female -> 1.
SEX_CODES = {'male': 0, 'female': 1}

# Only map while the column still holds the text labels. Series.map() turns
# every value that is not a key of the mapping into NaN, so re-running this
# cell on the already-encoded 0/1 column would wipe it out.
if not pd.api.types.is_numeric_dtype(df['sex']):
    df['sex'] = df['sex'].map(SEX_CODES)


## **8️⃣ Sorting & Indexing**

In [None]:
# Sort passengers from oldest to youngest into a separate frame; df itself
# keeps its current row order.
df_sorted = df.sort_values(by='age', ascending=False)

# Move 'embarked' from a regular column into the index. The column check and
# the rebind (instead of set_index(..., inplace=True)) keep this cell
# re-runnable: once 'embarked' is the index it is no longer a column, and a
# second set_index('embarked') would raise KeyError.
if 'embarked' in df.columns:
    df = df.set_index('embarked')

print(df.head())

          survived  pclass  sex   age  sibsp  parch     fare  class    who  \
embarked                                                                     
S                0       3    0  22.0      1      0   7.2500  Third    man   
C                1       1    1  38.0      1      0  71.2833  First  woman   
S                1       3    1  26.0      0      0   7.9250  Third  woman   
S                1       1    1  35.0      1      0  53.1000  First  woman   
S                0       3    0  35.0      0      0   8.0500  Third    man   

          adult_male deck  embark_town alive  alone  
embarked                                             
S               True  NaN  Southampton    no  False  
C              False    C    Cherbourg   yes  False  
S              False  NaN  Southampton   yes   True  
S              False    C  Southampton   yes  False  
S               True  NaN  Southampton    no   True  


## **9️⃣ Merging & Concatenation**


In [24]:
# Small lookup table keyed by 'id'; the ids line up with the 'pclass' values.
extra_data = pd.DataFrame({'id': [1, 2, 3], 'extra_info': ['A', 'B', 'C']})

# A column-on-column merge ignores the left frame's index and returns a fresh
# integer index. If df carries a named index (e.g. 'embarked'), turn it back
# into a column first so those values are not silently lost in the result.
left_frame = df.reset_index() if df.index.name is not None else df

# Left join: keep every passenger row and attach the matching extra_info.
# validate='many_to_one' makes pandas raise if 'id' is not unique in
# extra_data, which would otherwise duplicate passenger rows.
df_merged = left_frame.merge(
    extra_data,
    left_on='pclass',
    right_on='id',
    how='left',
    validate='many_to_one',
)
 

## **🔚 Conclusion**

In [25]:
# Show the first five rows of the cleaned frame as it stands at the end.
final_preview = df.head(n=5)
print(final_preview)

          survived  pclass  sex   age  sibsp  parch     fare  class    who  \
embarked                                                                     
S                0       3    0  22.0      1      0   7.2500  Third    man   
C                1       1    1  38.0      1      0  71.2833  First  woman   
S                1       3    1  26.0      0      0   7.9250  Third  woman   
S                1       1    1  35.0      1      0  53.1000  First  woman   
S                0       3    0  35.0      0      0   8.0500  Third    man   

          adult_male deck  embark_town alive  alone  
embarked                                             
S               True  NaN  Southampton    no  False  
C              False    C    Cherbourg   yes  False  
S              False  NaN  Southampton   yes   True  
S              False    C  Southampton   yes  False  
S               True  NaN  Southampton    no   True  
