# Exploratory Data Analysis
This notebook shows how to perform exploratory data analysis (EDA) using Python.

## Three important steps to keep in mind are:
1. Understand the data
2. Clean the data
3. Find relationships within the data



# Understand the data

In [None]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# load the "titanic" dataset through seaborn's load_dataset helper
kashti = sns.load_dataset("titanic")
# work on a copy so the originally loaded frame `kashti` stays untouched
ks = kashti.copy()

In [None]:
# display the first few rows of the dataset
ks.head()

In [None]:
# display the shape of the dataset as (rows, columns)
ks.shape

In [None]:
# display the data type and non-null count of each column
ks.info()

In [None]:
# display summary statistics of the dataset (numeric columns by default)
ks.describe()

In [None]:
# display the NUMBER of unique values in each column
# (nunique() returns counts, not the values themselves)
ks.nunique()

In [None]:
# column names
ks.columns

In [None]:
# the distinct values that occur in the 'sex' column
ks['sex'].unique()

In [None]:
# print every column name followed by the distinct values found in it
for col, values in ks.items():
    distinct = values.unique()
    print(f"{col}: {distinct}")

# Cleaning and Filtering the data


In [None]:
# count the missing (null) values in each column of the dataset
ks.isnull().sum()


In [None]:
# cleaning step: drop the 'deck' column (it contains too many missing values)
# and store the result in a new variable, so `ks` itself is left unchanged
ks_clean = ks.drop(columns=['deck'])
# equivalent spelling: ks_clean = ks.drop(['deck'], axis=1)
ks_clean.head()

In [None]:
# remove every row that still has a null (missing) value in any column
ks_clean = ks_clean.dropna()

In [None]:
# display the shape of the cleaned dataset
ks_clean.shape

In [None]:
# confirm that no missing values remain after cleaning
ks_clean.isnull().sum()

In [None]:
# value counts of 'sex' in the cleaned dataset
# NOTE(review): this comment previously said "survived" while the code counts
# 'sex' — confirm which column was intended
ks_clean['sex'].value_counts()

In [None]:
# summary statistics before (ks) and after (ks_clean) cleaning, for comparison
for frame in (ks, ks_clean):
    print(frame.describe())

In [None]:
# cast the 'age' column to float64
# NOTE(review): this comment previously said "integer", but the cast below is
# to float64 — confirm which dtype is actually wanted
ks_clean['age'] = ks_clean['age'].astype('float64')

# display the data types of each column after conversion
ks_clean.info()

In [None]:
# per-sex summary of age: mean, standard deviation, min, max and row count
age_stats = ['mean', 'std', 'min', 'max', 'count']
age_by_sex = ks_clean.groupby('sex')['age']
print(age_by_sex.agg(age_stats))



In [None]:
# set pandas options to display all rows; this has to happen BEFORE the print
# below, otherwise the long listing is truncated the first time the cell runs
pd.set_option('display.max_rows', None)

# for each sex, count how often every age value occurs; sort the result by
# sex and then by age (both index levels in ascending order)
print(ks_clean.groupby('sex')['age'].value_counts().sort_index(level=[0, 1], ascending=[True, True]))

### Group by sex and count unique values of age in age ranges e.g; from 1-5, 6-10, etc.
# # Create age bins and count the number of values in each bin grouped by sex
# age_bins = pd.cut(ks_clean['age'], bins=np.arange(0, ks_clean['age'].max() + 5, 5))
# # print(age_bins)
# print(ks_clean.groupby(['sex', age_bins])['age'].count())


In [None]:
# strip plot (jittered scatter) of age for each sex, coloured by sex
fig, ax = plt.subplots(figsize=(8, 5))
sns.stripplot(data=ks_clean, x='sex', y='age', hue='sex', palette='Set2', jitter=True, ax=ax)
ax.set_title('Scatter Plot of Age by Sex')
ax.set_xlabel('Sex')
ax.set_ylabel('Age')
plt.show()

In [None]:
# box plot of age for each sex
sns.boxplot(x='sex', y='age', data=ks_clean)

In [None]:
# box plot of age over the whole cleaned dataset
sns.boxplot(y='age', data=ks_clean)

In [None]:
# draw a distribution plot (histogram with a KDE curve) of age to check
# whether the data look normally distributed, i.e. roughly follow a bell curve
sns.displot(ks_clean['age'], kde=True)

In [None]:
# drop age outliers: keep only rows whose age is below 68
# (68 is the assumed outlier threshold for age)
age_in_range = ks_clean['age'].lt(68)
ks_clean = ks_clean[age_in_range]

# shape of the cleaned dataset once the age outliers are gone
ks_clean.shape

In [None]:
# calculate the mean age after removing outliers
ks_clean['age'].mean()

In [None]:
# plot the distribution of age (histogram with KDE curve) after removing outliers
sns.displot(ks_clean['age'], kde=True)

In [None]:
# draw a box plot of the columns to check for outliers
# (DataFrame.boxplot draws the numeric columns)
ks_clean.boxplot()

In [None]:
# drop fare outliers: keep only rows whose fare is below 300
# (300 is the assumed outlier threshold for fare)
fare_in_range = ks_clean['fare'].lt(300)
ks_clean = ks_clean[fare_in_range]

# re-draw the box plot of all columns to check outliers after removing fare outliers
ks_clean.boxplot()

In [None]:
# plot the distribution of fare after removing outliers
sns.displot(ks_clean['fare'], kde=True)
# From the graph we can see that fare is right-skewed rather than normally
# distributed, so a log transformation can be used to reduce the skew.


In [None]:
# log transformation of fare: log1p(x) computes log(1 + x), so a fare of 0
# maps to 0 instead of hitting log(0), which is undefined.
# assign() returns a new DataFrame; this avoids the SettingWithCopyWarning that
# pandas can emit when a column is set in place on a frame produced by row
# filtering.
ks_clean = ks_clean.assign(fare_log=np.log1p(ks_clean['fare']))

# box plot of fare_log for each sex, to check outliers after the transformation
sns.boxplot(x='sex', y='fare_log', hue='sex', data=ks_clean)
plt.title('Boxplot of fare_log')

# box plot of the untransformed fare for each sex, for comparison
# (catplot is figure-level, so it opens its own figure)
sns.catplot(x='sex', y='fare', hue='sex', data=ks_clean, kind='box')
plt.title('Boxplot of fare')

In [None]:
# plot histograms of the columns (DataFrame.hist draws the numeric columns)
ks_clean.hist() 

In [None]:
# bar chart of the number of rows for each value of 'survived'
ks_clean['survived'].value_counts().plot(kind='bar', title='Survival Count')
# pd.Series(ks_clean['survived']).value_counts().plot(kind='bar', title='Survival Count') # same as above

In [None]:
# compare the per-(sex, class) means of the numeric columns:
# original dataset first, then the cleaned dataset
group_keys = ['sex', 'class']
for frame in (ks, ks_clean):
    print(frame.groupby(group_keys, observed=True).mean(numeric_only=True))

# Relationships in the data

In [None]:
# correlation matrix of the numeric columns of the cleaned dataset
corr_ks_clean = ks_clean.corr(numeric_only=True)
corr_ks_clean

# A correlation of 1 means two columns are perfectly positively correlated: when one increases, the other increases too.
# A correlation of -1 means two columns are perfectly negatively correlated: when one increases, the other decreases.
# A correlation of 0 means there is no linear relationship between the two columns.

In [None]:
# heatmap of the correlation matrix, with each coefficient annotated in its cell
sns.heatmap(corr_ks_clean, annot=True)

In [None]:
# scatter plot of fare against age, coloured by sex, to see how the two are related
sns.relplot(x='age', y='fare', data=ks_clean, hue='sex', kind='scatter', height=6, aspect=1.5)