# World Population (EDA)

## Steps to EDA

1. importing libraries

2. Loading Data (pd.read_csv("file.csv"))

3. data understanding/knowing

    3.1. df.shape

    3.2. df.head(10), df.tail(), df.sample()

    3.3. df.info() --> dtype, null

    3.4. df.describe() --> statistical overview

    3.5. numeric/cat_col = df.select_dtypes(exclude/include="number"/"object")

    3.6. feature relations : Correlation, Mutual information, statistical tests

4. Remove Duplicate

5. Null Handling

6. Skewness handling

7. Outlier Handling

8. Filter data for advanced analysis

9. Scale the numeric data

10. Encode categorical data

11. Feature engineering

    11.1. Dimensionality reduction (e.g. PCA, t-SNE, UMAP)

    11.2. Manual featuring

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the world-population dataset directly from its public GitHub location.
DATA_URL = "https://raw.githubusercontent.com/AlexTheAnalyst/PandasYouTubeSeries/main/world_population.csv"
df = pd.read_csv(DATA_URL)

In [None]:
# (number of rows, number of columns) of the loaded DataFrame
df.shape

In [None]:
# Check the first 10 rows of the data.
# Taking the time to read through them carefully can already be insightful.

df.head(10)

In [None]:
# Column dtypes and null information
df.info()

In [None]:
df.describe().T # transposed, so each described column is shown as a row
# df.describe(include="all").T ---> df.describe() automatically excludes non-numerical features

In [None]:
# Summary of the object-dtype (non-numeric) columns only
df.describe(include=object).T

In [None]:
# Summary of every column, numeric and non-numeric together
df.describe(include="all").T

In [None]:
# Keep a handle on the full set of column labels
all_cols = df.columns

In [None]:
# Labels of the numeric columns, reused by the later cells to subset df.
numeric_frame = df.select_dtypes(include="number")
numeric_cols = numeric_frame.columns
numeric_cols

In [None]:
# Labels of the object-dtype (categorical/text) columns, reused by the later cells.
categorical_frame = df.select_dtypes(include="object")
cat_cols = categorical_frame.columns
cat_cols

In [None]:
# Pairwise correlation between the numeric columns.
# NOTE: a correlation matrix is symmetric, so the trailing .T leaves the values unchanged.
df[numeric_cols].corr().T

In [None]:
# Draw the numeric-column correlations as an annotated heatmap.
plt.figure(figsize=(10, 8))
correlations = df[numeric_cols].corr().T
sns.heatmap(data=correlations, annot=True)
plt.show()

In [None]:
# Check for duplicates: count the rows flagged as duplicated
df.duplicated().sum()

In [None]:
# Remove duplicates (uncomment the next line to apply it)
# df.drop_duplicates(inplace=True)

In [None]:
# Check nulls

# null --> nothing / missing / refers to nowhere
# NaN  --> Not a Number

# Number of missing values in each column
df.isnull().sum()
# df.isna().sum()

In [None]:
# Total number of missing cells across the whole DataFrame:
# count the nulls per column first, then add those counts up.
nulls_per_column = df.isnull().sum()
nulls_per_column.sum()

In [None]:
# Null handling

# Removal: neither result is assigned here, so these lines only demonstrate the calls
df.dropna() # drop rows
df.dropna(axis=1) # drop columns --> not recommended

In [None]:
# Impute null values

# numerical:
# mean (highly affected by outliers) --> df["col"].fillna(df["col"].mean())
# median (more robust option)        --> df["col"].fillna(df["col"].median())

# categorical --> mode

# Fill with a constant (the result is not assigned; demonstration only):
df.fillna(0)

# .median() has to be *called*: it yields one median per column, and fillna
# matches those values to the columns by label. Passing the bare method
# (df[...].median) would hand fillna a function object instead of numbers.
df5 = df[numeric_cols].fillna(df[numeric_cols].median())

# .mode() returns a DataFrame (one row per tied mode). Passing that DataFrame
# to fillna aligns on the row index, so only the first row(s) would be filled.
# Taking the first row gives a Series with one mode per column, which fillna
# then applies column by column.
df6 = df[cat_cols].fillna(df[cat_cols].mode().iloc[0])

In [None]:
# NOTE(review): `numeric` is not used in any visible cell; this import of a
# pandas-internal module looks like an accidental editor auto-import — confirm and remove.
from pandas.core.arrays import numeric

# Better option for numeric columns ----- "K nearest neighbors" (KNN imputer)
from sklearn.impute import KNNImputer
KNN_imputer = KNNImputer(n_neighbors=5)
df7 = KNN_imputer.fit_transform(df[numeric_cols]) # the imputer returns a NumPy array, so it has to be converted back to a DataFrame
pd.DataFrame(df7, columns=numeric_cols)

In [None]:
# Recombine the categorical columns with the KNN-imputed numeric values.
# df7 is a bare array, so the column names and the row index are restored
# explicitly; otherwise the numeric columns would be labelled 0..n-1 and
# concat could misalign rows whenever df does not carry a default index.
df7_frame = pd.DataFrame(df7, columns=numeric_cols, index=df.index)
df8 = pd.concat((df[cat_cols], df7_frame), axis=1)
df8.isnull()

In [None]:
# Skewness analysis: look at the distribution of every numeric column.
# sns.distplot is deprecated and is meant for a single univariate sample, so it
# is not suited to a multi-column DataFrame. Each column gets its own
# histogram with a KDE overlay instead.
for col in numeric_cols:
    plt.figure(figsize=(6, 4))
    sns.histplot(df[col], kde=True)
    plt.title(col)
    plt.show()

In [None]:
# Skewness of a single column; a positive value indicates a longer right tail
df['1970 Population'].skew()

In [None]:
# The 10 rows with the largest 1970 population (sorted in descending order)
df.sort_values(by='1970 Population', ascending=False).head(10)