# Step-1 Import Necessary Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Step-2 Read and understand the dataset

In [None]:
# Load the Life Expectancy dataset into a DataFrame.
# The path uses forward slashes throughout: the previous mixed form contained
# a backslash followed by "D", which is an invalid string escape sequence
# (a SyntaxWarning on recent Python versions). Windows accepts forward slashes,
# so the same file is addressed.
df = pd.read_csv("C:/Users/Priyanka/Desktop/DS/Project_5_Data_Cleaning-2/Dataset/Life Expectancy Data.csv")

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Preview the last rows of the loaded frame.
df.tail()

# Step-3 Sanity Checks

In [None]:
# Shape of the dataset as a (rows, columns) tuple.
df.shape

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# Percentage of missing values per column:
# per-column missing count divided by the number of rows, times 100.
missing_counts = df.isnull().sum()
missing_counts / df.shape[0] * 100

In [None]:
# Count the duplicated rows in the frame.
df.duplicated().sum()

In [None]:
# Look for garbage values by printing the value counts of each column.
# Only object-dtype columns are inspected; the author's reasoning is that
# garbage in other dtypes would already show up as N/A or null.
object_columns = df.select_dtypes(include='object').columns
separator = "***" * 15
for column_name in object_columns:
    print("Column", column_name)
    print(df[column_name].value_counts())
    print(separator)

# If a garbage value were present, it would show up as an entry like -> " "   7

# Step-4 EDA

In [None]:
# Descriptive statistics.
df.describe()  # Summary statistics about the data.

In [None]:
# For object-dtype columns we can also use:
df.describe(include='object')
# This yields a different set of statistics from the call above, specific to object columns.

In [None]:
# Histograms to understand the distribution of every numeric column.
sns.set_theme(style="darkgrid", palette="pastel")
numeric_columns = df.select_dtypes(include='number').columns
for column_name in numeric_columns:
    sns.histplot(data=df, x=column_name)
    plt.show()

# Many of the histograms are positively skewed, i.e. those columns are not normally distributed.

In [None]:
# Box plots to identify outliers in every numeric column.
sns.set_theme(style="whitegrid", palette="muted")
numeric_columns = df.select_dtypes(include='number').columns
for column_name in numeric_columns:
    sns.boxplot(data=df, x=column_name)
    plt.show()

# The plots show how many columns have major outliers.

In [None]:
# Scatter plots for understanding relationships.
# List the numeric columns first, so their relation to the life expectancy column can be plotted.
df.select_dtypes("number").columns

In [None]:
# Feature columns plotted against the life expectancy column.
# Names are kept exactly as they appear in the dataset, including the
# leading/trailing spaces some of them carry.
cols = [
    'Year',
    'Adult Mortality',
    'infant deaths',
    'Alcohol',
    'percentage expenditure',
    'Hepatitis B',
    'Measles ',
    ' BMI ',
    'under-five deaths ',
    'Polio',
    'Total expenditure',
    'Diphtheria ',
    ' HIV/AIDS',
    'GDP',
    'Population',
    ' thinness  1-19 years',
    ' thinness 5-9 years',
    'Income composition of resources',
    'Schooling',
]

In [None]:
# One scatter plot per feature, each against the life expectancy column.
sns.set_theme(style="darkgrid", palette="muted")
target_column = 'Life expectancy '
for feature in cols:
    sns.scatterplot(data=df, x=feature, y=target_column)
    plt.show()

In [None]:
# Correlation heatmap to interpret pairwise relations and multicollinearity
# among the numeric columns.
numeric_frame = df.select_dtypes(include='number')
s = numeric_frame.corr()
plt.figure(figsize=(15, 15))
sns.heatmap(s, annot=True)

# Step-5 Missing Values Treatment.

In [None]:
# Choose the method used to impute the missing values,
# i.e. which of mean, median, mode or KNNImputer to use for filling them.
df.isnull().sum()

In [None]:
# The target variable is the life expectancy column, so it is deliberately
# left untouched here: its missing values are not filled with mean, median or mode.
#
# Fill a couple of feature columns with their median.
# Series.fillna returns a new Series rather than modifying the column, so the
# result has to be assigned back; otherwise the fill is silently discarded.
for col in [" BMI ", "Polio"]:
    df[col] = df[col].fillna(df[col].median())

In [None]:
from sklearn.impute import KNNImputer

def knn_impute(df, cols, n_neighbors=5):
    """Fill missing values in ``cols`` of ``df`` using ``KNNImputer``.

    The imputer is fitted on ``df[cols]`` only, and its output is written back
    into those same columns. The frame passed in is modified and returned.

    Args:
        df: Frame holding the columns to impute.
        cols: Column labels that are imputed together as one group.
        n_neighbors: Forwarded to ``KNNImputer(n_neighbors=...)``.

    Returns:
        The same ``df`` object, with ``cols`` replaced by the imputed values.
    """
    imputed_values = KNNImputer(n_neighbors=n_neighbors).fit_transform(df[cols])
    df[cols] = imputed_values
    return df

In [None]:
# Columns are imputed in thematic groups, each group on its own.

# Health & Life indicators
health_cols = ['Adult Mortality', ' BMI ',
               'Hepatitis B', 'Polio', 'Diphtheria ',
               'under-five deaths ', 'infant deaths', ' HIV/AIDS']

# Economy & Living standards
economic_cols = ['GDP', 'Income composition of resources', 'Schooling',
                 'percentage expenditure', 'Total expenditure',
                 'Alcohol', 'Population']

# Nutrition
nutrition_cols = [' thinness  1-19 years', ' thinness 5-9 years']

# NOTE(review): the columns are passed to the imputer unscaled, so features
# with large magnitudes may dominate the neighbour search - confirm this is intended.
for column_group in (health_cols, economic_cols, nutrition_cols):
    df = knn_impute(df, column_group)

In [None]:
# Re-check the per-column missing counts after the imputation steps.
df.isnull().sum()

# Step-6 Outlier Treatments.

### **What are Outliers?**  
Outliers are **values that are very different** (too high or too low) compared to most data points.  
They can **distort averages and model performance**.  

**Example:**  
`[10, 12, 11, 13, 95]` → here **95** is an outlier.  

---

### **What are Whiskers?**  
In a **boxplot**, whiskers are the **lines that extend from the box** up to:
- **Lower whisker = Q1 - 1.5 × IQR**  
- **Upper whisker = Q3 + 1.5 × IQR**  

Here **IQR = Q3 - Q1** (the interquartile range).  
Values outside whiskers = **outliers**  

---

### **Should You Remove Outliers?**  
- ✅ Remove if they are **errors or irrelevant** (e.g., height = 900 cm).  
- ❌ Keep if they are **genuine** and meaningful (e.g., very rich person in income data).  

**Tip:**  
Check **domain knowledge** before removing!  


In [None]:
#Decide whether to do the outlier treatment or not!
#Outlier treatment is only done on continuous variables.
#It is not done on the target variable, discrete variables or categorical variables.


In [None]:
# Steps for outlier treatment (capping):
#
# 1. Compute the lower whisker and the upper whisker of the column.
# 2. Points below the lower whisker are replaced by the lower whisker.
# 3. Points above the upper whisker are replaced by the upper whisker.

def wisker(col):
    """Return the ``(lower, upper)`` boxplot whisker bounds of ``col``.

    The bounds are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, where Q1 and Q3
    are the 25th and 75th percentiles and ``IQR = Q3 - Q1``.

    Missing values are ignored when computing the percentiles, so a column
    that still contains NaN yields usable bounds instead of ``(nan, nan)``.
    For NaN-free input the result is unchanged.

    Args:
        col: One-dimensional numeric data (e.g. a pandas Series or a list).

    Returns:
        Tuple ``(lw, uw)`` with the lower and upper whisker.
    """
    q1, q3 = np.nanpercentile(col, [25, 75])
    iqr = q3 - q1
    lw = q1 - 1.5 * iqr
    uw = q3 + 1.5 * iqr
    return lw, uw

In [None]:
# List the column labels (several are used with leading/trailing spaces in the cells around this one).
df.columns

In [None]:
wisker(df['GDP'])  # Sanity check: (lower, upper) whisker bounds for GDP.

In [None]:
# GDP serves as the before/after example: this is the "before" box plot.
sns.boxplot(data=df, x='GDP')

In [None]:
# Cap the selected columns at their whiskers: values below the lower whisker
# become the lower whisker, values above the upper whisker become the upper whisker.
capped_columns = ['GDP', 'Total expenditure', ' thinness  1-19 years', ' thinness 5-9 years']
for column_name in capped_columns:
    lower, upper = wisker(df[column_name])
    values = df[column_name]
    df[column_name] = np.where(
        values < lower,
        lower,
        np.where(values > upper, upper, values),
    )

In [None]:
# "After" box plots for the columns that were capped.
capped_columns = ['GDP', 'Total expenditure', ' thinness  1-19 years', ' thinness 5-9 years']
for column_name in capped_columns:
    sns.boxplot(data=df, x=column_name)
    plt.show()

# Step-7 Duplicate and Garbage value treatment.

In [None]:
# Clean the duplicates: start by counting the duplicated rows.
df.duplicated().sum()

In [None]:
#Since there are no duplicates, nothing needs to be removed. To remove duplicates, run df = df.drop_duplicates() (it returns a new DataFrame, so assign the result back).

In [None]:
#Clean up the garbage values.
#There are no garbage values, as we saw above.
#If there had been any, we would have replaced them with the median.



# Step-8 Encoding of Data

In [None]:
# A model needs numerical input, so every categorical column has to be
# converted to numbers first. That conversion is called encoding.
#
# Two common methods are dummy (one-hot) encoding and label encoding;
# dummy encoding is used here, dropping the first level of each column.
categorical_columns = ["Country", "Status"]
dummy = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

In [None]:
# Display the encoded frame.
dummy

In [None]:
# The encoded frame is what gets fed to the model; inspect its dtypes and non-null counts.
# NOTE(review): the target column is intentionally not imputed above, and no cell in view
# drops rows with a missing target - confirm those rows are handled before training.
dummy.info()

In [None]:
# Write the encoded frame to cleaned_data.csv without the index column.
dummy.to_csv("cleaned_data.csv", index=False)