# EDA template

## Packages import

In [None]:
# standard library (extras if you want to concatenate multiple files into a df)
import glob
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

## Defining useful functions

In [None]:
def null_vals(dataframe):
    """Summarise missing values per column.

    Returns a DataFrame indexed by the column names of ``dataframe`` with two
    columns: ``null`` (number of missing values) and ``percent`` (that number
    as a percentage of the total row count, rounded to 3 decimals). Rows are
    sorted by ``percent`` from highest to lowest.
    """
    row_count = len(dataframe)
    summary = dataframe.isnull().sum().to_frame(name='null')
    summary['percent'] = (summary['null'] / row_count * 100).round(3)
    return summary.sort_values('percent', ascending=False)

In [None]:
# remove price outliers in every year
def drop_outliers(df, column, group_col='age', factor=1.5):
    """Drop rows holding IQR outliers, with bounds computed per group.

    For each distinct value of ``df[group_col]`` and each column in ``column``,
    the 25th and 75th percentiles of that group's values are computed. A value
    outside ``[q25 - factor * IQR, q75 + factor * IQR]`` marks its row for
    removal. Rows with a missing value in any of the listed columns are
    removed as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is not modified.
    column : str or iterable of str
        Column name(s) to check for outliers.
    group_col : str, default 'age'
        Column whose distinct values define the groups.
    factor : float, default 1.5
        Multiple of the inter-quartile range allowed beyond the quartiles.

    Returns
    -------
    pandas.DataFrame
        The remaining rows in their original order with a fresh 0..n-1 index.
    """
    # A bare string would otherwise be iterated character by character.
    columns = [column] if isinstance(column, str) else list(column)

    # Rows to drop are collected in a positional mask instead of writing NaN
    # into df: the caller's frame stays untouched and integer columns are not
    # upcast to float. Rows already missing a value start out marked.
    drop_mask = df[columns].isna().any(axis=1).to_numpy(copy=True)

    groups = df[group_col]
    # dropna(): a NaN key never compares equal to itself, so its selection
    # would be empty and np.percentile would have nothing to work on.
    for key in groups.dropna().unique():
        in_group = (groups == key).to_numpy()
        for col in columns:
            # NOTE: if the group contains NaN in this column the percentiles
            # are NaN, every comparison below is False and nothing is flagged.
            q75, q25 = np.percentile(df.loc[in_group, col], [75, 25])
            spread = factor * (q75 - q25)
            upper = q75 + spread
            lower = q25 - spread

            is_outlier = ((df[col] < lower) | (df[col] > upper)).to_numpy()
            drop_mask |= in_group & is_outlier

    return df.loc[~drop_mask].reset_index(drop=True)

## Data import

In [None]:
# Load in the data
# NOTE: '*.csv' is a placeholder; replace it with the path of the file to load.
df = pd.read_csv('*.csv')

# If the size of the file is very big you can use this instead
chunk = pd.read_csv('*.csv',chunksize=1000000)
df = pd.concat(chunk) # concatenate chunks of file

# if we want to make a dataframe from multiple files
# Import all csv files in bulk (searches directory_path recursively)
directory_path = '.'

# sorted(): rglob does not guarantee an order, so sort the paths to make the
# row order of the concatenated dataframe reproducible between runs.
filenames = sorted(str(path.absolute()) for path in Path(directory_path).rglob('*.csv'))

# pass encoding='mbcs' to read_csv here if the files need that encoding
dfs = [pd.read_csv(filename) for filename in filenames]

# Create the dataframe from the list dfs
df = pd.concat(dfs, ignore_index=True)

## Data inspection

In [None]:
# Preview the first 5 rows (head() defaults to 5)
df.head()

In [None]:
# Preview the last 5 rows (tail() defaults to 5)
df.tail()

In [None]:
# Number of rows and columns as a (rows, columns) tuple
# NOTE(review): this bare expression is not the last statement of the cell,
# so a notebook will not display it; the print below reports the same values.
df.shape

print(f'The dataframe has {df.shape[0]} rows and {df.shape[1]} columns.')

In [None]:
# Data type of each column
df.dtypes

In [None]:
# Null count and null percentage per column, highest percentage first
null_vals(df)

In [None]:
# Frequency of each value in one column; replace 'column' with a real column name.
# dropna=False keeps missing values as their own entry in the counts.
df['column'].value_counts(dropna = False)

## Data Cleaning

In [None]:
# normalise the column names in one pass: lower case, spaces replaced by underscores
df.rename(columns=lambda label: str.lower(label).replace(' ', '_'), inplace=True)

In [None]:
# Fill missing values with 0; replace 'column' with a real column name.
# The result is assigned back instead of calling fillna(inplace=True) on
# df['column']: the in-place call acts on an intermediate Series, which pandas
# warns about and which leaves df unchanged under copy-on-write.
df['column'] = df['column'].fillna(0)

In [None]:
# Replace missing ages with the mean age of the row's group.
# Replace 'column' with the column to group by. The result is written to the
# age column itself (not the grouping column), and the name is lower case
# because the column names were lower-cased in the renaming cell.
df['age'] = df.groupby('column')['age'].transform(lambda ages: ages.fillna(ages.mean()))

## Data Summary

In [None]:
# Descriptive statistics of the dataframe's columns
df.describe()

In [None]:
# Concise overview: column names, non-null counts, dtypes and memory usage.
# (pandas DataFrames have no summary() method; info() is the equivalent.)
df.info()

## Detecting outliers

In [None]:
# Box plot of the dataframe to eyeball outliers
sns.boxplot(data=df)

In [None]:
# Drop the rows flagged as outliers in the listed columns;
# replace 'column1' / 'column2' with real column names.
df= drop_outliers(df, ['column1', 'column2'])

## Data correlations and Heatmap

In [None]:
# correlations between numerical features
# Non-numeric columns are excluded explicitly: since pandas 2.0, corr() no
# longer drops them silently and raises if it meets one.
df.select_dtypes(include='number').corr()

In [None]:
# visualising correlations as heatmap
plt.figure(figsize = (10, 10))
# Only numeric columns are correlated: since pandas 2.0, corr() raises on
# non-numeric columns instead of dropping them silently.
sns.heatmap(df.select_dtypes(include='number').corr(), annot = True)

plt.show()

In [None]:
# visualising pairwise relationships between columns as a scatter matrix
pd.plotting.scatter_matrix(df, figsize=(20,20))
plt.show()

In [None]:
# Pairwise plots for a chosen subset of columns;
# replace 'column1' / 'column2' with real column names.
sns.pairplot(df[['column1', 'column2']], height = 6)
plt.show()