# **The Data Cleaning Process**

## Import Libraries

In [3]:
import pandas as pd
import numpy as np

In [5]:
# Load the Titanic CSV from GitHub into a pandas DataFrame.
# Note: `df` is just the variable name that holds the loaded table.
titanic_url = (
    "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
)
df = pd.read_csv(titanic_url)
df.head()

Unnamed: 0,Location,Maker,Model,Year,Colour,Amount (Million ₦),Type,Distance_Km
0,Abuja,Mercedes-Benz,GLA 250,2015.0,Brown,14.5,Foreign Used,50000.0
1,Abuja,Hyundai,Accent,2013.0,Red,1.55,Nigerian Used,
2,Lagos,Lexus,GX 460 Premium,2011.0,White,14.0,Foreign Used,85000.0
3,Lagos,Lexus,ES 350,2011.0,Gray,4.95,Foreign Used,
4,Ibadan,Toyota,Verso 1.6,2009.0,Silver,1.69,Nigerian Used,118906.0


In [None]:
# Print a summary of the DataFrame: each column's name,
# non-null count and data type
df.info()

## Remove irrelevant data


In [None]:
# List the column names so we can decide which
# features are irrelevant and can be dropped
df.columns

In [None]:
# The PassengerId feature is just a unique ID for
# each passenger, so it can be removed

df = df.drop(columns="PassengerId")
df.head()

In [None]:
# The Ticket feature can be removed as well
df = df.drop(columns="Ticket")
df.head()

In [None]:
# Confirm that all the required
# columns have been removed
df.columns

## Deduplicate your data

In [None]:
# Check the size of the dataset (rows, columns) so we can
# keep track of any rows (data points) that get removed
df.shape

In [None]:
# Use the "Name" column to de-duplicate the dataset: if several
# rows share the same Name, keep one of them and remove the rest.

# Sort by Name. A stable sort (mergesort) keeps rows with equal names
# in their original relative order, so "first" below is well defined.
df = df.sort_values("Name", kind="mergesort")

# Keep the first row for each Name and drop only the later repeats.
# (keep=False would delete *every* row of a duplicated name, losing
# that passenger entirely instead of de-duplicating.)
df = df.drop_duplicates(subset="Name", keep="first")

In [None]:
# Check the size again to see
# whether any rows were removed
df.shape

In [None]:
# Since the number of rows in the dataset remained the same,
# the dataset does not contain any duplicates

## Fix structural errors

In [None]:
# The main aim of this section is to spot classes in our categorical
# features that were not properly named (so they can be renamed),
# or columns whose data type needs to be changed.
#
# A list (not a set) is used so the columns are always printed in the
# same order, and every line is labelled with the column it belongs to.

cat_features = [
    "Sex",
    "Embarked",
    "Pclass",
]

for cat_feature in cat_features:
    print(f"{cat_feature}: {df[cat_feature].unique()}")

## Deal with missing data

In [None]:
# First, identify the columns with missing
# values and count how many each one has

df.isna().sum()

In [None]:
# Age, Cabin and Embarked all have missing values.
# Cabin is missing for the large majority of its rows
# (check the exact share in the counts above), so we drop it

df = df.drop(columns="Cabin")

In [None]:
# For Embarked we replace the NaN values with the
# most frequently occurring class in that feature
mode_value = df["Embarked"].mode()[0]
print(mode_value)

# Assign the filled column back instead of calling
# df["Embarked"].fillna(..., inplace=True): that form works on an
# intermediate Series, raises a FutureWarning in pandas 2.x and leaves
# df unchanged once Copy-on-Write is enabled.
df["Embarked"] = df["Embarked"].fillna(mode_value)

In [None]:
# For Age we replace the missing values with the mean Age
mean_value = df["Age"].mean()
print(mean_value)

# Assign the filled column back instead of calling
# df["Age"].fillna(..., inplace=True): that form works on an
# intermediate Series, raises a FutureWarning in pandas 2.x and leaves
# df unchanged once Copy-on-Write is enabled.
df["Age"] = df["Age"].fillna(mean_value)

## Filter out data outliers

In [None]:
# Use a box plot to check whether the dataset has any outliers.
# We look at the Fare column to see if any of the prices
# are exceptionally higher than the rest.

import seaborn as sns

# Pass the column explicitly as `x`: a positional Series is read as `x`
# (with a warning) by older seaborn but as `data` by seaborn >= 0.12,
# which changes the plot. The trailing semicolon hides the returned
# Axes object in the notebook output.
sns.boxplot(x=df["Fare"]);

In [None]:
# TODO: Remove the outliers based on the Fare column