# Data cleaning strategies

In [1]:
# Mount Google Drive so that files stored there can be read from this notebook
from google.colab import drive
drive.mount('/content/drive')
# Once mounted, files are addressed with paths of the form:
# '/content/drive/MyDrive/......csv'


Mounted at /content/drive


## Titanic dataset

More info: https://www.kaggle.com/c/titanic/data


In [13]:
# Question 1: How to read a dataset using Pandas?
import pandas as pd

# Change this path to match the location of the data file
dataset_path = '/content/drive/MyDrive/430031 notebooks/Topic 9/Data/titanic_modified.csv'
# Load the CSV file into a DataFrame
data = pd.read_csv(dataset_path)
# Display the first few rows of the dataframe
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,age_before_incident
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,22.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,38.0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,26.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,35.0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,35.0


*The dataset is modified to introduce duplicate rows and columns*

## Check missing values and duplicates

In [16]:
# Question 2: How can you identify missing values in a Pandas DataFrame?

# .isna() (an alias of .isnull()) flags every missing cell; summing the
# flags down each column gives the number of missing values per column
missing_values_per_column = data.isna().sum(axis=0)

# Show the per-column missing-value counts
missing_values_per_column


PassengerId              0
Survived                 0
Pclass                   0
Name                     0
Sex                      0
Age                    178
SibSp                    0
Parch                    0
Ticket                   0
Fare                     0
Cabin                  690
Embarked                 2
age_before_incident    178
dtype: int64

In [18]:
# Check for duplicate rows: a row is flagged when an identical row
# has already appeared earlier in the DataFrame
duplicate_rows = data.loc[data.duplicated(keep='first')]

# Display the duplicate rows
print(duplicate_rows)

# Check for duplicate columns: transpose so that columns become rows,
# then apply the same row-wise duplicate test
duplicate_columns = data.T.duplicated(keep='first')
print(duplicate_columns)


     PassengerId  Survived  Pclass                              Name     Sex  \
891           16         1       2  Hewlett, Mrs. (Mary D Kingcome)   female   
892           17         0       3              Rice, Master. Eugene    male   
893           18         1       2      Williams, Mr. Charles Eugene    male   

      Age  SibSp  Parch  Ticket    Fare Cabin Embarked  age_before_incident  
891  55.0      0      0  248706  16.000   NaN        S                 55.0  
892   2.0      4      1  382652  29.125   NaN        Q                  2.0  
893   NaN      0      0  244373  13.000   NaN        S                  NaN  
PassengerId            False
Survived               False
Pclass                 False
Name                   False
Sex                    False
Age                    False
SibSp                  False
Parch                  False
Ticket                 False
Fare                   False
Cabin                  False
Embarked               False
age_before_incident

## Case 1: Handling missing values — should we impute or remove the data?

**If only a small percentage of the data is missing, imputation might be a reasonable option. If a large portion is missing, removal might be more appropriate.**

In [20]:
# Question 3: Calculate the percentage of missing values per column
# (missing count in each column, divided by the number of rows, times 100)
missing_percentage_per_column = data.isnull().sum().div(len(data)).mul(100)

# Display the percentage of missing values per column
print("Percentage of missing values per column:", missing_percentage_per_column, sep="\n")


Percentage of missing values per column:
PassengerId             0.000000
Survived                0.000000
Pclass                  0.000000
Name                    0.000000
Sex                     0.000000
Age                    19.910515
SibSp                   0.000000
Parch                   0.000000
Ticket                  0.000000
Fare                    0.000000
Cabin                  77.181208
Embarked                0.223714
age_before_incident    19.910515
dtype: float64


**Using a 50% missing-value threshold, `Cabin` (about 77% missing) will be removed, while `Age` and `age_before_incident` (about 20% missing each) will be imputed.**

In [22]:
# Drop the 'Cabin' column.
# Reassigning (instead of inplace=True) with errors='ignore' keeps this cell
# re-runnable: a second run no longer fails because 'Cabin' is already gone.
data = data.drop(columns='Cabin', errors='ignore')

# Impute missing values in the age columns with the column mean.
# The result is assigned back to the column: calling
# data['Age'].fillna(..., inplace=True) operates on an intermediate object
# (chained assignment), which raises a FutureWarning in pandas 2.x and does
# not update `data` when Copy-on-Write is enabled.
data['Age'] = data['Age'].fillna(data['Age'].mean())
data['age_before_incident'] = data['age_before_incident'].fillna(data['age_before_incident'].mean())

# Display the DataFrame after removing 'Cabin' and imputing 'Age'
print("DataFrame after removing 'Cabin' and imputing 'Age' and 'age_before_incident':")
print(data.head())

DataFrame after removing 'Cabin' and imputing 'Age' and 'age_before_incident':
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Embarked  age_before_incident  
0      0         A/5 21171   7.2500        S                 22.0  
1      0          PC 17599  71.2833        C                 38.0  
2      0  STON/O2. 3101282   7.9250  

## Case 2: Do we use the mean or the median for imputing numerical data?

*The mean is usually used when the data is not skewed; otherwise, the median is preferred.*

In [None]:
# Impute missing values in the 'Age' column with the median.
# The result is assigned back to the column: data['Age'].fillna(..., inplace=True)
# is chained assignment, which raises a FutureWarning in pandas 2.x and does
# not update `data` when Copy-on-Write is enabled.
# NOTE: if the mean-imputation cell above has already been run, 'Age' has no
# missing values left and this line changes nothing; reload the dataset first
# to compare the two strategies.
data['Age'] = data['Age'].fillna(data['Age'].median())
print(data.head())

## Case 3: Do we remove duplicates, or keep them because they increase the size of the dataset?

*Duplicate entries may introduce inconsistencies or errors in the dataset. Removing duplicates ensures data accuracy and consistency.*

In [23]:
# Question 8: Remove duplicate rows and columns?
# Remove duplicate rows (the first occurrence of each row is kept).
# Reassigning instead of inplace=True keeps the data lineage explicit.
data = data.drop_duplicates()

# Remove duplicate columns (the first occurrence of each column is kept).
# The transpose is used only to *detect* duplicates; the columns are then
# selected from the original frame with a boolean mask. Round-tripping the
# data itself through .T (data.T.drop_duplicates().T) would cast every
# column to the generic 'object' dtype because the frame mixes numbers and
# strings, so numeric columns would stop being numeric.
is_duplicate_column = data.T.duplicated().to_numpy()
data = data.loc[:, ~is_duplicate_column].copy()

# Display the DataFrame after removing duplicate rows and columns
print("\nDataFrame after removing duplicate rows and columns:")
print(data)


DataFrame after removing duplicate rows and columns:
    PassengerId Survived Pclass  \
0             1        0      3   
1             2        1      1   
2             3        1      3   
3             4        1      1   
4             5        0      3   
..          ...      ...    ...   
886         887        0      2   
887         888        1      1   
888         889        0      3   
889         890        1      1   
890         891        0      3   

                                                  Name     Sex        Age  \
0                              Braund, Mr. Owen Harris    male       22.0   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female       38.0   
2                               Heikkinen, Miss. Laina  female       26.0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female       35.0   
4                             Allen, Mr. William Henry    male       35.0   
..                                                 ...     ... 

In [24]:
# Re-run the duplicate-column check: transpose so columns become rows and
# flag any that repeat an earlier one (False means the column is not a duplicate)
data.T.duplicated()

PassengerId    False
Survived       False
Pclass         False
Name           False
Sex            False
Age            False
SibSp          False
Parch          False
Ticket         False
Fare           False
Embarked       False
dtype: bool