# Data Cleaning

In this notebook we clean the student performance dataset (`StudentsPerformance.csv`) stored in the `data` folder.

Importing Libraries:

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Loading Data:

In [None]:
# Read the raw student-performance CSV into a dataframe (relative path).
raw_csv_path = '../data/StudentsPerformance.csv'
df = pd.read_csv(raw_csv_path)

Inspecting the data:

In [None]:
# A notebook cell only renders its final bare expression, so the intermediate
# frames are passed to display() explicitly; otherwise head() and describe()
# would be computed and silently discarded.

# Preview the first few rows of the dataframe
display(df.head())

# Column dtypes and non-null counts (info() prints its report directly)
df.info()

# Basic summary statistics
display(df.describe())

# (rows, columns) -- last expression, rendered as the cell's output
df.shape

Handling missing values:

In [None]:
# Minimum share of non-null values a column needs in order to be kept
# (i.e. columns with more than 30% missing values are dropped).
MIN_NON_NULL_FRACTION = 0.7

# Missing values per column before cleaning. display() is required here
# because a cell only renders its last bare expression.
display(df.isnull().sum())

# Drop columns with too many missing values. `thresh` is documented as an
# integer count of required non-null values, so round the fraction up
# (count >= x and count >= ceil(x) select the same columns).
min_non_null = int(np.ceil(len(df) * MIN_NON_NULL_FRACTION))
df = df.dropna(thresh=min_non_null, axis=1)

# Fill numeric columns with their median
df = df.fillna(df.median(numeric_only=True))

# Fill the remaining (categorical) gaps with each column's most frequent
# value. mode() has no rows for an empty frame, so guard the lookup instead
# of letting iloc[0] raise IndexError.
column_modes = df.mode()
if not column_modes.empty:
    df = df.fillna(column_modes.iloc[0])

# Anything still missing gets an explicit placeholder
df = df.fillna('Unknown')

# Confirm there are no missing values left
df.isnull().sum()

Removing duplicates:

In [None]:
# Count rows that exactly repeat an earlier row (the first occurrence of
# each row is not counted as a duplicate)
duplicates = df.duplicated(keep='first').sum()
print(f'Total duplicates: {duplicates}')

# Keep the first occurrence of every row and discard the repeats
df = df.drop_duplicates(keep='first')

# Re-inspect the frame after de-duplication
df.info()

Correcting data types:

In [None]:
# Review the current dtypes before converting anything
df.info()

# Preview the numerical columns and the object columns. display() gives the
# rich table rendering; a bare expression would be dropped because it is not
# the last statement of the cell.
display(df.select_dtypes(include=['number']).head())
display(df.select_dtypes(include=['object']).head())

# Convert every object column to the pandas string dtype in a single astype()
# call. Building a new frame (instead of assigning column by column) avoids
# SettingWithCopyWarning when `df` is the result of an earlier row filter.
object_columns = df.select_dtypes(include=['object']).columns
df = df.astype({col: 'string' for col in object_columns})
df.info()

Handling outliers:

In [None]:
# Fence multiplier: values further than 1.5 * IQR outside the quartiles are
# treated as outliers.
IQR_MULTIPLIER = 1.5

# Select the numeric columns once and reuse the result below
numeric_df = df.select_dtypes(include=['number'])

# Interquartile range (Q3 - Q1) for each numeric column
Q1 = numeric_df.quantile(0.25)
Q3 = numeric_df.quantile(0.75)
IQR = Q3 - Q1
print(IQR)

# Flag every value that lies below the lower fence or above the upper fence
lower_fence = Q1 - IQR_MULTIPLIER * IQR
upper_fence = Q3 + IQR_MULTIPLIER * IQR
outliers = (numeric_df < lower_fence) | (numeric_df > upper_fence)

# Number of outliers in each numerical column
print(outliers.sum())

# Remove every row that has an outlier in any numeric column. The explicit
# copy makes the result an independent frame, so later column assignments on
# `df` do not raise SettingWithCopyWarning.
df = df[~outliers.any(axis=1)].copy()
df.info()

Cleaning strings:

In [None]:
# Work on an independent copy so the column assignments below never write
# into a filtered slice of an earlier frame (avoids SettingWithCopyWarning).
df = df.copy()

# Normalise every string column in a single pass:
#   1. lowercase the text
#   2. remove every character that is not an ASCII letter, digit or whitespace
#      (this includes punctuation such as apostrophes and slashes)
#   3. collapse runs of whitespace left behind by step 2 into one space
#   4. strip leading/trailing whitespace -- done last, because removing a
#      character next to a space can expose new leading/trailing whitespace
for col in df.select_dtypes(include=['string']).columns:
    df[col] = (
        df[col]
        .str.lower()
        .str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)
        .str.replace(r'\s+', ' ', regex=True)
        .str.strip()
    )
df.info()