In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [4]:
from google.colab import files

In [5]:
uploaded = files.upload()

Saving student-dataset (1).csv to student-dataset (1).csv


In [6]:
import io

In [7]:
df = pd.read_csv(io.BytesIO(uploaded['student-dataset (1).csv']), delimiter=',', on_bad_lines='skip')
# This will skip any lines that cause parsing errors.

In [8]:
# Handling Missing Values

# Option 1: Remove rows with missing values
df_cleaned = df.dropna()

In [9]:
# Option 2: Fill missing values with the mean (for numerical columns)
df_filled = df.fillna(df.select_dtypes(include=np.number).mean()) # Select only numeric columns for calculating the mean

In [10]:
# Option 3: Fill missing values with the median (for numerical columns)
df_filled_median = df.fillna(df.select_dtypes(include=np.number).median()) # Select numeric columns for median calculation

In [11]:
# Option 4: Fill missing values with the mode (for categorical columns)
df_filled_mode = df.apply(lambda x: x.fillna(x.mode()[0]) if x.dtype == 'O' else x)

In [12]:
# Handling Outliers

# Using the Interquartile Range (IQR) method
Q1 = df.select_dtypes(include=np.number).quantile(0.25) # Select numeric columns for quantile calculations
Q3 = df.select_dtypes(include=np.number).quantile(0.75) # Select numeric columns for quantile calculations
IQR = Q3 - Q1

In [13]:
# Define outlier bounds
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

In [14]:
# Identify outliers
outliers = ((df.select_dtypes(include=np.number) < lower_bound) | (df.select_dtypes(include=np.number) > upper_bound))

In [15]:
# Option 1: Remove outliers
df_no_outliers = df[~((df.select_dtypes(include=np.number) < lower_bound) | (df.select_dtypes(include=np.number) > upper_bound)).any(axis=1)]

In [16]:
# Option 2: Cap outliers to the lower and upper bounds
df_capped = df.copy()
numeric_df = df.select_dtypes(include=np.number) # Select numeric columns for comparison

for col in numeric_df.columns:
    df_capped.loc[numeric_df[col] < lower_bound[col], col] = lower_bound[col]
    df_capped.loc[numeric_df[col] > upper_bound[col], col] = upper_bound[col]

  df_capped.loc[numeric_df[col] < lower_bound[col], col] = lower_bound[col]
  df_capped.loc[numeric_df[col] < lower_bound[col], col] = lower_bound[col]


In [17]:
# Option 3: Impute outliers with mean/median
df_imputed_outliers = df.copy()
df_imputed_outliers[outliers] = np.nan

# Calculate the mean only for numeric columns
numeric_df = df.select_dtypes(include=np.number)
df_imputed_outliers = df_imputed_outliers.fillna(numeric_df.mean())

In [18]:
print("Original DataFrame:\n", df)
print("DataFrame after handling missing values and outliers:\n", df_imputed_outliers)

Original DataFrame:
       id               name               nationality                 city  \
0      0          Kiana Lor                     China               Suzhou   
1      1     Joshua Lonaker  United States of America        Santa Clarita   
2      2      Dakota Blanco  United States of America              Oakland   
3      3    Natasha Yarusso  United States of America        Castro Valley   
4      4     Brooke Cazares                    Brazil  São José dos Campos   
..   ...                ...                       ...                  ...   
302  302        Austin Haas  United States of America             Columbus   
303  303    Madison Fithian  United States of America          Los Angeles   
304  304  Zachary Mulvahill  United States of America          Los Angeles   
305  305   Eliana Michelsen  United States of America              Oakland   
306  306    Dane Whittemore                    Canada              Toronto   

     latitude  longitude gender  ethnic.gr