In [126]:
import pandas as pd

In [127]:
# read the raw (deliberately corrupted) student scores and preview the first rows
csv_path = "datasets/students-scores-corrupted-data.csv"
dataframe = pd.read_csv(csv_path)
dataframe.head()

Unnamed: 0,gender,math score,reading score,writing score
0,female,72,72.0,74.0
1,,69,90.0,88.0
2,female,90,95.0,93.0
3,male,47,,44.0
4,male,sixty-three,,75.0


In [128]:
# summarize each column's dtype and non-null count for the freshly loaded frame
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   gender         998 non-null    object 
 1   math score     1000 non-null   object 
 2   reading score  997 non-null    float64
 3   writing score  997 non-null    float64
dtypes: float64(2), object(2)
memory usage: 31.4+ KB


In [129]:
# Force every score column to a numeric dtype.
# Entries that cannot be parsed as a number (e.g. "sixty-three") become NaN,
# which switches "math score" from object to float64.

for score_name in ("math score", "reading score", "writing score"):
    raw_values = dataframe[score_name]
    dataframe[score_name] = pd.to_numeric(raw_values, errors="coerce")

dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   gender         998 non-null    object 
 1   math score     998 non-null    float64
 2   reading score  997 non-null    float64
 3   writing score  997 non-null    float64
dtypes: float64(3), object(1)
memory usage: 31.4+ KB


In [130]:
# Option 1: drop rows.
# Only "reading score" is checked, so rows with NaN in other columns are kept.
# dropna returns a new frame; dataframe itself is not modified.

df = dataframe.dropna(axis="index", how="any", subset=["reading score"])
df.head()

Unnamed: 0,gender,math score,reading score,writing score
0,female,72.0,72.0,74.0
1,,69.0,90.0,88.0
2,female,90.0,95.0,93.0
6,,88.0,95.0,92.0
7,male,,43.0,39.0


In [131]:
# dropna does not modify dataframe in place: the original frame still holds its NaN values

dataframe.head()

Unnamed: 0,gender,math score,reading score,writing score
0,female,72.0,72.0,74.0
1,,69.0,90.0,88.0
2,female,90.0,95.0,93.0
3,male,47.0,,44.0
4,male,,,75.0


In [132]:
# Fill in the missing genders.
# gender holds text rather than numbers, so missing entries get the
# placeholder "unknown".

gender_filled = dataframe["gender"].fillna("unknown")
dataframe["gender"] = gender_filled
dataframe.head(10)

Unnamed: 0,gender,math score,reading score,writing score
0,female,72.0,72.0,74.0
1,unknown,69.0,90.0,88.0
2,female,90.0,95.0,93.0
3,male,47.0,,44.0
4,male,,,75.0
5,female,71.0,,500.0
6,unknown,88.0,95.0,92.0
7,male,,43.0,39.0
8,male,64.0,64.0,67.0
9,female,38.0,60.0,50.0


In [133]:
# gender column now has no null values

dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   gender         1000 non-null   object 
 1   math score     998 non-null    float64
 2   reading score  997 non-null    float64
 3   writing score  997 non-null    float64
dtypes: float64(3), object(1)
memory usage: 31.4+ KB


In [134]:
# Replace NaN values in the numeric columns with the column mean.
# The mean is taken over in-range values only (<= 100): at this point
# "writing score" still contains out-of-range entries such as 500.0, and
# including them would inflate the value used to fill the gaps.
# The out-of-range entries themselves are left untouched here.

MAX_VALID_SCORE = 100

for score_name in ("math score", "reading score", "writing score"):
    scores = dataframe[score_name]
    # NaN <= MAX_VALID_SCORE is False, so missing entries are excluded as well
    in_range_mean = scores[scores <= MAX_VALID_SCORE].mean()
    dataframe[score_name] = scores.fillna(in_range_mean)

dataframe.head(10)

Unnamed: 0,gender,math score,reading score,writing score
0,female,72.0,72.0,74.0
1,unknown,69.0,90.0,88.0
2,female,90.0,95.0,93.0
3,male,47.0,69.158475,44.0
4,male,66.10521,69.158475,75.0
5,female,71.0,69.158475,500.0
6,unknown,88.0,95.0,92.0
7,male,66.10521,43.0,39.0
8,male,64.0,64.0,67.0
9,female,38.0,60.0,50.0


In [135]:
# all columns now have no null values

dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   gender         1000 non-null   object 
 1   math score     1000 non-null   float64
 2   reading score  1000 non-null   float64
 3   writing score  1000 non-null   float64
dtypes: float64(3), object(1)
memory usage: 31.4+ KB


In [136]:
# look for outliers: count, per score column, the values above 100

for score_name in ("math score", "reading score", "writing score"):
    above_limit = dataframe[score_name] > 100
    print(score_name, above_limit.sum())

math score 0
reading score 0
writing score 2


In [138]:
# Series.where keeps the entries for which the condition is True and
# substitutes the fallback wherever it is False:
# any writing score that is not <= 100 becomes 75.

writing_scores = dataframe["writing score"]
dataframe["writing score"] = writing_scores.where(writing_scores.le(100), 75)

# repeat the outlier count to confirm nothing above 100 remains
for score_name in ("math score", "reading score", "writing score"):
    print(score_name, dataframe[score_name].gt(100).sum())

math score 0
reading score 0
writing score 0
