In [1]:
import pandas as pd

In [2]:
# Sample dataset
data = {
    'Name':['Jennie','Lalisa','Rose',None,'Jisoo'],
    'Age':[25,None,30,22,None],
    'City':['KL','Penang',None,'Johor','KL']
}

In [3]:
df=pd.DataFrame(data)
print('Original Dataframe:')
df

Original Dataframe:


Unnamed: 0,Name,Age,City
0,Jennie,25.0,KL
1,Lalisa,,Penang
2,Rose,30.0,
3,,22.0,Johor
4,Jisoo,,KL


# Check missing values row by row

In [4]:
for index, row in df.iterrows():
    print(f'Row {index}: Missing = {row.isnull().any()}, Details = {row.isnull().to_dict()}')

Row 0: Missing = False, Details = {'Name': False, 'Age': False, 'City': False}
Row 1: Missing = True, Details = {'Name': False, 'Age': True, 'City': False}
Row 2: Missing = True, Details = {'Name': False, 'Age': False, 'City': True}
Row 3: Missing = True, Details = {'Name': True, 'Age': False, 'City': False}
Row 4: Missing = True, Details = {'Name': False, 'Age': True, 'City': False}


# Show only rows with missing data

In [5]:
missing_rows = df[df.isnull().any(axis=1)]
print('Rows with missing data: ')
missing_rows

Rows with missing data: 


Unnamed: 0,Name,Age,City
1,Lalisa,,Penang
2,Rose,30.0,
3,,22.0,Johor
4,Jisoo,,KL


# Simulate rows that would be dropped using dropna()

In [6]:
to_drop = df[df.isnull().any(axis=1)]
print('These rows would be dropped using dropna(): ')
to_drop

These rows would be dropped using dropna(): 


Unnamed: 0,Name,Age,City
1,Lalisa,,Penang
2,Rose,30.0,
3,,22.0,Johor
4,Jisoo,,KL


In [7]:
df_cleaned = df.dropna()
print('After dropna(): ')
df_cleaned

After dropna(): 


Unnamed: 0,Name,Age,City
0,Jennie,25.0,KL


In [8]:
print('Before fillna(): ')
df

Before fillna(): 


Unnamed: 0,Name,Age,City
0,Jennie,25.0,KL
1,Lalisa,,Penang
2,Rose,30.0,
3,,22.0,Johor
4,Jisoo,,KL


In [9]:
df_filled = df.fillna({
    'Name':'Unknown',
    'Age':df['Age'].mean(),
    'City':'Not Available'
})

In [10]:
print('After fillna(): ')
df_filled

After fillna(): 


Unnamed: 0,Name,Age,City
0,Jennie,25.0,KL
1,Lalisa,25.666667,Penang
2,Rose,30.0,Not Available
3,Unknown,22.0,Johor
4,Jisoo,25.666667,KL


In [13]:
df_sfill = df.fillna({
    'Name':'Unknown',
    'Age':round(df['Age'].mean(),2),
    'City':'Not Available'
})
    

In [14]:
print('After fillna(): Age up to two decimal: ')
df_sfill

After fillna(): Age up to two decimal: 


Unnamed: 0,Name,Age,City
0,Jennie,25.0,KL
1,Lalisa,25.67,Penang
2,Rose,30.0,Not Available
3,Unknown,22.0,Johor
4,Jisoo,25.67,KL


# test pakai import math

In [15]:
import math

In [17]:
df_stest = df.fillna({
    'Name':'Unknown',
    'Age':math.floor(df['Age'].mean()),
    'City':'Not Available'
})
df_stest

Unnamed: 0,Name,Age,City
0,Jennie,25.0,KL
1,Lalisa,25.0,Penang
2,Rose,30.0,Not Available
3,Unknown,22.0,Johor
4,Jisoo,25.0,KL


In [20]:
#or like this
df_new=df.fillna({
    'Name':'unknown',
    'Age': math.floor(df['Age'].mean().astype(int)),
    'City': 'not available'
})
df_new

Unnamed: 0,Name,Age,City
0,Jennie,25.0,KL
1,Lalisa,25.0,Penang
2,Rose,30.0,not available
3,unknown,22.0,Johor
4,Jisoo,25.0,KL


## Data Cleaning Exercise
    1. Download dataset
    2. Get basic dataset info
    3: Print('Missing values per column:')
    4: Check missing data line by line
    5: Drop missing rows (if any)
    6: Compare before and after
    7: Fill missing values
    8: Compare before and after

In [39]:
df = pd.read_csv('students_performance_dirty.csv')
df.head()

Unnamed: 0,gender,study_hours,attendance_pct,math_score,reading_score,final_score
0,male,1.7,83.6,62.0,91.0,38.9
1,,2.6,91.2,82.0,60.0,37.5
2,female,2.9,97.5,69.0,57.0,35.0
3,female,4.8,85.7,78.0,62.0,36.5
4,male,3.9,-10.0,64.0,95.0,30.9


In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   gender          52 non-null     object 
 1   study_hours     52 non-null     float64
 2   attendance_pct  60 non-null     float64
 3   math_score      60 non-null     float64
 4   reading_score   60 non-null     float64
 5   final_score     60 non-null     float64
dtypes: float64(5), object(1)
memory usage: 2.9+ KB


In [41]:
print("Missing data per column:")
df.isnull().sum()

Missing data per column:


gender            8
study_hours       8
attendance_pct    0
math_score        0
reading_score     0
final_score       0
dtype: int64

In [47]:
for index, row in df.iterrows():
    print(f'Row {index}: Missing = {row.isnull().to_dict()}')

Row 0: Missing = {'gender': False, 'study_hours': False, 'attendance_pct': False, 'math_score': False, 'reading_score': False, 'final_score': False}
Row 1: Missing = {'gender': False, 'study_hours': False, 'attendance_pct': False, 'math_score': False, 'reading_score': False, 'final_score': False}
Row 2: Missing = {'gender': False, 'study_hours': False, 'attendance_pct': False, 'math_score': False, 'reading_score': False, 'final_score': False}
Row 3: Missing = {'gender': False, 'study_hours': False, 'attendance_pct': False, 'math_score': False, 'reading_score': False, 'final_score': False}
Row 4: Missing = {'gender': False, 'study_hours': False, 'attendance_pct': False, 'math_score': False, 'reading_score': False, 'final_score': False}
Row 5: Missing = {'gender': False, 'study_hours': False, 'attendance_pct': False, 'math_score': False, 'reading_score': False, 'final_score': False}
Row 6: Missing = {'gender': False, 'study_hours': False, 'attendance_pct': False, 'math_score': False, 're

In [43]:
missing = df[df.isnull().any(axis=1)]
print('Rows with missing data: ')
missing.head(5)

Rows with missing data: 


Unnamed: 0,gender,study_hours,attendance_pct,math_score,reading_score,final_score
1,,2.6,91.2,82.0,60.0,37.5
6,male,,87.1,95.0,78.0,43.4
7,femle,,70.6,59.0,70.0,32.8
11,,4.6,86.2,63.0,78.0,36.9
12,femle,,95.0,59.0,92.0,39.7


In [44]:
drop_out = df[df.isnull().any(axis=1)]
print('These rows would be dropped using dropna(): ')
drop_out.head(5)

These rows would be dropped using dropna(): 


Unnamed: 0,gender,study_hours,attendance_pct,math_score,reading_score,final_score
1,,2.6,91.2,82.0,60.0,37.5
6,male,,87.1,95.0,78.0,43.4
7,femle,,70.6,59.0,70.0,32.8
11,,4.6,86.2,63.0,78.0,36.9
12,femle,,95.0,59.0,92.0,39.7


In [45]:
cleaned_df = df.dropna()
print('After dropna(): ')
cleaned_df.head(5)

After dropna(): 


Unnamed: 0,gender,study_hours,attendance_pct,math_score,reading_score,final_score
0,male,1.7,83.6,62.0,91.0,38.9
2,female,2.9,97.5,69.0,57.0,35.0
3,female,4.8,85.7,78.0,62.0,36.5
4,male,3.9,-10.0,64.0,95.0,30.9
5,male,1.4,93.7,77.0,72.0,39.0


In [46]:
df['gender'] = df['gender'].fillna('Unavailable')
df['study_hours'] = df['study_hours'].fillna(df['study_hours'].mean())
print('After fillna(): ')
df.head(5)

After fillna(): 


Unnamed: 0,gender,study_hours,attendance_pct,math_score,reading_score,final_score
0,male,1.7,83.6,62.0,91.0,38.9
1,Unavailable,2.6,91.2,82.0,60.0,37.5
2,female,2.9,97.5,69.0,57.0,35.0
3,female,4.8,85.7,78.0,62.0,36.5
4,male,3.9,-10.0,64.0,95.0,30.9
