# Data Cleaning

<details><summary>Content</summary>

Handling missing data: .isnull(), .notnull(), .dropna(), .fillna(), .ffill(), .bfill()

Duplicates: .duplicated(), .drop_duplicates()

Replacing values: .replace()

Data type conversion: .astype()

Renaming: .rename(), setting column names directly

String operations: .str accessor methods

What to add:

Dealing with outliers

Converting dates: pd.to_datetime()

Categorical data: .astype('category')

.clip() for value bounds

.unique(), .nunique(), .value_counts()
</details>

<details><summary>Data Cleaning methods</summary>

`df.columns = ['a', 'b', 'c']` # Renames columns

`df.isnull()` # checks for null Values, Returns Boolean Array

`df.notnull()` # Opposite of pd.isnull()

`df.dropna()` # Drops all rows that contain null values

`df.dropna(axis=1)` # Drops all COLUMNS that contain null values

`df.dropna(axis=1, thresh=n)` # Drops all rows have have less than n non-null values

`df.fillna(x)` # Replaces all null values with x

`df.fillna(s.mean())` # Replaces all null values with the mean, median etc.

`df.astype(float)` # Converts the datatype of the Series to float

`df.replace(1, 'one')` # Replaces all values equal to 1 with one

`df.replace([1, 3], ['one','three'])` # Replaces all 1 with 'one' and 3 with 'three'

`df.columns.str.replace("grade_", "")` # renames columns

`df.rename(columns=lambda x: x + 1)` # Mass renaming of columns

`df.rename(columns={'old_name': 'new_name'})` # Selective renaming of columns

`df.rename(index=lambda x: x + 1)` # Mass renaming of index

`df.set_index('column_one')` # Selectively sets the index

</details>

In [2]:
import pandas as pd

In [3]:
df = pd.read_csv("https://drive.google.com/uc?id=1oE-3rt17bFW7fOzDIjwFSEMPTIV3NvcO")
df[:3]

Unnamed: 0,school_id,grade,class,student_id,sex,nationality,grade_math_t1,grade_language_t1,grade_science_t1,grade_math_t2,grade_language_t2,grade_science_t2,treatment,date_of_birth
0,57,6,A,wvqgd@cunb.edu,1,1,2.046285,2.369783,2.711344,1.175309,1.498807,1.840368,1,1997-07-27
1,57,6,A,j0ihe@cunb.edu,1,2,7.859077,7.206984,6.994455,7.548812,6.896719,6.68419,1,1997-06-24
2,57,6,A,wcjgk@cunb.edu,2,1,7.118976,8.057449,8.57321,6.181019,7.119492,7.635253,1,1997-04-23


In [4]:
# len(df) # no. of rows
# df.describe()  # Basic descriptive statistics for each column (or GroupBy)
# df.info()
# df.shape
# df.dtypes # check data types in df
# df.school_id.nunique()
# df.school_id.unique()
df.school_id.value_counts()  # count number of rows per school_id

school_id
499    161
262    158
426    156
812    153
881    153
57     151
946    150
141    146
60     143
458    139
Name: count, dtype: int64

## Inspection

In [5]:
df.columns
df.shape
df.describe()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1510 entries, 0 to 1509
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   school_id          1510 non-null   int64  
 1   grade              1510 non-null   int64  
 2   class              1510 non-null   object 
 3   student_id         1510 non-null   object 
 4   sex                1510 non-null   int64  
 5   nationality        1510 non-null   int64  
 6   grade_math_t1      1510 non-null   float64
 7   grade_language_t1  1510 non-null   float64
 8   grade_science_t1   1510 non-null   float64
 9   grade_math_t2      1508 non-null   float64
 10  grade_language_t2  1507 non-null   float64
 11  grade_science_t2   1508 non-null   float64
 12  treatment          1510 non-null   int64  
 13  date_of_birth      1510 non-null   object 
dtypes: float64(6), int64(5), object(3)
memory usage: 165.3+ KB


In [None]:
df.groupby("school_id").count() # count of row per school
round(df_a.math_t1.mean(), 2)
df.math_t2.agg(["min", "max"])
df.math_t2.max()
df.math_t2.sort_values().tail(10)
df.math_t2.sort_values(ascending=False).head(10)
df.language_t1.sort_values(ascending=False).tail(10)
df.language_t2.sort_values(ascending=False).head(10)

## replace()

In [6]:
df.columns = df.columns.str.replace("grade_", "")
df

Unnamed: 0,school_id,grade,class,student_id,sex,nationality,math_t1,language_t1,science_t1,math_t2,language_t2,science_t2,treatment,date_of_birth
0,57,6,A,wvqgd@cunb.edu,1,1,2.046285,2.369783,2.711344,1.175309,1.498807,1.840368,1,1997-07-27
1,57,6,A,j0ihe@cunb.edu,1,2,7.859077,7.206984,6.994455,7.548812,6.896719,6.684190,1,1997-06-24
2,57,6,A,wcjgk@cunb.edu,2,1,7.118976,8.057449,8.573210,6.181019,7.119492,7.635253,1,1997-04-23
3,57,6,A,mzmqb@cunb.edu,1,1,6.973737,7.388008,6.628448,7.593102,8.007373,7.247812,1,1997-02-24
4,57,6,A,s6n0y@cunb.edu,1,1,6.574877,6.773626,7.844316,7.611065,7.809814,8.880504,1,1996-09-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1505,946,8,D,jj36e@cunb.edu,1,1,4.938375,5.306647,4.985917,6.162418,6.530690,6.209960,0,1995-07-04
1506,946,8,D,ca1dg@cunb.edu,1,1,7.661931,8.360153,7.937165,8.179364,8.877586,8.454598,0,1995-08-23
1507,946,8,D,amdrx@cunb.edu,1,1,9.248758,8.534791,8.056641,8.773601,8.059634,7.581484,0,1994-12-15
1508,946,8,D,yn5ug@cunb.edu,1,2,5.689228,5.071503,5.338547,6.382995,5.765270,6.032315,0,1994-09-18


## astype()

In [7]:
# reduce memory usage - spot categories
df.treatment.unique()
df.sex.unique()
df.nationality.unique()
df.grade.unique()
df["class"].unique()


array(['A', 'B', 'C', 'D'], dtype=object)

In [10]:
df_a = df.copy() # work on copy of df

# transform columns to category dtype
df_a[["grade", "class", "nationality", "sex", "treatment"]] = df_a[["grade", "class", "nationality", "sex", "treatment"]].astype("category")

df_a.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1510 entries, 0 to 1509
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   school_id      1510 non-null   int64   
 1   grade          1510 non-null   category
 2   class          1510 non-null   category
 3   student_id     1510 non-null   object  
 4   sex            1510 non-null   category
 5   nationality    1510 non-null   category
 6   math_t1        1510 non-null   float64 
 7   language_t1    1510 non-null   float64 
 8   science_t1     1510 non-null   float64 
 9   math_t2        1508 non-null   float64 
 10  language_t2    1507 non-null   float64 
 11  science_t2     1508 non-null   float64 
 12  treatment      1510 non-null   category
 13  date_of_birth  1510 non-null   object  
dtypes: category(5), float64(6), int64(1), object(2)
memory usage: 114.4+ KB


In [29]:
# memory reduction in KB
(df.memory_usage(deep=True).sum() - df_a.memory_usage(deep=True).sum())/1024 

np.float64(112.689453125)

## to_datetime

In [56]:
df_a["date_of_birth"][:2]
df_a["date_of_birth"].isna().sum()

df_a[df_a["date_of_birth"].isna()]

Unnamed: 0,school_id,grade,class,student_id,sex,nationality,math_t1,language_t1,science_t1,math_t2,language_t2,science_t2,treatment,date_of_birth,year
124,57,8,B,gtvmk@cunb.edu,1,1,7.954596,7.414694,7.036728,7.505426,6.965524,6.587558,0,NaT,
548,262,8,A,yyhzg@cunb.edu,1,1,7.930281,7.40337,6.983867,8.950993,8.424081,8.004578,1,NaT,
743,426,8,D,iya1k@cunb.edu,2,1,7.080474,5.793896,6.894132,6.068677,4.782099,5.882335,1,NaT,
1005,499,8,A,otbkf@cunb.edu,1,1,4.333835,5.1414,5.548819,4.405107,5.212672,5.620091,1,NaT,
1396,946,6,C,yofqa@cunb.edu,2,1,7.698935,7.704067,6.331823,7.373213,7.378346,6.006101,1,NaT,


In [72]:
df_a[df_a.grade == 8]["date_of_birth"]

99     1995-06-28
100    1995-04-01
101    1995-05-24
102    1995-01-22
103    1994-09-25
          ...    
1505   1995-07-04
1506   1995-08-23
1507   1994-12-15
1508   1994-09-18
1509   1994-12-19
Name: date_of_birth, Length: 533, dtype: datetime64[ns]

## impute birthdate by grade

In [74]:
avg_birthdate_8_grade = df_a[df_a.grade == 8]["date_of_birth"].mean()
avg_birthdate_6_grade = df_a[df_a.grade == 6]["date_of_birth"].mean()
avg_birthdate_8_grade
avg_birthdate_6_grade

Timestamp('1997-02-24 04:35:17.647058816')

## fillna()

In [None]:
df_a.loc[df_a.grade == 8, "date_of_birth"] = df_a.loc[
    df_a.grade == 8, "date_of_birth"
].fillna(avg_birthdate_8_grade)

In [None]:
df_a.loc[df_a.grade == 6, "date_of_birth"] = df_a.loc[
    df_a.grade == 6, "date_of_birth"
].fillna(avg_birthdate_6_grade)

In [None]:
# filled
df.loc[124] 
df.loc[1396]

school_id                   946
grade                         6
class                         C
student_id       yofqa@cunb.edu
sex                           2
nationality                   1
math_t1                7.698935
language_t1            7.704067
science_t1             6.331823
math_t2                7.373213
language_t2            7.378346
science_t2             6.006101
treatment                     1
date_of_birth        1997.03.09
Name: 1396, dtype: object

In [38]:
df_a["date_of_birth"] = pd.to_datetime(
    df_a["date_of_birth"],
    format="%Y-%m-%d",  # Adjust to your actual format
    errors="coerce",
)
df_a.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1510 entries, 0 to 1509
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   school_id      1510 non-null   int64         
 1   grade          1510 non-null   category      
 2   class          1510 non-null   category      
 3   student_id     1510 non-null   object        
 4   sex            1510 non-null   category      
 5   nationality    1510 non-null   category      
 6   math_t1        1510 non-null   float64       
 7   language_t1    1510 non-null   float64       
 8   science_t1     1510 non-null   float64       
 9   math_t2        1508 non-null   float64       
 10  language_t2    1507 non-null   float64       
 11  science_t2     1508 non-null   float64       
 12  treatment      1510 non-null   category      
 13  date_of_birth  1505 non-null   datetime64[ns]
dtypes: category(5), datetime64[ns](1), float64(6), int64(1), object(1)
memor

In [80]:
df_a["year"] = df_a["date_of_birth"].dt.year

df_a["year"] = df_a["date_of_birth"].dt.year.astype("int64")
df_a

print(df_a["year"].dtype)  # Should output: int64

df_a

int64


Unnamed: 0,school_id,grade,class,student_id,sex,nationality,math_t1,language_t1,science_t1,math_t2,language_t2,science_t2,treatment,date_of_birth,year
0,57,6,A,wvqgd@cunb.edu,1,1,2.046285,2.369783,2.711344,1.175309,1.498807,1.840368,1,1997-07-27,1997
1,57,6,A,j0ihe@cunb.edu,1,2,7.859077,7.206984,6.994455,7.548812,6.896719,6.684190,1,1997-06-24,1997
2,57,6,A,wcjgk@cunb.edu,2,1,7.118976,8.057449,8.573210,6.181019,7.119492,7.635253,1,1997-04-23,1997
3,57,6,A,mzmqb@cunb.edu,1,1,6.973737,7.388008,6.628448,7.593102,8.007373,7.247812,1,1997-02-24,1997
4,57,6,A,s6n0y@cunb.edu,1,1,6.574877,6.773626,7.844316,7.611065,7.809814,8.880504,1,1996-09-05,1996
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1505,946,8,D,jj36e@cunb.edu,1,1,4.938375,5.306647,4.985917,6.162418,6.530690,6.209960,0,1995-07-04,1995
1506,946,8,D,ca1dg@cunb.edu,1,1,7.661931,8.360153,7.937165,8.179364,8.877586,8.454598,0,1995-08-23,1995
1507,946,8,D,amdrx@cunb.edu,1,1,9.248758,8.534791,8.056641,8.773601,8.059634,7.581484,0,1994-12-15,1994
1508,946,8,D,yn5ug@cunb.edu,1,2,5.689228,5.071503,5.338547,6.382995,5.765270,6.032315,0,1994-09-18,1994


## Missing Values

In [89]:
df_a.isna().sum()
df_a.isnull().sum()


# do nothing - assumption test not taken yet
df[df_a["language_t2"].isna()]
df[df_a["science_t2"].isna()]
df[df_a["math_t2"].isna()]

# df.dropna(subset=["language_t2", "science_t2", "math_t2"], inplace=True)

Unnamed: 0,school_id,grade,class,student_id,sex,nationality,math_t1,language_t1,science_t1,math_t2,language_t2,science_t2,treatment,date_of_birth
279,60,8,C,wa083@cunb.edu,2,2,5.218773,3.566172,3.838631,,5.159542,5.432,1,1994-11-25
848,458,8,A,whde3@cunb.edu,1,1,3.967109,2.758454,2.839566,,2.664794,2.745906,0,1995-06-18


In [111]:

# df_c.isnull().sum() # same
df_a[df_a.isnull().any(axis=1)]  # show all rows with null
df_a.isna().sum() / df_c.count()


school_id        0.000000
grade            0.000000
class            0.000000
student_id       0.000000
sex              0.000000
nationality      0.000000
math_t1          0.000000
language_t1      0.000000
science_t1       0.000000
math_t2          0.001338
language_t2      0.002007
science_t2       0.001338
treatment        0.000000
date_of_birth    0.000000
year             0.000000
dtype: float64

## Duplicates

In [None]:
# show all duplicate rows
df_a[df_a.duplicated(keep=False)].sort_values(by=list(df_a.columns)) # sorts by all columns

df_a[df_a.duplicated(keep="last")] # show only last duplicate rows

Unnamed: 0,school_id,grade,class,student_id,sex,nationality,math_t1,language_t1,science_t1,math_t2,language_t2,science_t2,treatment,date_of_birth,year
39,57,6,C,hxddv@cunb.edu,1,1,8.478601,9.397897,9.220403,9.310329,10.0,10.0,1,1997-08-14,1997
150,57,6,C,hxddv@cunb.edu,1,1,8.478601,9.397897,9.220403,9.310329,10.0,10.0,1,1997-08-14,1997
377,141,7,C,xbpfw@cunb.edu,1,2,3.551834,3.61451,3.360211,6.036923,6.099598,5.8453,1,1996-08-23,1996
439,141,7,C,xbpfw@cunb.edu,1,2,3.551834,3.61451,3.360211,6.036923,6.099598,5.8453,1,1996-08-23,1996
484,262,6,D,icmoa@cunb.edu,2,1,6.352546,6.408562,7.955574,7.455129,7.511146,9.058157,0,1997-05-15,1997
595,262,6,D,icmoa@cunb.edu,2,1,6.352546,6.408562,7.955574,7.455129,7.511146,9.058157,0,1997-05-15,1997
509,262,7,B,kpoo9@cunb.edu,1,1,6.12044,5.928064,7.353246,3.811633,3.619257,5.044439,0,1995-09-14,1995
597,262,7,B,kpoo9@cunb.edu,1,1,6.12044,5.928064,7.353246,3.811633,3.619257,5.044439,0,1995-09-14,1995
551,262,8,B,q6c3l@cunb.edu,1,1,7.434781,7.259778,6.634327,9.370366,9.195363,8.569912,0,1995-06-20,1995
596,262,8,B,q6c3l@cunb.edu,1,1,7.434781,7.259778,6.634327,9.370366,9.195363,8.569912,0,1995-06-20,1995


In [99]:
df_b = df_a.drop_duplicates("student_id")  # keeps first occurrence
df_b.duplicated().sum()

np.int64(0)

## reset_index()

In [115]:
# fill holes in the index and drop the old index
df_b.reset_index(drop=True, inplace=True)

## Placeholder values

In [103]:

# check for placeholder values
df_b.isin([99999.0]).sum()

df_b[df_b.isin([99999.0]).any(axis=1)]

df_b.describe().loc["max"]


school_id                      946.0
math_t1                         10.0
language_t1                     10.0
science_t1                      10.0
math_t2                      99999.0
language_t2                  99999.0
science_t2                   99999.0
date_of_birth    1997-08-30 00:00:00
year                          1997.0
Name: max, dtype: object

## replace()

In [104]:
import numpy as np

df_c = df_b.copy()
df_c.math_t2 = df_b.math_t2.replace(99999.0, np.nan)
df_c.language_t2 = df_b.language_t2.replace(99999.0, np.nan)
df_c.science_t2 = df_b.science_t2.replace(99999.0, np.nan)

In [105]:
df_c.describe().loc["max"]

school_id                      946.0
math_t1                         10.0
language_t1                     10.0
science_t1                      10.0
math_t2                         10.0
language_t2                     10.0
science_t2                      10.0
date_of_birth    1997-08-30 00:00:00
year                          1997.0
Name: max, dtype: object