# Data Cleaning

<details><summary>Content</summary>

Handling missing data: .isnull(), .notnull(), .dropna(), .fillna(), .ffill(), .bfill()

Duplicates: .duplicated(), .drop_duplicates()

Replacing values: .replace()

Data type conversion: .astype()

Renaming: .rename(), setting column names directly

String operations: .str accessor methods

What to add:

Dealing with outliers

Converting dates: pd.to_datetime()

Categorical data: .astype('category')

.clip() for value bounds

.unique(), .nunique(), .value_counts()
</details>

<details><summary>Data Cleaning methods</summary>

`df.columns = ['a', 'b', 'c']` # Renames columns

`df.isnull()` # checks for null Values, Returns Boolean Array

`df.notnull()` # Opposite of pd.isnull()

`df.dropna()` # Drops all rows that contain null values

`df.dropna(axis=1)` # Drops all COLUMNS that contain null values

`df.dropna(axis=1, thresh=n)` # Drops all rows have have less than n non-null values

`df.fillna(x)` # Replaces all null values with x

`df.fillna(s.mean())` # Replaces all null values with the mean, median etc.

`df.astype(float)` # Converts the datatype of the Series to float

`df.replace(1, 'one')` # Replaces all values equal to 1 with one

`df.replace([1, 3], ['one','three'])` # Replaces all 1 with 'one' and 3 with 'three'

`df.columns.str.replace("grade_", "")` # renames columns

`df.rename(columns=lambda x: x + 1)` # Mass renaming of columns

`df.rename(columns={'old_name': 'new_name'})` # Selective renaming of columns

`df.rename(index=lambda x: x + 1)` # Mass renaming of index

`df.set_index('column_one')` # Selectively sets the index

</details>

In [2]:
import pandas as pd

In [3]:
df = pd.read_csv("https://drive.google.com/uc?id=1oE-3rt17bFW7fOzDIjwFSEMPTIV3NvcO")
df[:3]

Unnamed: 0,school_id,grade,class,student_id,sex,nationality,grade_math_t1,grade_language_t1,grade_science_t1,grade_math_t2,grade_language_t2,grade_science_t2,treatment,date_of_birth
0,57,6,A,wvqgd@cunb.edu,1,1,2.046285,2.369783,2.711344,1.175309,1.498807,1.840368,1,1997-07-27
1,57,6,A,j0ihe@cunb.edu,1,2,7.859077,7.206984,6.994455,7.548812,6.896719,6.68419,1,1997-06-24
2,57,6,A,wcjgk@cunb.edu,2,1,7.118976,8.057449,8.57321,6.181019,7.119492,7.635253,1,1997-04-23


In [None]:
# len(df) # no. of rows
# df.describe()  # Basic descriptive statistics for each column (or GroupBy)
# df.info()
# df.shape
# df.dtypes # check data types in df
# df.school_id.nunique()
# df.school_id.unique()
df.school_id.value_counts()  # count number of rows per school_id

## Inspection

In [None]:
df.columns
df.shape
df.describe()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1510 entries, 0 to 1509
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   school_id          1510 non-null   int64  
 1   grade              1510 non-null   int64  
 2   class              1510 non-null   object 
 3   student_id         1510 non-null   object 
 4   sex                1510 non-null   int64  
 5   nationality        1510 non-null   int64  
 6   grade_math_t1      1510 non-null   float64
 7   grade_language_t1  1510 non-null   float64
 8   grade_science_t1   1510 non-null   float64
 9   grade_math_t2      1508 non-null   float64
 10  grade_language_t2  1507 non-null   float64
 11  grade_science_t2   1508 non-null   float64
 12  treatment          1510 non-null   int64  
 13  date_of_birth      1510 non-null   object 
dtypes: float64(6), int64(5), object(3)
memory usage: 165.3+ KB


## replace()

In [4]:
df.columns = df.columns.str.replace("grade_", "")
df

Unnamed: 0,school_id,grade,class,student_id,sex,nationality,math_t1,language_t1,science_t1,math_t2,language_t2,science_t2,treatment,date_of_birth
0,57,6,A,wvqgd@cunb.edu,1,1,2.046285,2.369783,2.711344,1.175309,1.498807,1.840368,1,1997-07-27
1,57,6,A,j0ihe@cunb.edu,1,2,7.859077,7.206984,6.994455,7.548812,6.896719,6.684190,1,1997-06-24
2,57,6,A,wcjgk@cunb.edu,2,1,7.118976,8.057449,8.573210,6.181019,7.119492,7.635253,1,1997-04-23
3,57,6,A,mzmqb@cunb.edu,1,1,6.973737,7.388008,6.628448,7.593102,8.007373,7.247812,1,1997-02-24
4,57,6,A,s6n0y@cunb.edu,1,1,6.574877,6.773626,7.844316,7.611065,7.809814,8.880504,1,1996-09-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1505,946,8,D,jj36e@cunb.edu,1,1,4.938375,5.306647,4.985917,6.162418,6.530690,6.209960,0,1995-07-04
1506,946,8,D,ca1dg@cunb.edu,1,1,7.661931,8.360153,7.937165,8.179364,8.877586,8.454598,0,1995-08-23
1507,946,8,D,amdrx@cunb.edu,1,1,9.248758,8.534791,8.056641,8.773601,8.059634,7.581484,0,1994-12-15
1508,946,8,D,yn5ug@cunb.edu,1,2,5.689228,5.071503,5.338547,6.382995,5.765270,6.032315,0,1994-09-18


## astype()

In [None]:
# reduce memory usage - spot categories
df.treatment.unique()
df.sex.unique()
df.nationality.unique()
df.grade.unique()
df["class"].unique()


array(['A', 'B', 'C', 'D'], dtype=object)

In [6]:
df_a = df.copy()
df_a[["grade", "class", "nationality", "sex", "treatment"]] = df_a[["grade", "class", "nationality", "sex", "treatment"]].astype("category")
df_a.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1510 entries, 0 to 1509
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   school_id      1510 non-null   int64   
 1   grade          1510 non-null   category
 2   class          1510 non-null   category
 3   student_id     1510 non-null   object  
 4   sex            1510 non-null   category
 5   nationality    1510 non-null   category
 6   math_t1        1510 non-null   float64 
 7   language_t1    1510 non-null   float64 
 8   science_t1     1510 non-null   float64 
 9   math_t2        1508 non-null   float64 
 10  language_t2    1507 non-null   float64 
 11  science_t2     1508 non-null   float64 
 12  treatment      1510 non-null   category
 13  date_of_birth  1510 non-null   object  
dtypes: category(5), float64(6), int64(1), object(2)
memory usage: 114.4+ KB


## to_datetime

In [13]:
df_a["date_of_birth"][:2]

0    1997-07-27
1    1997-06-24
Name: date_of_birth, dtype: object

In [15]:
df_a["date_of_birth"] = pd.to_datetime(df_a["date_of_birth"], errors="coerce")
df_a.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1510 entries, 0 to 1509
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   school_id      1510 non-null   int64         
 1   grade          1510 non-null   category      
 2   class          1510 non-null   category      
 3   student_id     1510 non-null   object        
 4   sex            1510 non-null   category      
 5   nationality    1510 non-null   category      
 6   math_t1        1510 non-null   float64       
 7   language_t1    1510 non-null   float64       
 8   science_t1     1510 non-null   float64       
 9   math_t2        1508 non-null   float64       
 10  language_t2    1507 non-null   float64       
 11  science_t2     1508 non-null   float64       
 12  treatment      1510 non-null   category      
 13  date_of_birth  1505 non-null   datetime64[ns]
dtypes: category(5), datetime64[ns](1), float64(6), int64(1), object(1)
memor

In [None]:
# df_a.groupby("school_id").count() # count of row per school
# round(df_a.math_t1.mean(), 2)
# df_a.math_t2.agg(["min", "max"])
# df_a.math_t2.max()
# df_a.math_t2.sort_values().tail(10)
# df_a.math_t2.sort_values(ascending=False).head(10)
# df_a.language_t1.sort_values(ascending=False).tail(10)
# df_a.language_t2.sort_values(ascending=False).head(10)

In [None]:
df[df.duplicated()]
df[df.duplicated(keep="last")]

Unnamed: 0,school_id,grade,class,student_id,sex,nationality,math_t1,language_t1,science_t1,math_t2,language_t2,science_t2,treatment,date_of_birth
39,57,6,C,hxddv@cunb.edu,1,1,8.478601,9.397897,9.220403,9.310329,10.0,10.0,1,1997-08-14
377,141,7,C,xbpfw@cunb.edu,1,2,3.551834,3.61451,3.360211,6.036923,6.099598,5.8453,1,1996-08-23
484,262,6,D,icmoa@cunb.edu,2,1,6.352546,6.408562,7.955574,7.455129,7.511146,9.058157,0,1997-05-15
509,262,7,B,kpoo9@cunb.edu,1,1,6.12044,5.928064,7.353246,3.811633,3.619257,5.044439,0,1995-09-14
551,262,8,B,q6c3l@cunb.edu,1,1,7.434781,7.259778,6.634327,9.370366,9.195363,8.569912,0,1995-06-20
690,426,7,D,jyj0s@cunb.edu,1,2,2.310155,2.757169,2.258084,3.836512,4.283526,3.784441,1,1995-12-19
739,426,8,C,9onh3@cunb.edu,1,1,5.514787,5.204074,5.131349,4.983463,4.672751,4.600025,1,1994-09-07
792,458,6,D,xnthj@cunb.edu,1,2,5.552781,6.110878,6.38448,7.558939,8.117036,8.390638,1,1996-11-02
1004,499,8,A,e9whz@cunb.edu,1,1,8.268938,7.926082,8.25887,9.584604,9.241748,9.574536,1,1995-01-06
1464,946,8,A,eohgp@cunb.edu,1,1,9.768719,10.0,8.493072,9.515601,9.746882,8.239953,0,1994-12-19


In [None]:
df_a.duplicated("student_id").sum() 


# show ALL duplicates with keep=False
df_a[df_a.duplicated("student_id", keep=False)].sort_values("student_id")  

df_b = df_a.drop_duplicates("student_id") # keeps first occurrence
df_b.duplicated().sum()

np.int64(0)

In [None]:
df_b.isin([99999.0]).sum()
df_b[df_b.isin([99999.0]).any(axis=1)]
df_b.describe().loc["max"]

school_id        946.0
math_t1           10.0
language_t1       10.0
science_t1        10.0
math_t2        99999.0
language_t2    99999.0
science_t2     99999.0
treatment          1.0
Name: max, dtype: float64

In [None]:
import numpy as np

df_c = df_b.copy()
df_c.math_t2 = df_b.math_t2.replace(99999.0, np.nan)
df_c.language_t2 = df_b.language_t2.replace(99999.0, np.nan)
df_c.science_t2 = df_b.science_t2.replace(99999.0, np.nan)

In [None]:
df_c.describe().loc["max"]

school_id      946.0
math_t1         10.0
language_t1     10.0
science_t1      10.0
math_t2         10.0
language_t2     10.0
science_t2      10.0
treatment        1.0
Name: max, dtype: float64

In [None]:
df_c.isna().sum()/df.count()

# df_c.isnull().sum() # same
# df_c[df_c.isnull().any(axis=1)] # show all rows with null

school_id        0.000000
grade            0.000000
class            0.000000
student_id       0.000000
sex              0.000000
nationality      0.000000
math_t1          0.000000
language_t1      0.000000
science_t1       0.000000
math_t2          0.003316
language_t2      0.003318
science_t2       0.003316
treatment        0.000000
date_of_birth    0.000000
dtype: float64

In [None]:
# df_b[df_b.isna().any(axis=1)]
# df_b[df_b.isna().any(axis=1)]

Unnamed: 0,school_id,grade,class,student_id,sex,nationality,math_t1,language_t1,science_t1,math_t2,language_t2,science_t2,treatment,date_of_birth
96,57,7,D,q2cfd@cunb.edu,1,1,5.492753,6.74379,6.164719,99999.0,6.838829,6.259758,1,1996-01-03
153,60,6,A,xtycv@cunb.edu,1,2,4.402209,4.207442,5.461627,5.159038,4.964271,99999.0,1,1996-10-09
169,60,6,C,pmeiz@cunb.edu,2,2,6.871348,7.954888,7.201834,10.0,10.0,99999.0,1,1997-06-19
404,141,8,B,owwrw@cunb.edu,2,2,5.146591,4.237231,4.902192,99999.0,5.97974,6.644701,1,1995-01-08
1178,812,8,B,pkiie@cunb.edu,1,2,6.973558,7.397025,8.026027,9.539605,9.963072,99999.0,1,1994-12-12
1202,812,8,D,l3xtc@cunb.edu,2,1,6.997463,7.474309,6.601118,6.705059,99999.0,6.308715,0,1995-01-12
1284,881,7,B,xhois@cunb.edu,1,1,7.222708,7.444067,7.647009,99999.0,7.353987,7.55693,1,1996-01-12
1440,946,7,D,10lcd@cunb.edu,1,1,6.882573,7.320166,7.075527,7.075174,99999.0,7.268128,0,1996-08-12


## drop rows

In [None]:
df1.drop(-1, inplace=True)  # drop row by index 
df1

Unnamed: 0,x,y,z
0.0,,17.0,9.0
1.0,1.0,2.0,3.0
1.5,0.0,100.0,0.0
3.0,4.0,,9.0
4.0,0.0,1.0,2.0


In [None]:
df1.drop([0, 3],  inplace=True) # drop a few rows by index
df1

Unnamed: 0,x,y,z
1.0,1.0,2.0,3.0
1.5,0.0,100.0,0.0
4.0,0.0,1.0,2.0


In [None]:
df1.drop(df1.index.max(), inplace=True) #drop row with highest index
df1.drop(df1.index.min(), inplace=True) # drop row with smallest index
df1

Unnamed: 0,x,y,z
1,0.0,1.0,2.0
2,1.0,2.0,3.0
3,0.0,,2.0
4,,19.0,2.0


## drop_duplicates()

In [None]:
df11= pd.DataFrame( [[90, 19, 2], [90, 19, 56], [100, 10, 56]], index=[8,9,10], columns=['x','y','z'])
df11.loc[8.5] = [90, 19, 2] # insert row with index between 8 and 9
df11 = df11.sort_index().reset_index(drop=True) # sort index, otherwise the index will be 8.5 and the rows are all over the place
df11

Unnamed: 0,x,y,z
0,90,19,2
1,90,19,2
2,90,19,56
3,100,10,56


In [None]:
df11.drop_duplicates() # drop identical rows

Unnamed: 0,x,y,z
0,90,19,2
2,90,19,56
3,100,10,56


In [None]:
df11.drop_duplicates('z') # drop rows with same values in one column

Unnamed: 0,x,y,z
0,90,19,2
2,90,19,56


## dropna

In [None]:
df1.dropna(subset='y')      # drop row with NA in a specific column

Unnamed: 0,x,y,z
1,0.0,1.0,2.0
2,1.0,2.0,3.0
4,,19.0,2.0


In [None]:
# Drop all rows that contain null values
# axis=1 would drop all coumns containing na 
df1.dropna(axis=0, inplace=True)
df1

Unnamed: 0,x,y,z
1,0.0,1.0,2.0
2,1.0,2.0,3.0
6,90.0,19.0,2.0
7,0.0,1.0,56.0
8,100.0,1.0,56.0


In [None]:
df1.dropna() 
print(df1)                                    # Drop all rows that contain null values, this is a passing operation
df1.dropna(inplace=True)        # this changes the df
df1

   x    y    z
1  1  2.0  3.0
2  4  NaN  NaN
3  7  8.0  9.0


Unnamed: 0,x,y,z
1,1,2.0,3.0
3,7,8.0,9.0
