In [1]:
# Load libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the input data: read the college scorecard CSV into the working DataFrame `df`.
# The path is relative, so it is resolved against the kernel's current working directory.
df = pd.read_csv("../data/college_scorecard.csv")

In [3]:
# Get the basic information of the input dataframe:
# row count, column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   institution_name  800 non-null    object 
 1   major_field       768 non-null    object 
 2   degree_level      800 non-null    object 
 3   median_earnings   800 non-null    int64  
 4   employment_rate   789 non-null    float64
 5   debt_median       776 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 37.6+ KB


In [4]:
# Count the missing (NaN/None) entries per column.
df.isna().sum()

institution_name     0
major_field         32
degree_level         0
median_earnings      0
employment_rate     11
debt_median         24
dtype: int64

In [5]:
# Fill the NaN values in major_field with the column's mode
# (the first entry of the Series returned by mode()).
major_field_mode = df['major_field'].mode().iloc[0]
df['major_field'] = df['major_field'].fillna(value=major_field_mode)

In [6]:
# Replace employment_rate values outside the closed interval [0, 1] with NaN.
# Series.between is inclusive on both ends by default and is False for NaN, so
# entries that are already missing stay missing; Series.where keeps the rows where
# the mask is True and writes NaN everywhere else.
# This is the vectorised equivalent of a row-by-row apply(lambda ...).
df['employment_rate'] = df['employment_rate'].where(df['employment_rate'].between(0, 1))

In [7]:
# Group-wise median imputation of employment_rate (by major_field + degree_level).
#
# 1. Only values inside [0, 1] may contribute to a group's median, so anything
#    else is masked to NaN in a temporary copy (median skips NaN).
# 2. The built-in 'median' aggregation broadcasts each group's median back onto
#    the original row index, without a Python-level lambda per group.
# 3. fillna() writes the group median only into rows that are currently missing;
#    rows that already hold a value are never overwritten.
#    Groups with no valid value have a NaN median, so their gaps remain NaN.
valid_employment_rate = df['employment_rate'].where(df['employment_rate'].between(0, 1))
employment_rate_group_median = valid_employment_rate.groupby(
    [df['major_field'], df['degree_level']]
).transform('median')
df['employment_rate'] = df['employment_rate'].fillna(employment_rate_group_median)

In [8]:
# Convert the debt_median column from object to a numeric dtype.
# errors='coerce' turns every entry that cannot be parsed as a number into NaN,
# so the number of missing values in this column may grow after this step.
df['debt_median'] = df['debt_median'].pipe(pd.to_numeric, errors='coerce')

In [9]:
# Replace NaN in debt_median with the median of the row's
# (major_field, degree_level) group.
# transform('median') returns one value per original row, aligned on the index,
# so it can be passed straight to fillna().
debt_median_values = (
    df['debt_median']
    .groupby([df['major_field'], df['degree_level']])
    .transform('median')
)
df['debt_median'] = df['debt_median'].fillna(value=debt_median_values)

In [10]:
# Earnings should not be negative. Keep only the rows with strictly positive
# median_earnings in a separate frame, so the distribution of the valid values
# (skewed vs. normal) can be examined.
positive_earning_df = df[df['median_earnings'].gt(0)]

In [11]:
# Set non-positive median_earnings (zero as well as negative values) to NaN so
# they do not influence the medians used for imputation.
# Series.where keeps the rows where the condition holds and writes NaN elsewhere;
# this is the vectorised equivalent of a row-by-row apply(lambda ...).
# The column becomes float64 once any value is masked, because NaN cannot be
# stored in an int64 column.
df['median_earnings'] = df['median_earnings'].where(df['median_earnings'] > 0)

In [12]:
# Group-wise median imputation of median_earnings (by major_field + degree_level).
# The built-in 'median' aggregation (NaN is skipped) is broadcast back onto the
# original row index, and fillna() writes it only into rows that are currently
# missing, so rows that already hold a value are never overwritten.
# Groups with no observed value have a NaN median, so their gaps remain NaN.
earnings_group_median = (
    df.groupby(['major_field', 'degree_level'])['median_earnings'].transform('median')
)
df['median_earnings'] = df['median_earnings'].fillna(earnings_group_median)

In [13]:
# Verify that every missing value has been filled:
# each column's count below should be 0.
df.isna().sum()

institution_name    0
major_field         0
degree_level        0
median_earnings     0
employment_rate     0
debt_median         0
dtype: int64