In [143]:
import pandas as pd
from ydata_profiling import ProfileReport
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

## Loading the new dataset

In [144]:
# Read the workbook from the processed-data folder into the working DataFrame.
df = pd.read_excel(io="../data/processed/loaded_data.xlsx")

## Pre-process the data: identify and handle the missing and duplicate values in the dataset

### Handling Missing values

In [145]:
# Per-column count of missing values in the freshly loaded data.
df.isna().sum()

id                       17
acceptance_rate        7923
institution              25
program                1242
degree_type            1177
decision               1180
undergrad_gpa         62618
gre_general           96604
gre_verbal            97836
analytical_writing    98529
notes                 45900
gre_total             97998
dtype: int64

### Columns that should NOT be missing (ideally):
id
→ Must be present and unique for every record.

institution
→ The university/school applied to — essential info.
🔸 Should not be missing.

program
→ Program applied to (e.g., MS CS) — needed for analysis.
🔸 Should not be missing.

degree_type
→ Such as MS, PhD, etc. — core info.
🔸 Should not be missing.

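decision
→ Admission result (e.g., Accepted, Rejected) — central to analysis.
🔸 Should not be missing.

acceptance_rate
→ Not one of the core identifying fields above, but rows missing it are also dropped in the next cell.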



In [146]:
# Drop every row that is missing a field the analysis cannot do without.
# 'id' is listed explicitly: it is documented above as a must-have column, and
# relying on its missing rows happening to be removed via the other columns
# would silently break on a different extract of the data.
df = df.dropna(
    subset=['id', 'acceptance_rate', 'institution', 'program', 'degree_type', 'decision']
)

In [147]:
# Re-count missing values to confirm the required columns are now complete.
df.isna().sum()

id                        0
acceptance_rate           0
institution               0
program                   0
degree_type               0
decision                  0
undergrad_gpa         56143
gre_general           88676
gre_verbal            89848
analytical_writing    90504
notes                 42228
gre_total             90003
dtype: int64

In [148]:
# (rows, columns) remaining after rows with missing required fields were dropped.
df.shape

(100963, 12)

In [149]:
# Step 1: Collect every row whose id occurs more than once.
# keep=False flags all occurrences, not only the second and later ones.
duplicate_ids = df.loc[df.duplicated(subset='id', keep=False)]

In [150]:
# Convert id to string so sorting and grouping compare values of one type.
# .assign builds a new frame instead of writing through .loc into a frame that
# was produced by filtering df, which avoids SettingWithCopyWarning and the
# in-place dtype-setting warnings raised by newer pandas versions.
# NOTE(review): duplicates were detected on the raw id values before this
# conversion; if the column mixes numbers and strings, equal ids of different
# types would not have been flagged — confirm against the source data.
duplicate_ids = duplicate_ids.assign(id=duplicate_ids['id'].astype(str))

# Step 2: Sort by 'id' to group duplicates together
duplicate_ids_sorted = duplicate_ids.sort_values(by='id')


In [151]:
# Step 3: Walk through each duplicated id and print its rows together
# so they can be compared manually.
for dup_id, group in duplicate_ids_sorted.groupby('id'):
    # Skip any id that turns out to have a single row.
    if len(group) <= 1:
        continue
    print(f"\nDuplicate ID: {dup_id}")
    print(group)


Duplicate ID: 817731
           id acceptance_rate                      institution  \
83003  817731             44%  Georgia Institute Of Technology   
82983  817731             44%  Georgia Institute Of Technology   

                program degree_type  decision undergrad_gpa gre_general  \
83003  Computer Science     Masters  Rejected           3.6         165   
82983  Computer Science     Masters  Rejected           3.6         165   

      gre_verbal analytical_writing notes gre_total  
83003        159                  4   NaN       324  
82983        159                  4   NaN       324  

Duplicate ID: 817732
           id acceptance_rate        institution           program  \
82982  817732             45%  Purdue University  Computer Science   
83002  817732             45%  Purdue University  Computer Science   

      degree_type  decision undergrad_gpa gre_general gre_verbal  \
82982     Masters  Accepted           NaN         NaN        NaN   
83002     Masters  Acc

In [152]:
# Drop rows that repeat an earlier row in every column; the first
# occurrence of each such row is the one that stays.
df = df.drop_duplicates(keep='first')


In [153]:
# Summary (count / unique / top / freq per column) after exact duplicate rows
# were removed. If 'unique' for id is below 'count', some ids still repeat.
df.describe()

Unnamed: 0,id,acceptance_rate,institution,program,degree_type,decision,undergrad_gpa,gre_general,gre_verbal,analytical_writing,notes,gre_total
count,100629,100629,100629,100629,100629,100629,44646,12256,11095,10435,58553,10940
unique,100626,172,1130,7910,9,9,348,143,86,98,54280,185
top,964029,40%,University of California,Computer Science,PhD,Accepted,4,170,162,4,Email to check portal,330
freq,2,8486,4081,7527,64772,37248,5320,2106,793,2978,264,485


In [154]:
# Number of rows whose id already appeared in an earlier row.
print(df['id'].duplicated(keep='first').sum())

3


In [155]:
# Recompute the rows that still share an id now that exact duplicates are gone.
duplicate_ids = df.loc[df.duplicated(subset='id', keep=False)]


In [156]:
# Keep one row per id; the default keep='first' retains the earliest row.
# NOTE(review): the rows dropped here share an id but differ in at least one
# other column (exact duplicates were removed earlier), so which row survives
# depends on row order — confirm that is acceptable.
df = df.drop_duplicates(subset=['id'])

In [157]:
# Sanity check: no id should repeat after de-duplicating on id.
print(df['id'].duplicated(keep='first').sum())

0


In [158]:
# Final summary after de-duplicating on id; 'unique' for id should now
# equal its 'count'.
df.describe()

Unnamed: 0,id,acceptance_rate,institution,program,degree_type,decision,undergrad_gpa,gre_general,gre_verbal,analytical_writing,notes,gre_total
count,100626,100626,100626,100626,100626,100626,44643,12256,11095,10435,58553,10940
unique,100626,172,1130,7910,9,9,348,143,86,98,54280,185
top,879550,40%,University of California,Computer Science,PhD,Accepted,4,170,162,4,Email to check portal,330
freq,1,8486,4080,7526,64771,37245,5320,2106,793,2978,264,485


In [192]:
# Persist the cleaned frame without writing the pandas index as a column.
# NOTE(review): this cell's execution count (192) does not follow the previous
# cell (158); re-run the notebook top to bottom before relying on the saved file.
df.to_excel(excel_writer="../data/processed/preprocessed_data.xlsx", index=False)