#### Let's preprocess each column

In [213]:
import pandas as pd

In [214]:
df = pd.read_excel("../data/processed/preprocessed_data.xlsx")

In [215]:
acceptance_rate_max = df['acceptance_rate_clean'].max(skipna=True)
acceptance_rate_min = df['acceptance_rate_clean'].min(skipna=True)

print("Max:", acceptance_rate_max)
print("Min:", acceptance_rate_min)


Max: 100.0
Min: 6.0


In [216]:
df['undergrad_gpa'] = pd.to_numeric(df['undergrad_gpa'], errors='coerce')

In [217]:
undergrad_gpa_max = df['undergrad_gpa'].max(skipna=True)
undergrad_gpa_min = df['undergrad_gpa'].min(skipna=True)

print("Max:", undergrad_gpa_max)
print("Min:", undergrad_gpa_min)


Max: 332.0
Min: 0.01


In [218]:
gpa_greater_than_equal_to_one = df.undergrad_gpa >= 1.0
gpa_less_than_equal_to_four = df.undergrad_gpa <= 4.0


In [219]:
valid_gpa_profiles = df[gpa_greater_than_equal_to_one & gpa_less_than_equal_to_four]
len(valid_gpa_profiles)

44138

In [220]:
invalid_gpa_profiles = df[~(gpa_greater_than_equal_to_one & gpa_less_than_equal_to_four)]
len(invalid_gpa_profiles)

56488

#### Sometimes the user may enter the gpa without the '.', hence we should check if someone has entered values between 100 to 332(since 332 is the vax value)

In [221]:
gpa_greater_than_equal_to_hundred = df['undergrad_gpa'] >= 100
gpa_less_than_equal_to_three_three_three = df['undergrad_gpa'] < 333
valid_gpa_profiles_above_hundred = df[gpa_greater_than_equal_to_hundred & gpa_less_than_equal_to_three_three_three]
valid_gpa_profiles_above_hundred
condition = (df['undergrad_gpa'] >= 100) & (df['undergrad_gpa'] < 332)

In [222]:
df['undergrad_gpa'] = df['undergrad_gpa'].mask(df['undergrad_gpa'] == condition, df['undergrad_gpa'] / 100)

In [223]:
valid_gpa_profiles_above_hundred = df[gpa_greater_than_equal_to_hundred & gpa_less_than_equal_to_three_three_three]
len(valid_gpa_profiles_above_hundred)

1

### The GRE General Test has three scored sections, each with its own score range:

#### Verbal Reasoning : Score range: 130 to 170

#### Quantitative Reasoning : Score range: 130 to 170

#### Analytical Writing : Score range: 0 to 6 

In [224]:
df['gre_general'] = pd.to_numeric(df['gre_general'], errors = 'coerce')

In [225]:
gre_score_general_min = df['gre_general'] >= 130.0
gre_score_general_max = df['gre_general'] <= 170.0
valid_gre_general_profiles = df[gre_score_general_min & gre_score_general_max]
len(valid_gre_general_profiles)

10687

### Approach
Maybe a person entered their entire score instead of gre general. In that case the score boundary should be within 260 to 340. Else if it exceeds that then the same is neglected.

In [238]:
invalid_gre_general_profiles = df[~(gre_score_general_min & gre_score_general_max) & ~df['gre_general'].isna()]
len(invalid_gre_general_profiles)

1379

In [None]:
gre_general_wrong_assigned_valid_scores = (df['gre_general'] >= 260) & (df['gre_general'] <= 340)
len(gre_general_wrong_assigned_valid_scores)

0         False
1         False
2         False
3         False
4         False
          ...  
100621    False
100622    False
100623    False
100624    False
100625    False
Name: gre_general, Length: 100626, dtype: bool

#### Assigning values from 260 to 340 to gre_total column

In [227]:
df['gre_total'] = df.loc[(df['gre_general'] >= 260) & (df['gre_general'] <= 340), 'gre_general']

#### Removing values from 260 to 340 to gre_total column

In [228]:
df.loc[(df['gre_general'] >= 260) & (df['gre_general'] <= 340), 'gre_general'] = 0

In [237]:
invalid_gre_general_profiles = df[~(gre_score_general_min & gre_score_general_max) & ~df['gre_general'].isna()]
len(invalid_gre_general_profiles)

1379

In [230]:
gre_score_verbal_min = df['gre_general'] >= 130
gre_score_verbal_max = df['gre_general'] <= 170
valid_gre_verbal_profiles = df[gre_score_verbal_min & gre_score_verbal_max]
len(valid_gre_verbal_profiles)

10687

In [231]:
condition = (df['undergrad_gpa'] >= 130) & (df['undergrad_gpa'] < 171)

In [232]:
df['gre_verbal'] = pd.to_numeric(df['gre_verbal'], errors = 'coerce')

In [233]:
gre_score_general_min = df['gre_verbal'] >= 130
gre_score_general_max = df['gre_verbal'] <= 170

In [234]:
df['analytical_writing'] = pd.to_numeric(df['analytical_writing'], errors = 'coerce')

In [235]:
gre_analytical_writing_min = df['analytical_writing'] >= 0.0
gre_analytical_writing_max = df['analytical_writing'] >= 6.0

### Correlation between different variables and their visualisation

In [236]:
# Convert relevant fields
df['decision_encoded'] = df['decision'].map({
    'Accepted': 1,
    'Rejected': 0,
    'Interview': 0.5,  # optional: if you want to include partial decision stages
    'Waitlisted': 0.25,
    'Other': None
})
