#### Let's preprocess each column

In [1959]:
import pandas as pd
import numpy as np

### Reading the excel file

In [1960]:
df = pd.read_excel("../data/processed/preprocessed_data.xlsx")

In [1961]:
df.columns

Index(['id', 'acceptance_rate', 'institution', 'program', 'degree_type',
       'decision', 'undergrad_gpa', 'gre_quantitative_reasoning',
       'gre_verbal_reasoning', 'analytical_writing', 'notes', 'gre_total'],
      dtype='object')

### Checking the maximum and minimum acceptance rate range to see whether it is valid or not.

In [1962]:
df['acceptance_rate'] = pd.to_numeric(df['acceptance_rate'].str.replace('%', ''), errors='coerce')

In [1963]:
acceptance_rate_max = df['acceptance_rate'].max(skipna=True)
acceptance_rate_min = df['acceptance_rate'].min(skipna=True)

print("Max:", acceptance_rate_max)
print("Min:", acceptance_rate_min)


Max: 100.0
Min: 6.0


### Since the range is between 6% to 100%, the acceptance rate data seems fairly valid

_____________________________________________________________________________________________________________________________________________________________________________________

### Converting the gpa to numeric values

In [1964]:
df['undergrad_gpa'] = pd.to_numeric(df['undergrad_gpa'], errors='coerce')

### Checking the range of maximum and mininum of gpa

In [1965]:
undergrad_gpa_max = df['undergrad_gpa'].max(skipna=True)
undergrad_gpa_min = df['undergrad_gpa'].min(skipna=True)

print("Max:", undergrad_gpa_max)
print("Min:", undergrad_gpa_min)


Max: 332.0
Min: 0.01


### Valid GPA case 1

In [1966]:
gpa_greater_than_equal_to_one = df['undergrad_gpa'] >= 1.0
gpa_less_than_equal_to_four = df['undergrad_gpa'] <= 4.0
valid_gpa_between_one_and_four = df[gpa_greater_than_equal_to_one & gpa_less_than_equal_to_four]
len(valid_gpa_between_one_and_four)

44138

### Valid GPA Case 2
##### Sometimes the user may enter the gpa without the '.', hence we should check if someone has entered values between 100 to 332(since 332 is the vax value)

In [1967]:
gpa_greater_than_equal_to_hundred = df['undergrad_gpa'] >= 100
gpa_less_than_equal_to_four_hundred = df['undergrad_gpa'] < 401
valid_gpa_between_one_hundred_and_below_four_hundred = df.loc[(gpa_greater_than_equal_to_hundred) & (gpa_less_than_equal_to_four_hundred)]
valid_gpa_between_one_hundred_and_below_four_hundred

Unnamed: 0,id,acceptance_rate,institution,program,degree_type,decision,undergrad_gpa,gre_quantitative_reasoning,gre_verbal_reasoning,analytical_writing,notes,gre_total
23246,965397,,University of California,Informatics,PhD,Accepted,332.0,,,interviewed on 1/28,Acceptance,


In [1968]:
condition = gpa_greater_than_equal_to_hundred & gpa_less_than_equal_to_four_hundred

In [1969]:
df.loc[condition, 'undergrad_gpa'] = df.loc[condition, 'undergrad_gpa'] / 100

In [1970]:
fresh_mask = (df['undergrad_gpa'] >= 100) & (df['undergrad_gpa'] <= 400)
len(df.loc[fresh_mask])

0

### Invalid GPA

In [1971]:
# Step 1: Identify invalid GPA entries
invalid_gpa_mask = (
    (df['undergrad_gpa'] < 1.0) |
    (df['undergrad_gpa'] > 4.0) |  # You probably meant 4.0, not 400
    (df['undergrad_gpa'].isna())
)

In [1972]:
# Step 2: Generate random GPAs in a realistic range (around 3.8)
random_gpas = np.clip(
    np.random.normal(loc=3.8, scale=0.1, size=invalid_gpa_mask.sum()),
    3.6, 4.0
)

# Step 3: Replace invalid entries with these generated values
df.loc[invalid_gpa_mask, 'undergrad_gpa'] = random_gpas

In [1973]:
df['undergrad_gpa'].isnull().sum()

0

_____________________________________________________________________________________________________________________________________________________

### The GRE General Test has three scoring sections, each with its own score range:

#### Verbal Reasoning : Score range: 130 to 170

#### Quantitative Reasoning : Score range: 130 to 170

#### Analytical Writing : Score range: 0 to 6 

In [1974]:
# Convert GRE scores to numeric
list_of_gre_columns = ['gre_quantitative_reasoning', 'gre_verbal_reasoning']
df[list_of_gre_columns[0]] = pd.to_numeric(df[list_of_gre_columns[0]], errors = 'coerce')
df[list_of_gre_columns[1]] = pd.to_numeric(df[list_of_gre_columns[1]], errors = 'coerce')

In [1975]:
gre_quantitative_reasoning_min = df[list_of_gre_columns[0]] > 129.0
gre_quantitative_reasoning_max = df[list_of_gre_columns[0]] < 171.0
valid_gre_quantitative_reasoning_profiles = df[gre_quantitative_reasoning_min & gre_quantitative_reasoning_max]
len(valid_gre_quantitative_reasoning_profiles)

10687

In [1976]:
gre_verbal_reasoning_min = df[list_of_gre_columns[1]] > 129.0
gre_verbal_reasoning_max = df[list_of_gre_columns[1]] < 171.0
valid_gre_quantitative_reasoning_profiles = df[gre_quantitative_reasoning_min & gre_quantitative_reasoning_max]
len(valid_gre_quantitative_reasoning_profiles)

10687

### Approach
Maybe a person entered their entire score instead of gre general. In that case the score boundary should be within 260 to 340. Else if it exceeds that then the same is neglected.

In [1977]:
invalid_gre_quantitative_reasoning_profiles = df[~(gre_quantitative_reasoning_min & gre_quantitative_reasoning_max) & ~df[list_of_gre_columns[0]].isna()]
len(invalid_gre_quantitative_reasoning_profiles)

1568

In [1978]:
invalid_gre_verbal_reasoning_profiles = df[~(gre_verbal_reasoning_min & gre_verbal_reasoning_max) & ~df[list_of_gre_columns[1]].isna()]
len(invalid_gre_verbal_reasoning_profiles)

68

#### Assigning values from 260 to 340 to gre_total column

In [1979]:
df['gre_total'] = df.loc[(df[list_of_gre_columns[0]] >= 260) & (df[list_of_gre_columns[0]] <= 340), list_of_gre_columns[0]]

In [1980]:
df['gre_total'] = df.loc[(df[list_of_gre_columns[1]] >= 260) & (df[list_of_gre_columns[1]] <= 340), list_of_gre_columns[1]]

#### Removing values 260 to 340 from gre_general column to gre_total column

In [1981]:
df.loc[(df[list_of_gre_columns[0]] >= 260) & (df[list_of_gre_columns[0]] <= 340), list_of_gre_columns[0]] = 0

In [1982]:
df.loc[(df[list_of_gre_columns[1]] >= 260) & (df[list_of_gre_columns[1]] <= 340), list_of_gre_columns[1]] = 0

### Re-checking if there are any values between 260 and 340

In [1983]:
len(df.loc[(df[list_of_gre_columns[0]] >= 260) & (df[list_of_gre_columns[0]] <= 340)])

0

In [1984]:
len(df.loc[(df[list_of_gre_columns[1]] >= 260) & (df[list_of_gre_columns[1]] <= 340)])

0

### Checking rows with values :
#### Below 130 
#### Above 340
#### Between 170 to 260

#### Checking for quantitative values

In [1985]:
df[list_of_gre_columns[0]] = pd.to_numeric(df[list_of_gre_columns[0]], errors='coerce')

In [None]:
df[list_of_gre_columns[0]].isna().sum()

88370

In [1994]:
(~df[list_of_gre_columns[0]].isna()).sum()

12255

In [1995]:
rows_above_340_gre_quantitative_reasoning = (
    (df[list_of_gre_columns[0]] > 340) |
    (df[list_of_gre_columns[0]] < 130) |
    (df[list_of_gre_columns[0]] >= 170) |
    (df[list_of_gre_columns[0]] < 260)  # This overlaps with < 130 and < 170
).sum()

print(f"Invalid rows for {list_of_gre_columns}: {rows_above_340_gre_quantitative_reasoning}")

Invalid rows for ['gre_quantitative_reasoning', 'gre_verbal_reasoning']: 12255


#### Checking for verbal values

In [1987]:
rows_above_340_gre_verbal_reasoning = (df[list_of_gre_columns[1]] >= 260).sum()
rows_above_340_gre_verbal_reasoning

8

In [None]:
# df.loc[df[list_of_gre_columns[0] > 260]]

TypeError: '>' not supported between instances of 'str' and 'int'

In [None]:
gre_quantitative_reasoning_above_170_and_below_260 = df.loc[(df['gre_quantitative_reasoning'] > 170) & (df['gre_quantitative_reasoning'] < 260), 'gre_quantitative_reasoning']
len(gre_quantitative_reasoning_above_170_and_below_260)

12

In [None]:
df = pd.to_numeric(df['gre_verbal_reasoning'])

### Checking valid GRE Verbal scores

In [None]:
df['gre_score_verbal_max'] 

KeyError: 'gre_score_verbal_max'

In [None]:
condition = (df['undergrad_gpa'] >= 130) & (df['undergrad_gpa'] < 171)

In [None]:
df['gre_verbal'] = pd.to_numeric(df['gre_verbal'], errors = 'coerce')

In [None]:
gre_score_general_min = df['gre_verbal'] >= 130
gre_score_general_max = df['gre_verbal'] <= 170

In [None]:
df['analytical_writing'] = pd.to_numeric(df['analytical_writing'], errors = 'coerce')

In [None]:
gre_analytical_writing_min = df['analytical_writing'] >= 0.0
gre_analytical_writing_max = df['analytical_writing'] >= 6.0

### Correlation between different variables and their visualisation

In [None]:
# Convert relevant fields
df['decision_encoded'] = df['decision'].map({
    'Accepted': 1,
    'Rejected': 0,
    'Interview': 0.5,  # optional: if you want to include partial decision stages
    'Waitlisted': 0.25,
    'Other': None
})
