#### Let's preprocess each column

In [7044]:
import pandas as pd
import numpy as np
import plotly.express as px
import random
from scipy.stats import truncnorm

### Reading the excel file

In [7045]:
df = pd.read_excel("../data/processed/preprocessed_data.xlsx")

In [7046]:
# # Step 1: Map unique institutions to their known (non-null) acceptance rates
# institution_acceptance_map = (
#     df[df['acceptance_rate'].notnull()]
#     .groupby('institution')['acceptance_rate']
#     .first()
#     .to_dict()
# )

# # Step 2: Find rows where acceptance_rate is null
# null_acceptance_rows = df[df['acceptance_rate'].isnull()]

# # Step 3 & 4: Fill missing acceptance rates based on institution name
# df.loc[df['acceptance_rate'].isnull(), 'acceptance_rate'] = df.loc[
#     df['acceptance_rate'].isnull(), 'institution'
# ].map(institution_acceptance_map)

# # Institutions with missing acceptance_rate
# institutions_with_nulls = set(df[df['acceptance_rate'].isnull()]['institution'])

# # Institutions for which we have a known acceptance_rate
# institutions_with_data = set(institution_acceptance_map.keys())

# # See which institutions could not be matched
# unmatched_institutions = institutions_with_nulls - institutions_with_data
# print(f"Unmatched institutions: {len(unmatched_institutions)}")


In [7047]:
df.columns

Index(['id', 'acceptance_rate', 'institution', 'program', 'degree_type',
       'decision', 'undergrad_gpa', 'gre_quantitative_reasoning',
       'gre_verbal_reasoning', 'analytical_writing', 'notes', 'gre_total'],
      dtype='object')

### Checking the maximum and minimum acceptance rate range to see whether it is valid or not.

In [7048]:
df['acceptance_rate'] = df['acceptance_rate'].astype(str).str.replace('%', '', regex=False)
df['acceptance_rate'] = pd.to_numeric(df['acceptance_rate'], errors='coerce')

In [7049]:
acceptance_rate_max = df['acceptance_rate'].max(skipna=True)
acceptance_rate_min = df['acceptance_rate'].min(skipna=True)

print("Max:", acceptance_rate_max)
print("Min:", acceptance_rate_min)


Max: 100.0
Min: 0.06


### Since the range is between 0.06% to 100%, the acceptance rate data seems fairly valid

_____________________________________________________________________________________________________________________________________________________________________________________

### Converting the gpa to numeric values

In [7050]:
df['undergrad_gpa'] = pd.to_numeric(df['undergrad_gpa'], errors='coerce')

### Checking the range of maximum and mininum of gpa

In [7051]:
undergrad_gpa_max = df['undergrad_gpa'].max(skipna=True)
undergrad_gpa_min = df['undergrad_gpa'].min(skipna=True)

print("Max:", undergrad_gpa_max)
print("Min:", undergrad_gpa_min)


Max: 332.0
Min: 0.01


### Valid GPA case 1

In [7052]:
gpa_greater_than_equal_to_one = df['undergrad_gpa'] >= 1.0
gpa_less_than_equal_to_four = df['undergrad_gpa'] <= 4.0
valid_gpa_between_one_and_four = df[gpa_greater_than_equal_to_one & gpa_less_than_equal_to_four]
len(valid_gpa_between_one_and_four)

44138

### Valid GPA Case 2
##### Sometimes the user may enter the gpa without the '.', hence we should check if someone has entered values between 100 to 332(since 332 is the vax value)

In [7053]:
gpa_greater_than_equal_to_hundred = df['undergrad_gpa'] >= 100
gpa_less_than_equal_to_four_hundred = df['undergrad_gpa'] < 401
valid_gpa_between_one_hundred_and_below_four_hundred = df.loc[(gpa_greater_than_equal_to_hundred) & (gpa_less_than_equal_to_four_hundred)]
valid_gpa_between_one_hundred_and_below_four_hundred

Unnamed: 0,id,acceptance_rate,institution,program,degree_type,decision,undergrad_gpa,gre_quantitative_reasoning,gre_verbal_reasoning,analytical_writing,notes,gre_total
23246,965397,0.32,University of California,Informatics,PhD,Accepted,332.0,,,interviewed on 1/28,Acceptance,


In [7054]:
condition = gpa_greater_than_equal_to_hundred & gpa_less_than_equal_to_four_hundred

In [7055]:
df.loc[condition, 'undergrad_gpa'] = df.loc[condition, 'undergrad_gpa'] / 100

In [7056]:
fresh_mask = (df['undergrad_gpa'] >= 100) & (df['undergrad_gpa'] <= 400)
len(df.loc[fresh_mask])

0

### Invalid GPA

In [7057]:
# Step 1: Identify invalid GPA entries
invalid_gpa_mask = (
    (df['undergrad_gpa'] < 1.0) |
    (df['undergrad_gpa'] > 4.0) |  # You probably meant 4.0, not 400
    (df['undergrad_gpa'].isna())
)

In [7058]:
# Step 2: Generate random GPAs in a realistic range (around 3.8)
random_gpas = np.clip(
    np.random.normal(loc=3.8, scale=0.1, size=invalid_gpa_mask.sum()),
    3.6, 4.0
)

# Step 3: Replace invalid entries with these generated values
df.loc[invalid_gpa_mask, 'undergrad_gpa'] = random_gpas

In [7059]:
df['undergrad_gpa'].isnull().sum()

0

_____________________________________________________________________________________________________________________________________________________

### The GRE General Test has three scoring sections, each with its own score range:

#### Verbal Reasoning : Score range: 130 to 170

#### Quantitative Reasoning : Score range: 130 to 170

#### Analytical Writing : Score range: 0 to 6 

In [7060]:
# Convert GRE scores to numeric
list_of_gre_columns = ['gre_quantitative_reasoning', 'gre_verbal_reasoning', 'analytical_writing']
df[list_of_gre_columns[0]] = pd.to_numeric(df[list_of_gre_columns[0]], errors = 'coerce')
df[list_of_gre_columns[1]] = pd.to_numeric(df[list_of_gre_columns[1]], errors = 'coerce')

In [7061]:
gre_quantitative_reasoning_min = df[list_of_gre_columns[0]] > 129.0
gre_quantitative_reasoning_max = df[list_of_gre_columns[0]] < 171.0
valid_gre_quantitative_reasoning_profiles = df[gre_quantitative_reasoning_min & gre_quantitative_reasoning_max]
len(valid_gre_quantitative_reasoning_profiles)

10687

In [7062]:
gre_verbal_reasoning_min = df[list_of_gre_columns[1]] > 129.0
gre_verbal_reasoning_max = df[list_of_gre_columns[1]] < 171.0
valid_gre_verbal_reasoning_profiles = df[gre_verbal_reasoning_min & gre_verbal_reasoning_max]
len(valid_gre_verbal_reasoning_profiles)

11026

### Approach
Maybe a person entered their entire score instead of gre general. In that case the score boundary should be within 260 to 340. Else if it exceeds that then the same is neglected.

In [7063]:
invalid_gre_quantitative_reasoning_profiles = df[~(gre_quantitative_reasoning_min & gre_quantitative_reasoning_max) & ~df[list_of_gre_columns[0]].isna()]
len(invalid_gre_quantitative_reasoning_profiles)

1568

In [7064]:
invalid_gre_verbal_reasoning_profiles = df[~(gre_verbal_reasoning_min & gre_verbal_reasoning_max) & ~df[list_of_gre_columns[1]].isna()]
len(invalid_gre_verbal_reasoning_profiles)

68

#### Assigning values from 260 to 340 to gre_total column

In [7065]:
df['gre_total'] = df.loc[(df[list_of_gre_columns[0]] >= 260) & (df[list_of_gre_columns[0]] <= 340), list_of_gre_columns[0]]

In [7066]:
df['gre_total'] = df.loc[(df[list_of_gre_columns[1]] >= 260) & (df[list_of_gre_columns[1]] <= 340), list_of_gre_columns[1]]

#### Removing values 260 to 340 from gre_general column to gre_total column

In [7067]:
df.loc[(df[list_of_gre_columns[0]] >= 260) & (df[list_of_gre_columns[0]] <= 340), list_of_gre_columns[0]] = 0

In [7068]:
df.loc[(df[list_of_gre_columns[1]] >= 260) & (df[list_of_gre_columns[1]] <= 340), list_of_gre_columns[1]] = 0

### Re-checking if there are any values between 260 and 340

In [7069]:
len(df.loc[(df[list_of_gre_columns[0]] >= 260) & (df[list_of_gre_columns[0]] <= 340)])

0

In [7070]:
len(df.loc[(df[list_of_gre_columns[1]] >= 260) & (df[list_of_gre_columns[1]] <= 340)])

0

## Filling in empty values for GRE quantitative reasoning

### Histogram for GRE Quantitative reasoning

In [7071]:
median_val = df[list_of_gre_columns[0]].median()
mode_val = df[list_of_gre_columns[0]].mode()[0]  # mode() returns a Series
mean_val = df[list_of_gre_columns[0]].mean()

print(f"Median: {median_val}, Mode: {mode_val}, Mean: {mean_val}")


Median: 165.0, Mode: 170.0, Mean: 144.23215014279884


### Checking rows with values :
#### Below 130 
#### Above 340
#### Between 170 to 260

#### Checking for quantitative values

In [7072]:
median_val = df[list_of_gre_columns[0]].median()
mode_val = df[list_of_gre_columns[0]].mode()[0]  # mode() returns a Series
mean_val = df[list_of_gre_columns[0]].mean()

print(f"Median: {median_val}, Mode: {mode_val}, Mean: {mean_val}")


Median: 165.0, Mode: 170.0, Mean: 144.23215014279884


In [7073]:
rows_above_340_gre_quantitative_reasoning = (
    (df[list_of_gre_columns[0]] > 340) |
    (df[list_of_gre_columns[0]] < 130) |
    ((df[list_of_gre_columns[0]] > 170) &
    (df[list_of_gre_columns[0]] < 340))  # This overlaps with < 130 and < 170
).sum()

print(f"Invalid rows for {list_of_gre_columns}: {rows_above_340_gre_quantitative_reasoning}")

Invalid rows for ['gre_quantitative_reasoning', 'gre_verbal_reasoning', 'analytical_writing']: 1568


In [7074]:
df.loc[
    (df[list_of_gre_columns[0]] < 130) |
    (df[list_of_gre_columns[0]] > 170),
    list_of_gre_columns[0]
] = 0

In [7075]:
df.loc[(df[list_of_gre_columns[0]] > 340) | (df[list_of_gre_columns[0]] < 130) | ((df[list_of_gre_columns[0]] > 170) & (df[list_of_gre_columns[0]] < 340))]

Unnamed: 0,id,acceptance_rate,institution,program,degree_type,decision,undergrad_gpa,gre_quantitative_reasoning,gre_verbal_reasoning,analytical_writing,notes,gre_total
346,875371,36.0,University of Wisconsin,"Russian, East European And Central Asian Studies",Masters,Accepted,3.440000,0.0,161.0,5,,
389,873422,25.0,California Institute of Technology,Chemistry,PhD,Accepted,3.920000,0.0,170.0,99.99,,
927,875041,34.0,Johns Hopkins University,Electrical And Computer Engineering,PhD,Accepted,3.900000,0.0,168.0,99.99,Good Luck!,
1198,873010,46.0,Indiana University,Biostatistics,PhD,Interview,3.850000,0.0,154.0,3,,
1246,874863,50.0,Virginia Tech,Mathematics,PhD,Accepted,3.900000,0.0,165.0,99.99,,
...,...,...,...,...,...,...,...,...,...,...,...,...
99163,741573,51.0,Arizona State University,Behavioral Neuroscience,PhD,Accepted,3.633877,0.0,167.0,99.99,Fully funded! Super glad!,
99178,741552,38.0,Carnegie Mellon University,MIIPS,Masters,Accepted,3.720000,0.0,154.0,99.99,I was waitlisted in March but now I am accepte...,
99344,741315,39.0,Virginia Commonwealth University,Genetic Counseling,Masters,Accepted,3.630000,0.0,92.0,4,"Applied to 6, interviewed at 4",
100027,740314,40.0,West Virginia University,Accounting,PhD,Rejected,3.970000,0.0,0.0,,,315.0


In [7076]:
fig = px.histogram(valid_gre_quantitative_reasoning_profiles, x=list_of_gre_columns[0], nbins=30,
                   title=f'Distribution of Valid {list_of_gre_columns[0]} Scores')
fig.show()

In [7077]:
# Step 1: Get indices of rows where GRE score is missing or invalid
target_mask = (df['gre_quantitative_reasoning'].isna()) | (df['gre_quantitative_reasoning'] == 0)
target_indices = df[target_mask].index
missing_count = len(target_indices)  # Update count based on actual target rows

# Step 2–4: Generate tiered imputed values
low_pct = 0.40
mid_pct = 0.20
high_pct = 0.40

low_count = int(low_pct * missing_count)
mid_count = int(mid_pct * missing_count)
high_count = missing_count - low_count - mid_count  # ensure total matches

low_vals = np.random.uniform(130, 140, low_count)
mid_vals = np.random.uniform(140, 160, mid_count)
high_vals = np.random.uniform(165, 170, high_count)

combined_vals = np.concatenate([low_vals, mid_vals, high_vals])
rounded_vals = np.round(combined_vals)
rounded_vals[rounded_vals == 130] = 131

# Step 5: Assign back
df.loc[target_indices, 'gre_quantitative_reasoning'] = rounded_vals


In [7078]:
median_val = df[list_of_gre_columns[0]].median()
mode_val = df[list_of_gre_columns[0]].mode()[0]  # mode() returns a Series
mean_val = df[list_of_gre_columns[0]].mean()

print(f"Median: {median_val}, Mode: {mode_val}, Mean: {mean_val}")


Median: 154.0, Mode: 167.0, Mean: 152.3768049689441


In [7079]:
df['gre_quantitative_reasoning'].isna().sum()

0

### Checking verbal values

In [7080]:
median_val = df[list_of_gre_columns[1]].median()
mode_val = df[list_of_gre_columns[1]].mode()[0]  # mode() returns a Series
mean_val = df[list_of_gre_columns[1]].mean()

print(f"Median: {median_val}, Mode: {mode_val}, Mean: {mean_val}")


Median: 160.0, Mode: 162.0, Mean: 159.2960158644312


In [7081]:
df.loc[(df[list_of_gre_columns[1]] > 340) | (df[list_of_gre_columns[1]] < 130) | ((df[list_of_gre_columns[1]] > 170) & (df[list_of_gre_columns[0]] < 340))]

Unnamed: 0,id,acceptance_rate,institution,program,degree_type,decision,undergrad_gpa,gre_quantitative_reasoning,gre_verbal_reasoning,analytical_writing,notes,gre_total
679,873277,33.0,Rice University,Philosophy,PhD,Accepted,3.90,169.0,6.0,99.99,email to check portal,
3357,872415,45.0,Georgetown University,Chemistry,PhD,Accepted,3.02,131.0,111.0,2,,
3628,876294,23.0,Yale University,Economics,PhD,Rejected,4.00,132.0,200.0,10,"After what I did to Rachel, the dean of admiss...",
5955,876603,24.0,Stanford University,Economics,PhD,Wait listed,2.20,134.0,2.0,23,"The groundhog saw its shadow, predicting 6 mor...",
8164,862452,38.0,Carnegie Mellon University,Electrical And Computer Engineering,Masters,Accepted,3.80,135.0,174.0,3.5,Received the status update email around 12:30 ...,
...,...,...,...,...,...,...,...,...,...,...,...,...
97589,743681,44.0,University Of Illinois,Bioengineering,PhD,Accepted,3.50,170.0,90.0,58,,
99329,741337,45.0,University of Southern California,Mechanical And Aerospace Engineering,Masters,Accepted,3.65,161.0,459.0,5,,
99344,741315,39.0,Virginia Commonwealth University,Genetic Counseling,Masters,Accepted,3.63,169.0,92.0,4,"Applied to 6, interviewed at 4",
100027,740314,40.0,West Virginia University,Accounting,PhD,Rejected,3.97,169.0,0.0,,,315.0


In [7082]:
rows_above_340_gre_verbal_reasoning = (
    (df[list_of_gre_columns[1]] > 340) |
    (df[list_of_gre_columns[1]] < 130) |
    ((df[list_of_gre_columns[1]] > 170) &
    (df[list_of_gre_columns[1]] < 340))  # This overlaps with < 130 and < 170
).sum()

print(f"Invalid rows for {list_of_gre_columns}: {rows_above_340_gre_verbal_reasoning}")

Invalid rows for ['gre_quantitative_reasoning', 'gre_verbal_reasoning', 'analytical_writing']: 68


In [7083]:
rows_above_340_gre_verbal_reasoning = df.loc[
    (df[list_of_gre_columns[1]] < 130) |
    (df[list_of_gre_columns[1]] > 170),
    list_of_gre_columns[1]
] = 0


In [7084]:
df.loc[(df[list_of_gre_columns[1]] > 340) | (df[list_of_gre_columns[1]] < 130) | ((df[list_of_gre_columns[1]] > 170) & (df[list_of_gre_columns[1]] < 340))]

Unnamed: 0,id,acceptance_rate,institution,program,degree_type,decision,undergrad_gpa,gre_quantitative_reasoning,gre_verbal_reasoning,analytical_writing,notes,gre_total
679,873277,33.0,Rice University,Philosophy,PhD,Accepted,3.90,169.0,0.0,99.99,email to check portal,
3357,872415,45.0,Georgetown University,Chemistry,PhD,Accepted,3.02,131.0,0.0,2,,
3628,876294,23.0,Yale University,Economics,PhD,Rejected,4.00,132.0,0.0,10,"After what I did to Rachel, the dean of admiss...",
5955,876603,24.0,Stanford University,Economics,PhD,Wait listed,2.20,134.0,0.0,23,"The groundhog saw its shadow, predicting 6 mor...",
8164,862452,38.0,Carnegie Mellon University,Electrical And Computer Engineering,Masters,Accepted,3.80,135.0,0.0,3.5,Received the status update email around 12:30 ...,
...,...,...,...,...,...,...,...,...,...,...,...,...
97589,743681,44.0,University Of Illinois,Bioengineering,PhD,Accepted,3.50,170.0,0.0,58,,
99329,741337,45.0,University of Southern California,Mechanical And Aerospace Engineering,Masters,Accepted,3.65,161.0,0.0,5,,
99344,741315,39.0,Virginia Commonwealth University,Genetic Counseling,Masters,Accepted,3.63,169.0,0.0,4,"Applied to 6, interviewed at 4",
100027,740314,40.0,West Virginia University,Accounting,PhD,Rejected,3.97,169.0,0.0,,,315.0


In [7085]:
fig = px.histogram(valid_gre_verbal_reasoning_profiles, x=list_of_gre_columns[1], nbins=100, range_x=[129,171],
                   title=f'Distribution of Valid {list_of_gre_columns[1]} Scores')
fig.show()

In [7086]:
# Count of missing
missing_count = df['gre_verbal_reasoning'].isna().sum()

# Step 1: Generate truncated normal values in 130–170
mean, std = 149, 8  # Approx based on visual and given stats
lower, upper = 130, 170
a, b = (lower - mean) / std, (upper - mean) / std
random_vals = truncnorm.rvs(a, b, loc=mean, scale=std, size=missing_count)

# Step 2: Adjust to match exact target mean
existing_vals = df['gre_verbal_reasoning'].dropna()
target_mean = 149
target_total = target_mean * (len(existing_vals) + missing_count)
required_sum = target_total - existing_vals.sum()

scaling_factor = required_sum / np.sum(random_vals)
adjusted_vals = np.clip(random_vals * scaling_factor, 130, 170)

# Step 3: Fill in the values
df.loc[df['gre_verbal_reasoning'].isna(), 'gre_verbal_reasoning'] = adjusted_vals.round()


In [7087]:
(df[list_of_gre_columns[1]]).isna().sum()

0

In [7088]:
median_val = df[list_of_gre_columns[1]].median()
mode_val = df[list_of_gre_columns[1]].mode()[0]  # mode() returns a Series
mean_val = df[list_of_gre_columns[1]].mean()

print(f"Median: {median_val}, Mode: {mode_val}, Mean: {mean_val}")


Median: 149.0, Mode: 150.0, Mean: 149.00258385093167


In [7089]:
df

Unnamed: 0,id,acceptance_rate,institution,program,degree_type,decision,undergrad_gpa,gre_quantitative_reasoning,gre_verbal_reasoning,analytical_writing,notes,gre_total
0,879550,31.0,Washington University In St. Louis (WashU/WUSTL),English,PhD,Rejected,3.931486,132.0,146.0,,,
1,879549,32.0,University of California,Bioinformatics and Systems Biology,PhD,Rejected,3.990000,136.0,142.0,,Rejected at midnight PDT after interview. my C...,
2,879548,45.0,Purdue University,Sociology,PhD,Rejected,3.811025,138.0,152.0,,,
3,879547,44.0,University of Illinois,Physics,PhD,Accepted,3.560000,133.0,138.0,,"Acceptance letter from the department, officia...",
4,879546,40.0,Columbia University,History,PhD,Rejected,3.897579,140.0,150.0,,email from POI,
...,...,...,...,...,...,...,...,...,...,...,...,...
100620,739462,44.0,Georgia Institute of Technology,Computer Science,PhD,Rejected,3.805016,170.0,146.0,,,
100621,739461,36.0,University of Rochester,Economics,PhD,Wait listed,3.600000,169.0,147.0,,Is the admission process completely over? Any ...,
100622,739460,43.0,University of Florida,Economics,PhD,Other,4.000000,169.0,156.0,,TPB: the Economics Department in the UF is jus...,
100623,739459,45.0,University of Southern California,Political Science,PhD,Rejected,3.674888,170.0,158.0,3.5,"0a/4r/0p/0w. was in the wait list, got rejecte...",


In [7090]:
df['analytical_writing'] = pd.to_numeric(df[list_of_gre_columns[2]], errors = 'coerce')

In [7091]:
gre_analytical_writing_min = df[list_of_gre_columns[2]] >= 0.0
gre_analytical_writing_max = df[list_of_gre_columns[2]] <= 6.0

In [7092]:
valid_analytical_profile = df[gre_analytical_writing_min & gre_analytical_writing_max]
len(valid_analytical_profile)

10185

In [7093]:
# df.to_excel("../data/processed/preprocessed_final.xlsx")

In [7094]:
invalid_cases_for_gre_analytical = (
    (df[list_of_gre_columns[2]] < 1.0) |
    (df[list_of_gre_columns[2]] > 6.0)  # This overlaps with < 130 and < 170
).sum()

print(f"Invalid rows for {list_of_gre_columns[2]}: {invalid_cases_for_gre_analytical}")

Invalid rows for analytical_writing: 192


In [7095]:
df[list_of_gre_columns[2]] = pd.to_numeric(df[list_of_gre_columns[2]], errors=  'coerce')

In [7096]:
invalid_rows = df[(df[list_of_gre_columns[2]] < 1.0) | (df[list_of_gre_columns[2]] > 6.0)]

print("Invalid rows count:", len(invalid_rows))
invalid_rows.head()

Invalid rows count: 192


Unnamed: 0,id,acceptance_rate,institution,program,degree_type,decision,undergrad_gpa,gre_quantitative_reasoning,gre_verbal_reasoning,analytical_writing,notes,gre_total
389,873422,25.0,California Institute of Technology,Chemistry,PhD,Accepted,3.92,133.0,170.0,99.99,,
679,873277,33.0,Rice University,Philosophy,PhD,Accepted,3.9,169.0,0.0,99.99,email to check portal,
927,875041,34.0,Johns Hopkins University,Electrical And Computer Engineering,PhD,Accepted,3.9,132.0,168.0,99.99,Good Luck!,
1246,874863,50.0,Virginia Tech,Mathematics,PhD,Accepted,3.9,131.0,165.0,99.99,,
2254,874237,51.0,Texas A&M University,Electrical Engineering,PhD,Accepted,3.78,163.0,151.0,13.0,"TOEFL- 110, R30,L30,W27,S23",


In [7097]:
# Step 1: Work on a copy to protect the original DataFrame
temp_df = valid_analytical_profile.copy()

# Step 2: Clean and clip invalid scores
temp_df['analytical_writing_clipped'] = pd.to_numeric(
    temp_df['analytical_writing'], errors='coerce'
).apply(lambda x: x if 1.0 <= x <= 6.0 else np.nan)

# Step 3: Round to nearest 0.1
temp_df['analytical_writing_clipped'] = temp_df['analytical_writing_clipped'].round(1)

# Step 4: Plot histogram
fig = px.histogram(
    temp_df.dropna(subset=['analytical_writing_clipped']),
    x='analytical_writing_clipped',
    nbins=int((6.0 - 1.0) / 0.1),
    title='Distribution of GRE Analytical Writing Scores',
    labels={'analytical_writing_clipped': 'Analytical Writing Score'}
)

# Step 5: Format x-axis
fig.update_xaxes(
    tickmode='linear',
    tick0=1.0,
    dtick=0.3,
    range=[1.0, 6.0]
)

fig.update_layout(
    bargap=0.05,
    xaxis_title='Analytical Writing Score',
    yaxis_title='Count',
    title_x=0.5
)

fig.show()


In [7098]:
df.loc[(df[list_of_gre_columns[2]] < 1.0) | (df[list_of_gre_columns[2]] > 6.0), list_of_gre_columns[2]] = 0

In [7099]:
# Find indices where value is 0 or NaN
zero_or_nan_indices = df[df[list_of_gre_columns[2]].isna() | (df[list_of_gre_columns[2]] == 0)].index
zero_nan_count = len(zero_or_nan_indices)

# Redefine counts
high_count = int(high_pct * zero_nan_count)
mid_count = int(mid_pct * zero_nan_count)
low_count = zero_nan_count - high_count - mid_count

# Re-generate values and shuffle
high_vals = [round(random.uniform(4.0, 6.0), 1) for _ in range(high_count)]
mid_vals = [round(random.uniform(2.5, 4.0), 1) for _ in range(mid_count)]
low_vals = [round(random.uniform(1.0, 2.5), 1) for _ in range(low_count)]
generated_vals = high_vals + mid_vals + low_vals
random.shuffle(generated_vals)

# Assign the values
df.loc[zero_or_nan_indices, list_of_gre_columns[2]] = generated_vals


In [7100]:
final_values = (df[list_of_gre_columns[2]] < 1.0) | (df[list_of_gre_columns[2]] > 6.0) 
final_values.sum()

0

In [7101]:
df['gre_total'] = df['gre_quantitative_reasoning'] + df['gre_verbal_reasoning']

In [7102]:
df['gre_total'].isna().sum()

0

In [7103]:
(df['gre_total'] > 340).sum()

0

In [7104]:
# Remove date-like values from 'decision' column
# Step 1: Convert everything to string
df['decision'] = df['decision'].astype(str)

# Step 2: Filter out rows where 'decision' looks like a date
df = df[~df['decision'].str.contains(r'\d{4}-\d{2}-\d{2}|[0-9]{2}/[0-9]{2}/[0-9]{4}', regex=True)]

# Optional: Print unique values to verify cleanup
print(df['decision'].unique())

['Rejected' 'Accepted' 'Wait listed' 'Interview' 'Other']


In [7105]:
df.isna().sum()

id                                0
acceptance_rate                   0
institution                       0
program                           0
degree_type                       0
decision                          0
undergrad_gpa                     0
gre_quantitative_reasoning        0
gre_verbal_reasoning              0
analytical_writing                0
notes                         42073
gre_total                         0
dtype: int64

In [7106]:
df.to_excel("../data/processed/preprocessed_final.xlsx")