In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler


In [234]:
# Load the preprocessed dataset from the processed-data folder.
preprocessed_path = "../data/processed/preprocessed_final.xlsx"
df = pd.read_excel(preprocessed_path)

In [235]:
# Categorical columns that are label-encoded (replaced by integer codes) in the
# next cell. 'decision' is listed here too, so it is encoded the same way.
categorical_columns = [
    "institution",
    "program",
    "degree_type",
    "decision",
]

In [236]:
# Fit one LabelEncoder per categorical column and keep each fitted encoder,
# keyed by column name, so the integer codes can be mapped back to labels later.
label_encoders = {
    col: LabelEncoder().fit(df[col]) for col in categorical_columns
}

# Overwrite each categorical column in place with its integer codes.
for col, encoder in label_encoders.items():
    df[col] = encoder.transform(df[col])


In [237]:

# Show which integer code the fitted encoder assigned to each 'decision' label.
# The encoder is looked up by column name rather than by its position in
# categorical_columns, so reordering that list cannot silently print another
# column's mapping under the 'decision' heading.
le = label_encoders['decision']
print("Label mapping for 'decision':")
for idx, label in enumerate(le.classes_):
    print(f"  {label} => {idx}")


Label mapping for 'decision':
  Accepted => 0
  Interview => 1
  Other => 2
  Rejected => 3
  Wait listed => 4


In [238]:
# Preview the first five rows (head() defaults to n=5) after label encoding.
df.head()

Unnamed: 0.1,Unnamed: 0,id,acceptance_rate,institution,program,degree_type,decision,undergrad_gpa,gre_quantitative_reasoning,gre_verbal_reasoning,analytical_writing,notes,gre_total
0,0,879550,31.0,1070,2848,6,3,3.931486,132,146,2.0,,278
1,1,879549,32.0,861,687,6,3,3.99,136,142,5.6,Rejected at midnight PDT after interview. my C...,278
2,2,879548,45.0,520,7281,6,3,3.811025,138,152,4.4,,290
3,3,879547,44.0,906,6375,6,0,3.56,133,138,4.7,"Acceptance letter from the department, officia...",271
4,4,879546,40.0,155,3669,6,3,3.897579,140,150,6.0,email from POI,290


### Create new features


In [239]:
# Derived features, computed here from the not-yet-scaled columns.

# GPA as a fraction of the maximum, assuming a 4.0 scale.
df['gpa_percentile'] = df['undergrad_gpa'].div(4.0)

# Mean of the two GRE section scores (quantitative and verbal).
gre_section_sum = df['gre_quantitative_reasoning'].add(df['gre_verbal_reasoning'])
df['gre_avg'] = gre_section_sum.div(2)

# Interaction term between GPA and the acceptance rate.
df['gpa_x_acceptancerate'] = df['undergrad_gpa'].mul(df['acceptance_rate'])

### Scale numeric features

In [240]:
# Columns passed to the scaler in the next cell: the original numeric columns
# followed by the three derived features.
numeric_features = [
    "acceptance_rate",
    "undergrad_gpa",
    "gre_quantitative_reasoning",
    "gre_verbal_reasoning",
    "analytical_writing",
    "gre_total",
    "gpa_percentile",
    "gre_avg",
    "gpa_x_acceptancerate",
]

In [241]:

# Standardize the numeric columns in place; the fitted scaler is kept in `scaler`.
# NOTE(review): the scaler is fitted on every row of df. If a train/test split
# happens later, it should be fitted on the training rows only to avoid leaking
# test-set statistics — confirm against the modelling notebook.
scaler = StandardScaler().fit(df[numeric_features])
df[numeric_features] = scaler.transform(df[numeric_features])

In [242]:
# NOTE(review): every line in this cell is commented out, so it has no effect.
# It sketches filling missing 'acceptance_rate' values from other rows of the
# same institution. If it is ever re-enabled, it should run before the encoding
# and scaling cells: at this point 'institution' is already label-encoded and
# 'acceptance_rate' is already standardized.

# # Step 1: Map unique institutions to their known (non-null) acceptance rates
# institution_acceptance_map = (
#     df[df['acceptance_rate'].notnull()]
#     .groupby('institution')['acceptance_rate']
#     .first()
#     .to_dict()
# )

# # Step 2: Find rows where acceptance_rate is null
# null_acceptance_rows = df[df['acceptance_rate'].isnull()]

# # Step 3 & 4: Fill missing acceptance rates based on institution name
# df.loc[df['acceptance_rate'].isnull(), 'acceptance_rate'] = df.loc[
#     df['acceptance_rate'].isnull(), 'institution'
# ].map(institution_acceptance_map)

# # Optional: Save the updated DataFrame to a new Excel file
# df.to_excel("../data/processed/loaded_data.xlsx")

# df = pd.read_excel("../data/processed/loaded_data.xlsx")

# df = df.dropna(subset=['institution', 'program', 'degree_type', 'decision'])

# df.acceptance_rate.isna()
# 7734

In [243]:
# Drop the free-text 'notes' column and every leftover index column.
# Saving a frame to Excel with its index and reading it back produces an
# 'Unnamed: 0'-style column; the preview above shows two of them
# ('Unnamed: 0' and 'Unnamed: 0.1'), so all columns with that prefix are
# removed rather than only the first one.
leftover_index_cols = [c for c in df.columns if str(c).startswith('Unnamed:')]
df = df.drop(columns=['notes', *leftover_index_cols])

In [244]:
# Collapse the encoded 'decision' classes into three groups:
#   0 = Accepted, 1 = Rejected, 2 = everything else (Interview, Other, Wait listed).
# The code -> group table is derived from the fitted encoder's class names
# instead of hard-coded integer codes: LabelEncoder assigns codes by sorted
# label order, so the codes shift whenever the set of labels changes.
DECISION_GROUP_BY_LABEL = {'Accepted': 0, 'Rejected': 1}
OTHER_DECISION_GROUP = 2

decision_classes = list(label_encoders['decision'].classes_)

# Fail loudly if the encoder does not hold the original string labels (for
# example because the encoding cell was re-run on already-encoded data).
missing_labels = set(DECISION_GROUP_BY_LABEL) - set(decision_classes)
if missing_labels:
    raise ValueError(
        f"'decision' encoder is missing expected labels: {sorted(missing_labels)}"
    )

grouped_map = {
    code: DECISION_GROUP_BY_LABEL.get(label, OTHER_DECISION_GROUP)
    for code, label in enumerate(decision_classes)
}
df['decision_grouped'] = df['decision'].map(grouped_map)

In [245]:
# Persist the encoded, scaled and feature-augmented frame for later analysis.
# NOTE(review): the index is written as a column (pandas default), so reading
# this file back yields an extra 'Unnamed: 0' column. Consider index=False
# after checking what downstream readers expect.
exploratory_output_path = "../data/processed/exploratory_analysis.xlsx"
df.to_excel(exploratory_output_path)

In [246]:
# Model input columns: the nine standardized numeric features followed by
# three of the label-encoded categorical columns.
feature_cols = [
    # numeric
    "acceptance_rate", "undergrad_gpa",
    "gre_quantitative_reasoning", "gre_verbal_reasoning",
    "analytical_writing", "gre_total",
    # derived
    "gpa_percentile", "gre_avg", "gpa_x_acceptancerate",
    # categorical (integer codes)
    "program", "degree_type", "institution",
]


# Flow

- Make sure the data is evenly distributed
- Encode the target variable
- Use label encoding for string-valued (categorical) columns
- Run feature engineering
    - to reduce the number of features
- Check the correlation between columns
- Run model training
- Run evaluation on the test dataset
- Create an API endpoint for the model