In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [3]:
# === Load Dataset ===
# Read the preprocessed table; the path is relative to the notebook's
# working directory.
preprocessed_path = "../data/processed/preprocessed_final.xlsx"
df = pd.read_excel(preprocessed_path)

In [6]:
# === Load and Clean QS World Ranking Data ===
qs_df = pd.read_csv("../data/raw/qs_world_ranking_university.csv")

# Skip the first data row (presumably a secondary header line in the raw
# export -- TODO confirm) and adopt the column names the merge step expects.
qs_df_clean = qs_df.iloc[1:].copy()
qs_df_clean = qs_df_clean.rename(
    columns={"Institution Name": "institution", "2024 RANK": "qs_rank"}
)

# Plain numeric ranks parse directly. Anything else would be coerced to NaN
# and later treated as unranked, so for those values fall back to the first
# run of digits in the text (e.g. "=51" -> 51, "601-610" -> 601,
# "1401+" -> 1401). Values with no digits at all stay NaN.
# NOTE(review): assumes banded/tied ranks, if present, should be represented
# by their lower bound -- confirm against the raw file.
raw_rank = qs_df_clean["qs_rank"]
parsed_rank = pd.to_numeric(raw_rank, errors="coerce")
leading_digits = raw_rank.astype(str).str.extract(r"(\d+)", expand=False)
qs_df_clean["qs_rank"] = parsed_rank.fillna(
    pd.to_numeric(leading_digits, errors="coerce")
)

In [7]:
# Normalize the join key on both tables: cast to string and trim surrounding
# whitespace so the names compare equal in the merge.
for frame in (df, qs_df_clean):
    frame["institution"] = frame["institution"].astype(str).str.strip()

In [8]:
# Merge QS data
# Keep a single QS row per institution so the left merge below can never
# multiply applicant rows when the ranking file lists a name more than once.
# NOTE(review): keeps the first occurrence -- confirm that is the desired row.
qs_lookup = qs_df_clean[["institution", "qs_rank"]].drop_duplicates(
    subset="institution", keep="first"
)

# Remove QS columns left over from a previous run of this cell; otherwise the
# merge would produce qs_rank_x / qs_rank_y and the score line would raise
# KeyError.
df = df.drop(columns=["qs_rank", "qs_rank_score"], errors="ignore")

# Left merge: every applicant row is kept; institutions without a QS match
# get NaN for qs_rank. validate= makes the many-to-one expectation explicit.
df = df.merge(qs_lookup, on="institution", how="left", validate="many_to_one")

# Negated rank, so that a better (smaller) rank yields a larger score.
df["qs_rank_score"] = -df["qs_rank"]

In [9]:
# Assign QS Tier Buckets
def assign_qs_tier(rank):
    """Map a QS rank to a tier code (0 = best, 4 = lowest/unranked).

    Tiers by inclusive upper bound:
        0: rank <= 50
        1: rank <= 100
        2: rank <= 300
        3: rank <= 800
        4: rank > 800, or rank is missing (NaN/None)
    """
    fallback_tier = 4
    if pd.isna(rank):
        return fallback_tier
    # The first upper bound that contains the rank decides the tier.
    for tier, upper_bound in enumerate((50, 100, 300, 800)):
        if rank <= upper_bound:
            return tier
    return fallback_tier


In [10]:
# Bucket every row's QS rank into a tier code; missing ranks are passed to
# assign_qs_tier as NaN and handled there.
df["qs_tier"] = df["qs_rank"].map(assign_qs_tier)

In [11]:
# === Feature Engineering ===
gpa = df['undergrad_gpa']
acceptance = df['acceptance_rate']

# GPA divided by 4.0 (despite the column name this is a rescaling, not a rank
# percentile).
df['gpa_percentile'] = gpa.div(4.0)

# Mean of the two GRE section scores; NaN in either section propagates.
df['gre_avg'] = df['gre_quantitative_reasoning'].add(df['gre_verbal_reasoning']).div(2)

# GPA x acceptance-rate interaction term.
df['gpa_x_acceptancerate'] = gpa.mul(acceptance)
# NOTE(review): computed with the same formula as gpa_x_acceptancerate, so the
# two columns are identical -- confirm whether a different definition was
# intended.
df["application_strength"] = gpa.mul(acceptance)

# Institution/degree combination as one categorical key, then integer-coded.
df["tier_combo"] = df["institution"].astype(str).str.cat(df["degree_type"].astype(str), sep="_")
combo_encoder = LabelEncoder()
df["tier_score"] = combo_encoder.fit_transform(df["tier_combo"])

In [12]:
# === Label Encoding for Categorical Columns ===
# Fit one encoder per categorical column and keep it in `label_encoders`, so
# the integer codes can be traced back to the original labels later.
categorical_columns = ['institution', 'program', 'degree_type', 'decision']
label_encoders = {name: LabelEncoder().fit(df[name]) for name in categorical_columns}
for name, encoder in label_encoders.items():
    # Overwrite the original column with its integer codes.
    df[name] = encoder.transform(df[name])

In [13]:
# Print decision label mapping
# A label's position in the fitted encoder's classes_ array is its code.
le = label_encoders['decision']
mapping_lines = [f"  {label} => {idx}" for idx, label in enumerate(le.classes_)]
print("Label mapping for 'decision':")
for line in mapping_lines:
    print(line)

Label mapping for 'decision':
  Accepted => 0
  Interview => 1
  Other => 2
  Rejected => 3
  Wait listed => 4


In [14]:
# === Scaling Numerical Features ===
# Columns that are standardized by the scaler cell.
# Columns this notebook does not create itself:
source_numeric = [
    'acceptance_rate',
    'undergrad_gpa',
    'gre_quantitative_reasoning',
    'gre_verbal_reasoning',
    'analytical_writing',
    'gre_total',
]
# Columns created in the feature-engineering cell:
engineered_numeric = [
    'gpa_percentile',
    'gre_avg',
    'gpa_x_acceptancerate',
    'application_strength',
]
numeric_features = source_numeric + engineered_numeric

In [15]:
# Fit the scaler on the numeric columns, then overwrite those columns with
# their standardized values. The fitted `scaler` stays available for reuse.
scaler = StandardScaler().fit(df[numeric_features])
df[numeric_features] = scaler.transform(df[numeric_features])

In [16]:
# === Drop Unnecessary Columns ===
# errors="ignore" keeps this cell re-runnable and tolerant of inputs that do
# not carry these columns; without it a second run (or a file without
# 'Unnamed: 0') raises KeyError.
df = df.drop(columns=['notes', 'Unnamed: 0'], errors="ignore")

In [17]:
# === Group Target Labels ===
# Collapse the encoded decision classes into three groups:
#   0 = Accepted, 1 = Rejected, 2 = everything else
#   (Interview, Other, Wait listed in the current data).
# The integer codes are looked up from the fitted decision encoder instead of
# being hard-coded, because LabelEncoder assigns codes by sorted label order:
# a new or missing decision label would silently shift hard-coded codes.
decision_group_by_label = {"Accepted": 0, "Rejected": 1}
other_group = 2
grouped_map = {
    code: decision_group_by_label.get(label, other_group)
    for code, label in enumerate(label_encoders['decision'].classes_)
}

In [18]:
# Translate each encoded decision into its group code via grouped_map; a code
# that is not a key of the map becomes NaN.
decision_codes = df['decision']
df['decision_grouped'] = decision_codes.map(grouped_map)

In [19]:
# === Save Final Enhanced Dataset ===
# Write the engineered table without the index, so reading it back does not
# introduce an extra index column.
enhanced_path = "../data/processed/exploratory_analysis_final.xlsx"
df.to_excel(enhanced_path, index=False)

In [20]:
# === Final Feature List ===
# Model input columns, grouped by how this notebook produced them. The order
# of the final list is part of its contract.
scaled_numeric_cols = [
    'acceptance_rate', 'undergrad_gpa',
    'gre_quantitative_reasoning', 'gre_verbal_reasoning',
    'analytical_writing', 'gre_total',
    'gpa_percentile', 'gre_avg',
    'gpa_x_acceptancerate', 'application_strength',
]
# Added by the QS merge and tier cells.
qs_cols = ['qs_rank', 'qs_rank_score', 'qs_tier']
# Integer-coded categoricals (LabelEncoder output).
encoded_cols = ['tier_score', 'program', 'degree_type', 'institution']

feature_cols = scaled_numeric_cols + qs_cols + encoded_cols
