In [10]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder


In [11]:
# Read the processed exploratory-analysis workbook into the main working frame.
df = pd.read_excel(io="../data/processed/exploratory_analysis.xlsx")

In [12]:
# Load the raw QS ranking table and reduce it to a clean institution -> rank lookup.
qs_df = pd.read_csv("../data/raw/qs_world_ranking_university.csv")

# Built as one chain (no inplace mutation) so re-running this cell always
# starts again from the raw frame and yields the same result.
qs_df_clean = (
    qs_df.iloc[1:]  # the first data row is an internal header row, not a university
    .rename(columns={"Institution Name": "institution", "2024 RANK": "qs_rank"})
    .assign(
        # Keep the leading integer of each rank instead of coercing the whole
        # string, so non-plain values are not silently turned into NaN.
        # NOTE(review): assumes the file uses the usual QS notation for ties
        # ("=17") and bands ("601-610", "1401+") -- confirm against the CSV.
        # A band is represented by its lower bound; values that contain no
        # digits at all still become NaN.
        qs_rank=lambda frame: pd.to_numeric(
            frame["qs_rank"].astype(str).str.extract(r"(\d+)", expand=False),
            errors="coerce",
        )
    )
)

In [13]:
# Normalise the join key on both frames: cast to str and trim surrounding
# whitespace so the merge compares like with like.
for frame in (df, qs_df_clean):
    frame["institution"] = frame["institution"].astype(str).str.strip()

In [14]:
# Merge QS rank into main dataset.
# The lookup is reduced to one row per institution first: a left merge repeats
# a row of df for every matching right-hand row, so duplicate names in the QS
# table would silently inflate the dataset. The first occurrence is kept.
qs_lookup = qs_df_clean[["institution", "qs_rank"]].drop_duplicates(
    subset="institution", keep="first"
)
df = (
    # Drop a qs_rank left over from an earlier run of this cell; otherwise the
    # merge would emit qs_rank_x / qs_rank_y and later cells could not find qs_rank.
    df.drop(columns="qs_rank", errors="ignore")
    # validate makes pandas raise if the lookup ever stops being unique per key.
    .merge(qs_lookup, on="institution", how="left", validate="many_to_one")
)

In [15]:
# QS score feature: the rank with its sign flipped, so a better (numerically
# smaller) rank gives a larger score.
df["qs_rank_score"] = df["qs_rank"].mul(-1)

In [16]:
# Add QS tier buckets
def assign_qs_tier(rank):
    """Map a QS rank to a coarse tier code.

    Args:
        rank: Numeric QS rank, or a missing value (anything ``pd.isna``
            reports as missing).

    Returns:
        int: 0 for rank <= 50, 1 for rank <= 100, 2 for rank <= 300,
        3 for rank <= 800, and 4 for any rank above 800 or a missing rank.
    """
    if pd.isna(rank):
        # Missing ranks share the bottom tier with ranks beyond 800.
        return 4
    # Upper bounds are inclusive; the position in the tuple is the tier code.
    for tier, upper_bound in enumerate((50, 100, 300, 800)):
        if rank <= upper_bound:
            return tier
    return 4

In [17]:
# Tier code for every row, derived from that row's QS rank.
qs_tiers = df["qs_rank"].apply(assign_qs_tier)
df["qs_tier"] = qs_tiers

In [18]:
# application_strength: element-wise product of undergraduate GPA and acceptance rate.
df["application_strength"] = df["undergrad_gpa"].mul(df["acceptance_rate"])

In [19]:
# tier_combo joins institution and degree type into one "<institution>_<degree_type>"
# label; tier_score is the integer code LabelEncoder assigns to each distinct label.
institution_label = df["institution"].astype(str)
degree_label = df["degree_type"].astype(str)
df["tier_combo"] = institution_label + "_" + degree_label

# NOTE(review): the codes identify combinations; they are not derived from
# qs_rank or qs_tier -- confirm downstream use treats them as categorical.
combo_encoder = LabelEncoder()
df["tier_score"] = combo_encoder.fit_transform(df["tier_combo"])

In [20]:
# Write the enriched dataset to disk, leaving the DataFrame index out of the sheet.
df.to_excel(excel_writer="../data/processed/qs_and_exploratory_combined.xlsx", index=False)