In [None]:
import pandas as pd
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler

# Load dataset
file_path = "SLU_Opportunity_Data.csv"  # Replace with your actual file
df = pd.read_csv(file_path)

# === Standardize column names ===
# Shorten the verbose source headers; the steps below use the short names.
df = df.rename(columns={
    "Date of Birth": "DOB",
    "Learner SignUp DateTime": "SignUp_DateTime",
    "Current/Intended Major": "Major",
})

# === Handle missing values ===
# Assign the result back instead of calling fillna(inplace=True) on the
# column selection: the chained in-place form operates on a temporary object,
# is deprecated in pandas 2.x, and does nothing under Copy-on-Write.
df["Institution Name"] = df["Institution Name"].fillna("Unknown")

# === Remove duplicates ===
# Drops rows that are identical across every column; the index is not reset.
df = df.drop_duplicates()

# === Convert date columns ===
# Unparseable values become NaT (errors="coerce") rather than raising.
date_cols = ["DOB", "SignUp_DateTime", "Apply Date", "Opportunity Start Date", "Opportunity End Date"]
for col in date_cols:
    df[col] = pd.to_datetime(df[col], errors="coerce")

# === Calculate Age ===
# Use calendar age (year difference, minus one if this year's birthday has not
# happened yet). Dividing elapsed days by 365 ignores leap days and can report
# someone as a year older than they are near the filter boundaries.
today = pd.Timestamp.now()
dob = df["DOB"]
birthday_passed = (dob.dt.month < today.month) | (
    (dob.dt.month == today.month) & (dob.dt.day <= today.day)
)
# Rows with a missing DOB yield NaN here and are removed by the filter below.
df["Age"] = today.year - dob.dt.year - (~birthday_passed).astype(int)

# Filter out extreme outliers (bounds are inclusive). Take an explicit copy so
# the column assignments that follow write to an independent frame instead of
# a slice of the unfiltered data (avoids SettingWithCopyWarning).
df = df[df["Age"].between(10, 100)].copy()

# === Standardize categorical values ===
# Gender: trim whitespace, title-case, then collapse every "missing" variant
# into one label. The result is assigned back rather than using
# replace(inplace=True) on the column selection, which acts on a temporary
# object and is a no-op under Copy-on-Write. fillna() covers real NaN values,
# which the string replacements alone never match.
gender = df["Gender"].str.strip().str.title()
df["Gender"] = gender.replace({"": "Not Provided", "Na": "Not Provided"}).fillna("Not Provided")

# Major: keep values that are purely alphabetic or contain a space (multi-word
# names); everything else becomes "Other". Missing values are tracked before
# the string cast, because astype(str) turns NaN into the literal "nan", which
# would otherwise pass the alphabetic check and survive as a fake major.
major_missing = df["Major"].isna()
major = df["Major"].astype(str).str.strip()
major_looks_valid = major.str.isalpha() | major.str.contains(" ", regex=False)
df["Major"] = major.where(major_looks_valid & ~major_missing, "Other")

# === Feature Engineering ===

# 1. Engagement Lag: whole days from application to opportunity start
opportunity_start = df["Opportunity Start Date"]
df["Engagement_Lag"] = opportunity_start.sub(df["Apply Date"]).dt.days

# 2. Opportunity Duration: whole days from start to end
df["Opportunity_Duration"] = df["Opportunity End Date"].sub(opportunity_start).dt.days

# 3. Calendar features derived from the sign-up timestamp
signup_ts = df["SignUp_DateTime"]
df["Signup_Month"] = signup_ts.dt.month
df["Signup_Weekday"] = signup_ts.dt.day_name()

# 4. Min-max scale Age and Opportunity Duration into new Norm_* columns
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df[["Age", "Opportunity_Duration"]])
df["Norm_Age"] = scaled_values[:, 0]
df["Norm_Opportunity_Duration"] = scaled_values[:, 1]

# 5. One-hot encode the categorical columns, dropping the first level of each
categorical_cols = ["Gender", "Opportunity Category", "Country"]
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# 6. Composite Engagement Score
# The duration and age terms are already min-max scaled to [0, 1], but
# Engagement_Lag is in raw days. Adding it unscaled lets the lag swamp the
# other two terms and makes the 0.4/0.3/0.3 weights meaningless, so bring it
# onto the same [0, 1] scale first. Missing lags are treated as 0 days, as
# before, and that substitution happens before scaling.
lag_days = df_encoded["Engagement_Lag"].fillna(0)
lag_min = lag_days.min()
lag_span = lag_days.max() - lag_min
if lag_span > 0:
    norm_lag = (lag_days - lag_min) / lag_span
else:
    # Empty frame or all lags identical: no spread to scale, contribute zero.
    norm_lag = lag_days * 0.0

# NOTE: rows whose normalized duration is NaN still produce a NaN score.
df_encoded["Engagement_Score"] = (
    0.4 * df_encoded["Norm_Opportunity_Duration"] +
    0.3 * df_encoded["Norm_Age"] +
    0.3 * norm_lag
)

# Save cleaned + engineered dataset
df_encoded.to_csv("Week1_Cleaned_Featured_Engagement.csv", index=False)
print("✅ Week 1 dataset saved as 'Week1_Cleaned_Featured_Engagement.csv'")

# Optional preview
print(df_encoded.head())
