In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# College completion pipeline

def load_college_data(path="cc_institution_details.csv"):
    """Read the college institution-details CSV into a DataFrame.

    Args:
        path: Location of the CSV file. Defaults to
            ``"cc_institution_details.csv"``, resolved relative to the
            current working directory, so existing no-argument calls
            behave as before.

    Returns:
        The DataFrame produced by ``pandas.read_csv`` with default options.
    """
    return pd.read_csv(path)

def create_college_target(df):
    """Add a binary ``target`` column marking top-quartile graduation rates.

    Rows with a missing ``grad_100_value`` are dropped first. The threshold
    is the 75th percentile of the remaining ``grad_100_value`` values, and
    ``target`` is 1 where the value is greater than or equal to that
    threshold, 0 otherwise. The share of positive rows is printed.

    Args:
        df: DataFrame containing a ``grad_100_value`` column.

    Returns:
        A new DataFrame (the input is not modified) without the
        missing-value rows and with the integer ``target`` column added.
    """
    # Take an explicit copy: assigning a column onto the bare result of
    # dropna() can raise SettingWithCopyWarning on pandas versions without
    # copy-on-write, because the result may be flagged as a slice of `df`.
    labelled = df.dropna(subset=["grad_100_value"]).copy()
    threshold = labelled["grad_100_value"].quantile(.75)
    labelled["target"] = (labelled["grad_100_value"] >= threshold).astype(int)
    print("College Target Prevalence:", labelled["target"].mean())
    return labelled

def prep_college_features(df):
    """Encode and scale the college frame for modelling.

    Steps, in order:
      1. Drop ``grad_100_value`` and ``instnm`` when present (absent
         columns are ignored).
      2. Cast object/string columns to ``category``.
      3. One-hot encode with ``pd.get_dummies(..., drop_first=True)``.
      4. Min-max scale every numeric column except ``target``.

    Args:
        df: DataFrame, normally already carrying the ``target`` column.

    Returns:
        A new DataFrame with encoded and scaled columns; ``target`` (if
        present) is passed through untouched.
    """
    # Drop grad_100_value and instnm
    features = df.drop(columns=["grad_100_value", "instnm"], errors="ignore")

    # change categoricals
    cat_cols = features.select_dtypes(include=["object", "string"]).columns
    features[cat_cols] = features[cat_cols].astype("category")

    # one hot encoding
    features = pd.get_dummies(features, drop_first=True)

    # normalize numeric columns, leaving the label alone: scaling `target`
    # would cast it to float and, if every row has the same label, map a
    # constant column of ones to zeros.
    num_cols = [
        col for col in features.select_dtypes("number").columns
        if col != "target"
    ]
    # Skip scaling when nothing qualifies; fitting on zero columns fails.
    if num_cols:
        # NOTE(review): the scaler is fitted on every row passed in, so if
        # the caller splits train/test afterwards the test rows influence
        # the scaling parameters.
        scaler = MinMaxScaler()
        features[num_cols] = scaler.fit_transform(features[num_cols])

    return features

def split_college_data(df):
    """Split the college frame into train/test features and labels.

    The ``target`` column is the label; every other column is a feature.
    20% of the rows are held out for testing, with a fixed random seed
    (42) so the split is reproducible.

    Returns:
        The value of ``train_test_split``: X_train, X_test, y_train, y_test.
    """
    labels = df["target"]
    features = df.drop(columns="target")
    return train_test_split(features, labels, test_size=0.2, random_state=42)

# Run college pipeline: load -> label -> encode/scale -> split
college = (
    load_college_data()
    .pipe(create_college_target)
    .pipe(prep_college_features)
)
Xc_train, Xc_test, yc_train, yc_test = split_college_data(college)

# Job placement pipeline

def load_job_data(path="Placement_Data_Full_Class.csv"):
    """Read the job-placement CSV into a DataFrame.

    Args:
        path: Location of the CSV file. Defaults to
            ``"Placement_Data_Full_Class.csv"``, resolved relative to the
            current working directory, so existing no-argument calls
            behave as before.

    Returns:
        The DataFrame produced by ``pandas.read_csv`` with default options.
    """
    return pd.read_csv(path)

def create_job_target(df):
    """Add a binary ``target`` column marking placed candidates.

    ``target`` is 1 where ``status`` equals the exact string ``"Placed"``
    and 0 otherwise. The share of positive rows is printed.

    Args:
        df: DataFrame containing a ``status`` column.

    Returns:
        A new DataFrame with the integer ``target`` column added. The
        input frame is left unmodified, matching ``create_college_target``,
        which also returns a new frame.
    """
    # assign() builds a new frame instead of writing into the caller's.
    labelled = df.assign(target=(df["status"] == "Placed").astype(int))
    print("Job Placement Target Prevalence:", labelled["target"].mean())
    return labelled

def prep_job_features(df):
    """Encode and scale the job-placement frame for modelling.

    Steps, in order:
      1. Drop ``status`` and ``sl_no`` when present (absent columns are
         ignored).
      2. Cast object/string columns to ``category``.
      3. One-hot encode with ``pd.get_dummies(..., drop_first=True)``.
      4. Min-max scale every numeric column except ``target``.

    Args:
        df: DataFrame, normally already carrying the ``target`` column.

    Returns:
        A new DataFrame with encoded and scaled columns; ``target`` (if
        present) is passed through untouched.
    """
    # Drop status and sl_no
    features = df.drop(columns=["status", "sl_no"], errors="ignore")

    # change categoricals
    cat_cols = features.select_dtypes(include=["object", "string"]).columns
    features[cat_cols] = features[cat_cols].astype("category")

    # One hot encoding
    features = pd.get_dummies(features, drop_first=True)

    # Normalize numeric columns, leaving the label alone: scaling `target`
    # would cast it to float and, if every row has the same label, map a
    # constant column of ones to zeros.
    num_cols = [
        col for col in features.select_dtypes("number").columns
        if col != "target"
    ]
    # Skip scaling when nothing qualifies; fitting on zero columns fails.
    if num_cols:
        # NOTE(review): the scaler is fitted on every row passed in, so if
        # the caller splits train/test afterwards the test rows influence
        # the scaling parameters.
        scaler = MinMaxScaler()
        features[num_cols] = scaler.fit_transform(features[num_cols])

    return features

def split_job_data(df):
    """Split the job-placement frame into train/test features and labels.

    The ``target`` column is the label; every other column is a feature.
    20% of the rows are held out for testing, with a fixed random seed
    (42) so the split is reproducible.

    Returns:
        The value of ``train_test_split``: X_train, X_test, y_train, y_test.
    """
    labels = df["target"]
    features = df.drop(columns="target")
    return train_test_split(features, labels, test_size=0.2, random_state=42)

# Run job placement pipeline: load -> label -> encode/scale -> split
job = (
    load_job_data()
    .pipe(create_job_target)
    .pipe(prep_job_features)
)
Xj_train, Xj_test, yj_train, yj_test = split_job_data(job)


print("pipelines finished")


College Target Prevalence: 0.2500721084511105
Job Placement Target Prevalence: 0.6883720930232559
pipelines finished
