In [2]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
from sklearn.ensemble import RandomForestRegressor

# Load dataset
df = pd.read_csv('pd2.csv')

# --- NEW: RE-BALANCING LOGIC ---
# Find the career with the fewest rows to set the balance point
min_count = df['career'].value_counts().min()
balanced_df = df.groupby('career').apply(lambda x: x.sample(n=min_count, random_state=42)).reset_index(drop=True)
print(f"Dataset re-balanced! Each career now has {min_count} rows.")

# Preprocessing Multi-Label Columns
def clean_list(x): return [i.strip() for i in str(x).split(',')]
balanced_df['interests_list'] = balanced_df['interests'].apply(clean_list)
balanced_df['strengths_list'] = balanced_df['strengths'].apply(clean_list)

mlb_int = MultiLabelBinarizer()
int_enc = mlb_int.fit_transform(balanced_df['interests_list'])
mlb_str = MultiLabelBinarizer()
str_enc = mlb_str.fit_transform(balanced_df['strengths_list'])

# Categorical Features (One-Hot)
cat_features = ['education', 'field', 'learning_rate', 'time_horizon', 'risk_tolerance', 'career']
df_cat = pd.get_dummies(balanced_df[cat_features])

# Prepare X and y
X = pd.concat([df_cat, 
               pd.DataFrame(int_enc, columns=[f"interest_{c}" for c in mlb_int.classes_]),
               pd.DataFrame(str_enc, columns=[f"strength_{c}" for c in mlb_str.classes_])], axis=1)
y = balanced_df['fit_score']

# Train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestRegressor(n_estimators=200, max_depth=25, random_state=42)
model.fit(X_train, y_train)

# Save Assets
joblib.dump(model, 'career_fit_regressor.joblib', compress=9)
joblib.dump(mlb_int, 'interests_binarizer.joblib')
joblib.dump(mlb_str, 'strengths_binarizer.joblib')
joblib.dump(X.columns.tolist(), 'model_features.joblib')
joblib.dump(balanced_df['career'].unique().tolist(), 'career_list.joblib')
print("Re-balanced model saved successfully!")

  balanced_df = df.groupby('career').apply(lambda x: x.sample(n=min_count, random_state=42)).reset_index(drop=True)


Dataset re-balanced! Each career now has 1000 rows.
Re-balanced model saved successfully!
