In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import pickle

# Step 2: Load the Dataset
df = pd.read_csv('dataset/education_career_success_revised_job_status.csv')

# Step 3: Preprocessing
# Build the binary target: 1 where 'Job_Status' equals 'Yes', 0 for every other
# value (missing values also compare unequal and therefore become 0).
df['Job_Status_Binary'] = (df['Job_Status'] == 'Yes').astype(int)

# Drop the original 'Job_Status' column; it is now represented by the binary target.
df = df.drop(columns=['Job_Status'])

# Handle categorical features
# We treat 'Certifications' as categorical here since it has a limited number of unique values
categorical_cols = ['Gender', 'Field_of_Study', 'Certifications', 'Current_Job_Level']
# drop_first drops one dummy per category to avoid perfectly collinear columns.
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Define features (X) and target (y)
X = df_encoded.drop('Job_Status_Binary', axis=1)
y = df_encoded['Job_Status_Binary']

# Save the column order for the Flask app to use
columns = X.columns.tolist()
with open('columns.pkl', 'wb') as f:
    pickle.dump(columns, f)

# Step 4: Split Data
# Split BEFORE fitting the scaler so that the held-out rows do not contribute
# to the scaling statistics (fitting on the full data leaks test information
# into preprocessing and makes the evaluation optimistic).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Work on explicit copies so the column assignments below never write into X.
X_train = X_train.copy()
X_test = X_test.copy()

# Step 5: Scale Numerical Features
numerical_cols = ['University_Ranking', 'Updated_University_GPA', 'Internships_Completed',
                  'Projects_Completed', 'Soft_Skills_Score', 'Job_Offers']
scaler = StandardScaler()
# Learn mean/std from the training rows only, then reuse them for the test rows.
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

# Save the scaler for the Flask app
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

# Step 6: Train the Model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Step 7: Evaluate the Model (Optional but recommended)
# NOTE(review): a previous run of this cell printed an accuracy of 1.00. A perfect
# score usually means some feature reveals the target directly -- possibly
# 'Job_Offers' or the 'Current_Job_Level' dummies; verify against how
# 'Job_Status' was derived before trusting this model.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# Step 8: Save the Model
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)

print("Model training complete. model.pkl, columns.pkl, and scaler.pkl saved to disk.")

Model Accuracy: 1.00
Model training complete. model.pkl, columns.pkl, and scaler.pkl saved to disk.
