In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import joblib
import os


def basic_cleaning(df: pd.DataFrame) -> pd.DataFrame:
    """Return a de-duplicated copy of *df* with missing values imputed.

    Exact duplicate rows are dropped. Missing values in numeric columns are
    replaced by the column median; missing values in every other column are
    replaced by that column's most frequent value (the first mode).

    A column with no observed values at all cannot be imputed and is left
    untouched (still all-missing) instead of raising.

    Args:
        df: Input frame. It is copied first and never modified.

    Returns:
        A new DataFrame with the same columns as *df*.
    """
    df = df.copy()
    df.drop_duplicates(inplace=True)
    num_cols = df.select_dtypes(include=[np.number]).columns
    cat_cols = df.select_dtypes(exclude=[np.number]).columns

    if len(num_cols) > 0:
        df[num_cols] = df[num_cols].fillna(df[num_cols].median())
    for col in cat_cols:
        if not df[col].isnull().any():
            continue
        modes = df[col].mode()
        # mode() ignores missing values, so an all-missing column yields an
        # empty result; indexing it with .iloc[0] would raise IndexError.
        if modes.empty:
            continue
        df[col] = df[col].fillna(modes.iloc[0])
    return df


# --- CONFIG: update before running ---
# Input CSV location (raw string so the Windows backslashes stay literal).
CSV_PATH = r"C:\Users\Admin\Desktop\Harshvardhan Project\.venv\CSV\job_market_trends.csv"
TARGET = "trend_category"  # change to your target column
# Destination of the joblib artifact written at the end of the run.
OUT_PATH = "trained_model.pkl"

# --- Run training ---
print("Loading CSV:", CSV_PATH)
df = pd.read_csv(CSV_PATH)

# Fail fast on a misconfigured target before doing any cleaning work.
# basic_cleaning does not add or remove columns, so the check is equivalent
# whether it runs before or after cleaning.
if TARGET not in df.columns:
    raise ValueError(f"Target column '{TARGET}' not found in CSV columns: {list(df.columns)}")

df = basic_cleaning(df)

# Map the target labels to integer class ids; the fitted encoder is saved with
# the model so predictions can be mapped back to the original labels.
le = LabelEncoder()
df["target_encoded"] = le.fit_transform(df[TARGET])

# Select numeric features by default (excluding target)
features = [c for c in df.select_dtypes(include=[np.number]).columns if c not in [TARGET, "target_encoded"]]
if not features:
    raise ValueError("No numeric features found. Provide explicit feature columns.")

X = df[features]
y = df["target_encoded"]

# A stratified split requires at least two samples of every class; with a
# rarer class train_test_split raises ValueError. Fall back to a plain random
# split in that case instead of aborting the run.
stratify_labels = y if y.value_counts().min() >= 2 else None
if stratify_labels is None:
    print("Warning: at least one class has fewer than 2 samples; splitting without stratification.")

# X_test / y_test are held out but not used further in this script.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=stratify_labels
)

model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Bundle everything needed at inference time: the fitted model, the label
# encoder, and the ordered list of feature column names.
artifact = {"model": model, "label_encoder": le, "features": features}
joblib.dump(artifact, OUT_PATH)
print(f"Saved trained artifact to: {OUT_PATH}")


# Training-only notebook

This notebook was converted to a minimal training-only workflow.

- Loads a CSV
- Performs basic cleaning
- Encodes the target
- Trains a RandomForestClassifier on the numeric feature columns
- Saves an artifact containing the model, label encoder and features

Update the CSV path (`CSV_PATH`) and the target column (`TARGET`) in the CONFIG section of the code cell before running.