# In [None]:  (Jupyter cell prompt left over from the notebook export)
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
import joblib

# -------------------
# Load your dataset
# -------------------
# Reads "Salary_Data.csv" relative to the current working directory. The file
# must contain a "Salary" column; every other column is used as a feature.
df = pd.read_csv("Salary_Data.csv")

# Rows without a recorded Salary carry no label to learn from, and
# LinearRegression refuses NaN targets, so discard them up front.
# This is a no-op when the target column has no gaps.
df = df.dropna(subset=["Salary"])

# Target column
y = df["Salary"]

# Features (everything except Salary)
X = df.drop("Salary", axis=1)

# Identify columns: object-dtype columns are treated as categorical, the rest
# as numeric. numeric_cols is not consumed by the transformer below (those
# columns reach the model through remainder="passthrough"); it is kept as a
# module-level name for inspection.
categorical_cols = X.select_dtypes(include=["object"]).columns
numeric_cols = X.select_dtypes(exclude=["object"]).columns

# -------------------
# Build preprocessing + model pipeline
# -------------------
# handle_unknown="ignore" lets the saved model score rows whose category was
# never seen during training instead of raising at predict time.
# NOTE(review): missing values in numeric feature columns are still passed
# through untouched, and LinearRegression does not accept NaN inputs. Add an
# imputation step here if the dataset can contain such gaps.
preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)
    ],
    remainder="passthrough"   # numeric columns go as-is
)

# Bundling preprocessing with the estimator means the pickled artifact accepts
# raw feature DataFrames directly.
model = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", LinearRegression())
])

# -------------------
# Train-test split
# -------------------
# 80/20 split; the fixed random_state keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# -------------------
# Fit model
# -------------------
model.fit(X_train, y_train)

# -------------------
# Evaluate on the held-out split
# -------------------
# The test split was previously created but never used. Report R^2 on it so
# the quality of the model being saved is visible.
test_r2 = model.score(X_test, y_test)
print(f"Held-out R^2: {test_r2:.3f}")

# -------------------
# Save trained model
# -------------------
# Persists the whole fitted pipeline (encoder + regressor) to the current
# working directory.
joblib.dump(model, "salary_model.pkl")

print("Model saved successfully!")
