In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib

# ---------------------------------------------------------------------------
# Fit a linear-regression pipeline on data.csv and persist it with joblib.
#
# Steps: read the CSV, discard three columns, separate the 'price' target
# from the features, hold out 20% of the rows, one-hot encode the two
# columns listed in `categorical_cols` (all other columns pass through
# unchanged), fit, and dump the fitted pipeline to disk.
# ---------------------------------------------------------------------------

# Read the raw table and discard the columns that are not used as features.
df = pd.read_csv("data.csv")
df = df.drop(['date', 'street', 'country'], axis=1)

# 'price' is the regression target; every remaining column is a feature.
y = df['price']
X = df.drop('price', axis=1)

# Columns to one-hot encode, and the list of all remaining feature columns.
categorical_cols = ['city', 'statezip']
numerical_cols = X.columns.drop(categorical_cols).tolist()

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Categories not seen during fit are ignored at transform time instead of
# raising; columns outside `categorical_cols` are forwarded unchanged.
category_encoder = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer(
    [('cat', category_encoder, categorical_cols)],
    remainder='passthrough',
)

# Chain the preprocessing and the estimator so both are fitted and saved
# together as a single object.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

# Fit on the training portion only.
model.fit(X_train, y_train)

# Serialize the fitted pipeline and report where it was written.
joblib.dump(model, 'linear_regression_model.pkl')

print("Model trained and saved as 'linear_regression_model.pkl'")


Model trained and saved as 'linear_regression_model.pkl'
