In [None]:
# %% [markdown]
# # Boston Housing Price Prediction - Model Training
# 
# This notebook trains a linear regression model on the Boston Housing dataset and pickles it to `model.pkl` for use in the Flask application.

# %%
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import pickle
import matplotlib.pyplot as plt
import seaborn as sns

# %%
# Read the housing table from a CSV path resolved against the current working directory
df = pd.read_csv('BostonHousing.csv')

# Preview the leading rows (the notebook renders a cell's final expression)
df.head(n=5)

# %%
# Summarise the frame's structure: columns, dtypes and non-null counts
df.info()

# %%
# Count missing entries per column (isna is the canonical spelling of isnull)
df.isna().sum()

# %%
# Descriptive statistics for the columns
df.describe()

# %%
# Histogram with a KDE overlay for the target column, 'price'
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(df['price'], kde=True, ax=ax)
ax.set(title='Distribution of House Prices', xlabel='Price ($1000s)', ylabel='Count')
plt.show()

# %%
# Annotated heatmap of the pairwise correlations between the frame's columns
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

# %%
# Separate the 'price' target from the remaining predictor columns
y = df['price']
X = df.drop(columns='price')

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# %%
# Fit a linear regression on the training partition
# (fit stays the final expression so the notebook shows the estimator repr)
model = LinearRegression()
model.fit(X_train, y_train)

# %%
# Predict prices for the held-out rows
y_pred = model.predict(X_test)

# %%
# Score the held-out predictions: R^2, MSE, and RMSE (the square root of MSE,
# which is expressed in the same units as the target)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Report each metric rounded to two decimals
print(f"Mean Squared Error: {mse:.2f}")
print(f"Root Mean Squared Error: {rmse:.2f}")
print(f"R-squared Score: {r2:.2f}")

# %%
# Scatter of predictions against ground truth; points on the dashed diagonal
# are predictions that match the actual price exactly
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred, alpha=0.5)
price_range = [y_test.min(), y_test.max()]
ax.plot(price_range, price_range, 'k--', lw=2)
ax.set(
    xlabel='Actual Prices',
    ylabel='Predicted Prices',
    title='Actual vs Predicted House Prices',
)
plt.show()

# %%
# Feature importance
# A raw coefficient is the change in predicted price per ONE UNIT of its
# feature, so its magnitude depends on the feature's units and raw
# coefficients cannot be compared across features. Multiplying each
# coefficient by the feature's standard deviation in the training data gives
# the change in predicted price per one-standard-deviation change of that
# feature, which is comparable across features.
# 'Feature' and 'Coefficient' are kept as before; 'Scaled Coefficient' is the
# added, comparable quantity, and it drives both the ordering and the plot.
coefficients = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': model.coef_,
    # X_train has the same column order as X, so the arrays align positionally
    'Scaled Coefficient': model.coef_ * X_train.std().to_numpy(),
}).sort_values(by='Scaled Coefficient', ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(x='Scaled Coefficient', y='Feature', data=coefficients)
plt.title('Feature Importance (Coefficients Scaled by Feature Std. Dev.)')
plt.xlabel('Change in predicted price per 1 std. dev. of feature')
plt.show()

# %%
# Persist the fitted estimator to disk with pickle
with open('model.pkl', mode='wb') as file:
    pickle.dump(model, file)

print("Model saved as model.pkl")

# %%
# Smoke-test the in-memory model on the first row of the feature table.
# That row may have been part of the training split, so this is a sanity
# check rather than an evaluation.
sample_data = X.head(1)  # one-row DataFrame, equivalent to X.iloc[0:1]
sample_prediction = model.predict(sample_data)

print(f"Sample data prediction: {sample_prediction[0]:.2f}")
print(f"Actual price: {y.iloc[0]:.2f}")