In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder

# Load the dataset (replace this with the actual dataset path)
file_path = '/mnt/data/Metro_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv'
zillow_data = pd.read_csv(file_path)

# Data Preparation: using 2024-08-31 housing prices.
# Rows missing any of the four selected fields are dropped. Columns are renamed
# through an explicit mapping so each new label stays tied to its source column
# even if the selection list above is reordered later.
zillow_data_latest = (
    zillow_data[['RegionName', 'StateName', 'SizeRank', '2024-08-31']]
    .dropna()
    .rename(columns={
        'RegionName': 'Region',
        'StateName': 'State',
        '2024-08-31': 'Price',
    })
)

# One-hot encode State for the model. This must happen BEFORE the column is
# overwritten with integer codes below. drop_first=True removes one reference
# state so the dummies are not collinear with the regression intercept;
# dtype=float keeps X purely numeric.
state_dummies = pd.get_dummies(
    zillow_data_latest['State'],
    prefix='State',
    drop_first=True,
    dtype=float,
)

# Encode categorical variables (Region, State) as integer codes inside the frame.
# The codes are compact identifiers only (le_*.inverse_transform maps them back
# to names). They are deliberately NOT used as model inputs: LabelEncoder numbers
# categories by sorted label order, so a linear model would treat that arbitrary
# alphabetical position as a meaningful quantity. scikit-learn documents
# LabelEncoder as an encoder for the target y, not for input features.
le_region = LabelEncoder()
le_state = LabelEncoder()
zillow_data_latest['Region'] = le_region.fit_transform(zillow_data_latest['Region'])
zillow_data_latest['State'] = le_state.fit_transform(zillow_data_latest['State'])

# Define features (X) and target (y).
# X = SizeRank plus the one-hot State columns (aligned on the shared row index).
# NOTE(review): Region is presumably a near-unique metro name (one row per
# metro in this file -- confirm), so it carries no generalizable signal and is
# left out of X.
X = pd.concat([zillow_data_latest[['SizeRank']], state_dummies], axis=1)
y = zillow_data_latest['Price']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a linear regression on the training rows, then predict the held-out rows
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Test-set metrics: R^2, plus RMSE derived as the square root of the MSE
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'R-squared (R2 Score): {r2}')

# Tabulate the fitted coefficient for every column of X.
# NOTE(review): these are raw regression coefficients (target units per unit of
# each feature), so their magnitudes are only comparable between features that
# share a scale.
feature_importance = pd.DataFrame(
    {
        'Feature': X.columns,
        'Importance': model.coef_,
    }
)

# Show the table in the cell output
print(feature_importance)

# Horizontal bar chart: one bar per feature, bar length = coefficient
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=feature_importance, x='Importance', y='Feature', ax=ax)
ax.set_title('Feature Importance')
plt.show()

# Visualization: Scatter plot of actual vs predicted prices
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.7)

# Reference diagonal (predicted == actual) across the range of actual prices.
# The bounds are computed once with the vectorized min()/max() methods instead
# of iterating y_test element by element with the Python builtins four times;
# the methods also skip NaN rather than returning an order-dependent result.
price_min, price_max = y_test.min(), y_test.max()
plt.plot([price_min, price_max], [price_min, price_max], color='red', linestyle='--')

plt.xlabel('Actual Prices')
plt.ylabel('Predicted Prices')
plt.title('Actual vs Predicted Housing Prices')
plt.show()
