In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Location of the socal.csv dataset. Setting the SOCAL_DATA_PATH environment
# variable overrides it so the notebook can run on another machine; when the
# variable is unset (or empty) the original local path is used unchanged.
filePath = os.environ.get("SOCAL_DATA_PATH") or "/Users/keshavsaraogi/data/socal.csv"
data = pd.read_csv(filePath)

In [None]:
# First look at the raw frame: leading rows, column schema, summary statistics.
print(data.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare -- wrapping it in print() emits a stray "None" line.
data.info()
print(data.describe())

In [None]:
# Count of missing entries in each column of the raw frame.
print(data.isna().sum())

In [None]:
# Histogram of the `price` column (30 bins) with a KDE curve overlaid.
price_ax = sns.histplot(data['price'], bins=30, kde=True)
price_ax.set_title("Distribution of Sales Price")
price_ax.set_xlabel("Price")
price_ax.set_ylabel("Frequency")
plt.show()

In [None]:
# Keep just the numeric columns; this frame feeds the correlation matrix and
# is also the table the features and target are taken from.
numeric_data = data.select_dtypes(include='number')

In [None]:
# Pairwise Pearson correlation (the pandas default) between numeric columns.
correlation_matrix = numeric_data.corr(method="pearson")

In [None]:
# Annotated heatmap of the correlation matrix on an explicit 10x6 figure.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 6))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", ax=heatmap_ax)
heatmap_ax.set_title("Feature Correlation Heatmap")
plt.show()

In [None]:
# Target is the `price` column; features are every other numeric column.
Y = numeric_data['price']
X = numeric_data.drop(columns='price')

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
(X_train, X_test,
 Y_train, Y_test) = train_test_split(X, Y, random_state=42, test_size=0.2)

print(f"Training features shape: {X_train.shape}")
print(f"Testing features shape: {X_test.shape}")

In [None]:
# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits so the test split never influences the scaling.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Random forest with 100 trees and a fixed seed, trained on the scaled
# training features. The bare fit() call stays last so the cell still
# displays the fitted estimator.
model = RandomForestRegressor(random_state=42, n_estimators=100)
model.fit(X=X_train_scaled, y=Y_train)

In [None]:
# Predict on the scaled test features with `model` and score the predictions.
y_pred = model.predict(X_test_scaled)

r2 = r2_score(y_true=Y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=Y_test, y_pred=y_pred)
rmse = np.sqrt(mse)  # square root of the MSE, i.e. in the target's own units

In [None]:
# Label the report: the linear-regression report further down prints the same
# three metrics from the same variable names, so without a header the two
# outputs cannot be told apart.
print("Random Forest Performance:")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R-squared (R²): {r2:.2f}")

In [None]:
# Linear regression trained on the same scaled training features.
linear_model = LinearRegression().fit(X_train_scaled, Y_train)

# NOTE: this rebinds y_pred, replacing any predictions stored under that name.
y_pred = linear_model.predict(X_test_scaled)

In [None]:
# Score whatever predictions y_pred currently holds against the test target
# (in top-to-bottom execution these are the linear-regression predictions).
# NOTE: mse, rmse and r2 are rebound here, replacing earlier values.
r2 = r2_score(y_true=Y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=Y_test, y_pred=y_pred)
rmse = np.sqrt(mse)

In [None]:
# Print the labelled metric report, one metric per line.
print(
    "Linear Regression Performance:",
    f"Mean Squared Error (MSE): {mse:.2f}",
    f"Root Mean Squared Error (RMSE): {rmse:.2f}",
    f"R-squared (R²): {r2:.2f}",
    sep="\n",
)