In [63]:
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [64]:
# Load the state-level CSV (State_zhvi.csv) into a DataFrame.
# The file location can be overridden with the STATE_ZHVI_CSV environment
# variable so the notebook can run on other machines; when the variable is
# unset, the original local path is used unchanged.
STATE_ZHVI_CSV = os.environ.get(
    'STATE_ZHVI_CSV',
    '/Users/timwillard/Desktop/group3-project2-ml/tw/Zillow/State_zhvi.csv',
)
state_data = pd.read_csv(STATE_ZHVI_CSV)

In [65]:
# Count the missing values in every column of the raw table and show the totals
null_counts = state_data.isnull().sum()
print(null_counts)

RegionID       0
SizeRank       0
RegionName     0
RegionType     0
StateName     51
              ..
2024-03-31     0
2024-04-30     0
2024-05-31     0
2024-06-30     0
2024-07-31     0
Length: 300, dtype: int64


In [66]:
# Drop the identifier/metadata columns that are not needed; RegionName and the
# monthly date columns are kept.
columns_to_drop = ['RegionID', 'SizeRank', 'RegionType', 'StateName']
state_data_cleaned = state_data.drop(columns=columns_to_drop)
print(state_data_cleaned.head())

     RegionName     2000-01-31     2000-02-29     2000-03-31     2000-04-30  \
0    California  191545.840185  192189.682926  193059.028286  194946.117850   
1         Texas  112965.763960  113027.152474  113056.926865  113204.699809   
2       Florida  107876.988002  108110.659600  108393.916155  108969.703876   
3      New York  155482.537043  156036.207607  156569.204812  157715.536908   
4  Pennsylvania   99875.274333  100088.899981  100289.650707  100698.750181   

      2000-05-31     2000-06-30     2000-07-31     2000-08-31     2000-09-30  \
0  197139.575194  199506.309297  202005.749556  204615.297172  207230.900575   
1  113300.399884  113407.702966  113478.217693  113686.827964  113960.389530   
2  109590.628885  110224.172449  110860.728771  111509.028745  112180.996279   
3  158901.580849  160184.022348  161336.932702  162346.109844  163242.860648   
4  101118.997774  101550.898609  102009.550658  102447.383700  102893.494537   

   ...     2023-10-31     2023-11-30     202

In [67]:
# Count missing values per remaining column and print only the columns that
# actually contain gaps
missing_data = state_data_cleaned.isnull().sum()
columns_with_gaps = missing_data > 0
print(missing_data[columns_with_gaps])

2000-01-31    4
2000-02-29    4
2000-03-31    4
2000-04-30    4
2000-05-31    4
             ..
2008-10-31    2
2008-11-30    1
2008-12-31    1
2012-05-31    1
2019-03-31    1
Length: 110, dtype: int64


In [68]:
# Drop every row that has at least one missing value
state_data_cleaned = state_data_cleaned.dropna(axis=0, how='any')

In [69]:
# Reshape from wide (one column per date) to long (one row per
# RegionName/Date pair), then parse the former column labels as datetimes
state_data_long = pd.melt(
    frame=state_data_cleaned,
    id_vars=['RegionName'],
    var_name='Date',
    value_name='Price',
)
parsed_dates = pd.to_datetime(state_data_long['Date'])
state_data_long['Date'] = parsed_dates
print(state_data_long.head())

     RegionName       Date          Price
0    California 2000-01-31  191545.840185
1         Texas 2000-01-31  112965.763960
2       Florida 2000-01-31  107876.988002
3      New York 2000-01-31  155482.537043
4  Pennsylvania 2000-01-31   99875.274333


In [70]:
# Round the 'Price' column to whole numbers (zero decimal places); the values
# stay floats, e.g. 191545.84 -> 191546.0
rounded_price = state_data_long['Price'].round()
state_data_long['Price'] = rounded_price
print(state_data_long.head())

     RegionName       Date     Price
0    California 2000-01-31  191546.0
1         Texas 2000-01-31  112966.0
2       Florida 2000-01-31  107877.0
3      New York 2000-01-31  155483.0
4  Pennsylvania 2000-01-31   99875.0


In [71]:
# Derive calendar features (year and month number) from the parsed 'Date' column
date_parts = state_data_long['Date'].dt
state_data_long['Year'] = date_parts.year
state_data_long['Month'] = date_parts.month
print(state_data_long.head())

     RegionName       Date     Price  Year  Month
0    California 2000-01-31  191546.0  2000      1
1         Texas 2000-01-31  112966.0  2000      1
2       Florida 2000-01-31  107877.0  2000      1
3      New York 2000-01-31  155483.0  2000      1
4  Pennsylvania 2000-01-31   99875.0  2000      1


In [72]:
from sklearn.model_selection import train_test_split

In [73]:
# Separate the predictors (Year, Month, RegionName) from the target (Price)
X = state_data_long.loc[:, ['Year', 'Month', 'RegionName']]
y = state_data_long.loc[:, 'Price']

In [80]:
# One-hot encode 'RegionName'; drop_first=True omits the first category so it
# acts as the baseline level.
# NOTE(review): X is overwritten here, so this cell expects the un-encoded X
# (still containing 'RegionName') produced by the previous cell.
categorical_columns = ['RegionName']
X = pd.get_dummies(data=X, columns=categorical_columns, drop_first=True)

In [81]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
# NOTE(review): rows are shuffled at random, so months from the same region end
# up in both sets. If the goal is forecasting future prices, a chronological
# split may be more appropriate — confirm intent.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Show the shape of each of the four resulting pieces
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(9912, 43) (2478, 43) (9912,) (2478,)


In [82]:
from sklearn.linear_model import LinearRegression

In [83]:
# Initialize the model: a scikit-learn LinearRegression with default settings.
# It is fitted on the training split in a later cell.
model = LinearRegression()

In [84]:
# Keep only the training rows whose target is present. The boolean mask is
# built from y_train and applied to both X_train and y_train so they stay aligned.
valid_indices = ~y_train.isna()
X_train, y_train = X_train.loc[valid_indices], y_train.loc[valid_indices]

In [85]:
# Fit the linear regression on the training features and target
model.fit(X=X_train, y=y_train)

In [86]:
# y_train cannot contain NaN at this point: rows with a missing target were
# already removed from X_train / y_train before the model was fitted. A median
# fill would therefore change nothing, and refitting on the same data would
# only repeat the previous fit. Check the invariant instead of imputing and
# training a second time.
assert not y_train.isna().any(), "y_train still contains NaN values"

In [None]:
# from sklearn.impute import SimpleImputer

# # Impute missing values in y_train
# imputer = SimpleImputer(strategy='median')  # You can also try 'mean', 'most_frequent', etc.
# y_train = imputer.fit_transform(y_train.values.reshape(-1, 1)).ravel()

# # Train the model
# model.fit(X_train, y_train)

In [87]:
from sklearn.metrics import mean_squared_error, r2_score

In [88]:
# Predict prices for the held-out test rows with the fitted linear model
y_pred = model.predict(X=X_test)

In [89]:
# Sanity check before scoring: count NaNs in the true targets and in the predictions
y_test_nan_count = y_test.isna().sum()
y_pred_nan_count = np.isnan(y_pred).sum()
print("NaN values in y_test:", y_test_nan_count)
print("NaN values in y_pred:", y_pred_nan_count)

NaN values in y_test: 0
NaN values in y_pred: 0


In [90]:
# Score the linear regression on the test set.
# R-squared: share of the target variance explained by the predictions
r2 = r2_score(y_test, y_pred)
# Mean squared error: average squared difference between truth and prediction
mse = mean_squared_error(y_test, y_pred)

# Report both metrics (MSE first, then R-squared)
print(f"Mean Squared Error: {mse}")
print(f"R-Squared Score: {r2}")

Mean Squared Error: 2488564820.745668
R-Squared Score: 0.8121571588062075


In [91]:
from sklearn.ensemble import RandomForestRegressor
# Random forest regressor with a fixed random_state so repeated runs are reproducible.
rf_model = RandomForestRegressor(random_state=42)

In [92]:
# Fit the random forest on the same training split used for the linear model
rf_model.fit(X=X_train, y=y_train)

In [93]:
# Predict prices for the test rows with the fitted random forest.
# The variable name (including its spelling) is kept because the evaluation
# cell reads it.
rf_y_predictins = rf_model.predict(X=X_test)

In [94]:
# Calculate the mean squared error of the random forest predictions
rf_mse = mean_squared_error(y_test, rf_y_predictins)

# Calculate the R-squared score of the random forest predictions
r2_rf = r2_score(y_test, rf_y_predictins)

# Print the evaluation metrics.
# The MSE is stored in `rf_mse`. The print previously referenced `mse_rf`,
# which no visible cell assigns, so it raised NameError on a fresh kernel.
print(f"Random Forest Mean Squared Error: {rf_mse}")
print(f"Random Forest R-Squared Score: {r2_rf}")

Random Forest Mean Squared Error: 5849157.9080089955
Random Forest R-Squared Score: 0.9995584915326006


In [95]:
from sklearn.linear_model import Lasso

In [96]:
# Create a Lasso (L1-regularised) regression with regularisation strength alpha=1.0
lasso = Lasso(alpha=1.0)

# Train it on the training split
lasso.fit(X=X_train, y=y_train)

In [98]:
# Predict prices for the test rows with the fitted Lasso model
lasso_y_pred = lasso.predict(X=X_test)


In [99]:
# Evaluate the Lasso model on the test set.
# Both metrics are computed from lasso_y_pred. The R-squared was previously
# computed from y_pred (the LinearRegression predictions), so it did not
# describe the Lasso model.
# NOTE(review): this reuses the names `mse` and `r2`, overwriting the
# LinearRegression metrics computed earlier in the notebook.
mse = mean_squared_error(y_test, lasso_y_pred)
r2 = r2_score(y_test, lasso_y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-Squared Score: {r2}")

Mean Squared Error: 2488563315.6778226
R-Squared Score: 0.8121572724123383
