In [65]:
#Import necessary libraries 
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [66]:
#Read in data
df = pd.read_csv("texas.csv")
df.head()

Unnamed: 0,region_id,region_type_id,region_name,region_type,period_begin,period_end,duration,total_homes_sold,total_homes_sold_yoy,average_homes_sold,...,average_adjustment_average_homes_sold,adjusted_average_homes_sold,average_adjustment_average_new_listings,adjusted_average_new_listings,average_adjustment_pending_sales,adjusted_pending_sales,adjusted_average_homes_delisted_yoy,adjusted_average_homes_sold_yoy,adjusted_average_new_listings_yoy,adjusted_pending_sales_yoy
0,2691,5,"Crane County, TX",county,2022-08-15,2022-08-21,1 weeks,,,,...,,,,2.0,,1.0,,,,
1,2691,5,"Crane County, TX",county,2022-05-30,2022-08-21,12 weeks,3.0,-0.571429,1.0,...,,1.0,,1.0,,1.0,,0.0,-0.5,0.0
2,2691,5,"Crane County, TX",county,2022-07-25,2022-08-21,4 weeks,,,,...,,,,2.0,,1.0,,,0.0,0.0
3,2712,5,"Falls County, TX",county,2022-08-15,2022-08-21,1 weeks,,,,...,,,,5.0,,2.0,,,0.25,1.0
4,2712,5,"Falls County, TX",county,2022-05-30,2022-08-21,12 weeks,19.0,-0.095238,2.0,...,,2.0,,2.0,,1.0,0.0,1.0,0.0,0.0


In [67]:
# #Print half of the columns
# for i in range(int(len(df.columns) / 2)):
#     print (df.columns[i])

We can't perform linear regression with qualitative data, so first we're going to use the quantitative data. The problem is that there are too many NaN values that the model accuracy becomes very low. If we drop the NaN values, there's not enough data to make a proper model. Instead, we halved the value of quantiative columns, so even though there are Nan values, there's less and a model can actually be made. 

In [68]:
#Only select the quantitative columns
quantitative_columns = df.select_dtypes(include=['int', 'float'])

In [69]:
test_two = quantitative_columns.columns[0:int(len(df.columns) / 2)]
df_mod = quantitative_columns[test_two].dropna()

In [70]:
region_type_id = df_mod.loc[:, "region_type_id"]
total_homes_sold_yoy = df_mod.loc[:, "total_homes_sold_yoy"]
median_sale_price = df_mod.loc[:, "median_sale_price"]


In [71]:
# #just an example to see stats using the describe function
# df[df["region_name"]=="crane county, TX"].describe()

You can choose any column to put here. Basically we're comapring every other varible in the df_mod data and seeing it's relationshop to the target varible, for example the median sales price of homes. 

In [72]:
#Select a target variable
y = df_mod['median_sale_price']  
X = df_mod.drop(['median_sale_price'], axis=1)  

In [73]:
# 75% train set, 25% test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=42)

In [74]:
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

### This model performs well, yay!

In [75]:
print('model coefficients:', lr.coef_, '\n')
# R^2 is a measure of how well your model does at predicting the target from the features. 
print('R^2:', lr.score(X_train, y_train))  
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))

model coefficients: [-1.52027576e-01 -3.03218030e+02  5.42151402e-01 -1.11608884e+04
  1.13787802e+02  6.34369730e+03  3.05193388e+00 -1.20762544e+02
 -2.39295419e+02  1.31609172e+02  5.02004602e+04 -3.48264934e+04
  8.69360868e+04  2.08476289e+03 -1.27000035e+05 -3.90008275e+02
  3.22367408e+02  5.29648569e+00  5.32959079e+02  2.53451193e+04
 -3.10201903e+04  1.57107021e+02  4.26404603e+03  9.20671367e+01
 -9.26320439e+04 -2.41808555e+02 -1.20344229e+03 -2.83720851e+01
  7.54800673e+02  3.36147474e+04  8.32380325e+03 -1.90031062e+04
 -2.63151474e+04 -2.89446295e-02 -5.19848115e+02 -1.97056733e+01
  9.26503703e+02  1.42500857e-01 -3.18198006e+04 -5.50839247e+02
  6.44951063e+04 -2.28807744e+01  4.04723690e+04 -1.36117383e-01
 -2.27429916e+05  1.93672024e+01  1.81328914e+05 -6.93789864e+01
  7.67656677e+01  2.05002937e+01] 

R^2: 0.9122608095966863
Mean Squared Error: 514443952.1293837


### Linear Regression Model after doing One-Hot Encoding

In [76]:
# Separate the target variable (median_sale_price)
target_variable = data['median_sale_price']

# Drop the target variable to only process the features
data_without_target = data.drop(columns=['median_sale_price'])

In [77]:
# Identify categorical columns and perform one-hot encoding
categorical_columns = data_without_target.select_dtypes(include=['object']).columns.tolist()
one_hot_data = pd.get_dummies(data_without_target, columns=categorical_columns, drop_first=True)

# Extract the total number of columns after one-hot encoding
all_columns = one_hot_data.columns.tolist()
#print(all_columns)

In [78]:
# Determine the number of new columns created by one-hot encoding
num_new_columns = len(all_columns) - len(data_without_target.columns)

In [79]:
# Select half of the newly created columns
start_index = len(data_without_target.columns)  # Where new columns start
half_point = start_index + num_new_columns // 2

selected_columns = all_columns[start_index:half_point]

In [80]:
# Create a new DataFrame with the selected columns
half_one_hot_data = one_hot_data[selected_columns]

In [81]:
# Reintroduce the target variable
final_data = pd.concat([half_one_hot_data, target_variable], axis=1)

# Fix NaN in the target variable by dropping rows with NaN
final_data = final_data.dropna(subset=['median_sale_price'])

In [82]:
# Extract the target variable and features
y = final_data['median_sale_price']
X = final_data.drop(['median_sale_price'], axis=1)

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [83]:
# Fit a linear regression model
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("R-squared:", r2_score(y_test, y_pred))

Mean Squared Error: 5997348969.295045
R-squared: 0.5365529168996392
