In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
from path import Path
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sqlalchemy import create_engine

import psycopg2
# import the psycopg2 database adapter for PostgreSQL
from psycopg2 import connect, extensions, sql

In [2]:
import os,sys
parentdir = Path(os.path.abspath("../.."))
sys.path.insert(0,parentdir)

# Get the DB password
from config import db_password

In [3]:
# Create Engine for covid_property_pandemic DB
db_string = f"postgres://postgres:{db_password}@127.0.0.1:5432/covid_property_pandemic"
engine = create_engine(db_string)
conn = engine.connect()

In [5]:
## Get Data from Housing Table

sql_str = 'SELECT * FROM "FL_CA_Housing_Data"'

df = pd.read_sql(sql_str,conn)
df.drop(columns=['index','Unnamed: 0'],axis=1,inplace=True)
print(df.shape)
df.sample(n=10)


(328248, 6)


Unnamed: 0,RegionName,State,Metro,CountyName,Date,Avg_Price
240250,San Geronimo,CA,San Francisco-Oakland-Hayward,Marin County,2013-09-30,760518
255286,University,FL,Tampa-St. Petersburg-Clearwater,Hillsborough County,2014-11-30,85038
230462,North Lauderdale,FL,Miami-Fort Lauderdale-West Palm Beach,Broward County,2013-01-31,111011
224793,Eastvale,CA,Riverside-San Bernardino-Ontario,Riverside County,2012-08-31,382354
80706,La Crescenta-Montrose,CA,Los Angeles-Long Beach-Anaheim,Los Angeles County,2001-12-31,358950
78335,Norco,CA,Riverside-San Bernardino-Ontario,Riverside County,2001-10-31,265949
68908,Tallahassee,FL,Tallahassee,Leon County,2001-02-28,122467
231797,Palm Springs,FL,Miami-Fort Lauderdale-West Palm Beach,Palm Beach County,2013-02-28,102561
105000,Cortez,FL,North Port-Sarasota-Bradenton,Manatee County,2003-09-30,298020
127339,Freeport,FL,Crestview-Fort Walton Beach-Destin,Walton County,2005-05-31,268987


In [6]:
FL_df = df.loc[df['State'] == 'FL']
print(FL_df.shape)
FL_df.sample(n=10)

(142008, 6)


Unnamed: 0,RegionName,State,Metro,CountyName,Date,Avg_Price
90871,West Melbourne,FL,Palm Bay-Melbourne-Titusville,Brevard County,2002-09-30,149803
46775,Westchase,FL,Tampa-St. Petersburg-Clearwater,Hillsborough County,1999-06-30,160841
138387,Gladeview,FL,Miami-Fort Lauderdale-West Palm Beach,Miami-Dade County,2006-03-31,177369
26523,Glenvar Heights,FL,Miami-Fort Lauderdale-West Palm Beach,Miami-Dade County,1997-12-31,243335
72442,Deerfield Beach,FL,Miami-Fort Lauderdale-West Palm Beach,Broward County,2001-05-31,129249
231422,Palm Coast,FL,Deltona-Daytona Beach-Ormond Beach,Flagler County,2013-02-28,144160
183587,Pembroke Park,FL,Miami-Fort Lauderdale-West Palm Beach,Broward County,2009-07-31,113509
31416,Lake Belvedere Estates,FL,Miami-Fort Lauderdale-West Palm Beach,Palm Beach County,1998-04-30,114603
79597,Yulee,FL,Jacksonville,Nassau County,2001-11-30,137142
181424,South Pasadena,FL,Tampa-St. Petersburg-Clearwater,Pinellas County,2009-05-31,193300


In [20]:
## Filter out the Dataset for prices between 2010 and  2019
FL_df.dtypes


RegionName    object
State         object
Metro         object
CountyName    object
Date          object
Avg_Price      int64
dtype: object

In [23]:
# Date column is of type String. Convert it to datetime first
FL_df['Date'] = pd.to_datetime(FL_df['Date'])
FL_df.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  FL_df['Date'] = pd.to_datetime(FL_df['Date'])


RegionName            object
State                 object
Metro                 object
CountyName            object
Date          datetime64[ns]
Avg_Price              int64
dtype: object

In [29]:
start_date_train = '1/1/2010'
end_date_train = '12/31/2019'

In [30]:
mask = (FL_df['Date'] > start_date_train) & (FL_df['Date'] <= end_date_train)
FL_df_train = FL_df.loc[mask]
FL_df_train.head()

Unnamed: 0,RegionName,State,Metro,CountyName,Date,Avg_Price
189595,Jacksonville,FL,Jacksonville,Duval County,2010-01-31,146582
189597,Orlando,FL,Orlando-Kissimmee-Sanford,Orange County,2010-01-31,155102
189600,Miami,FL,Miami-Fort Lauderdale-West Palm Beach,Miami-Dade County,2010-01-31,223207
189603,Tampa,FL,Tampa-St. Petersburg-Clearwater,Hillsborough County,2010-01-31,151244
189608,Naples,FL,Naples-Immokalee-Marco Island,Collier County,2010-01-31,278643


In [31]:
start_date_test = '1/1/2020'
end_date_test = '12/31/2020'

In [32]:
mask = (FL_df['Date'] > start_date_test) & (FL_df['Date'] <= end_date_test)
FL_df_test = FL_df.loc[mask]
FL_df_test

Unnamed: 0,RegionName,State,Metro,CountyName,Date,Avg_Price
324871,Jacksonville,FL,Jacksonville,Duval County,2020-01-31,197045
324873,Orlando,FL,Orlando-Kissimmee-Sanford,Orange County,2020-01-31,267339
324876,Miami,FL,Miami-Fort Lauderdale-West Palm Beach,Miami-Dade County,2020-01-31,395652
324879,Tampa,FL,Tampa-St. Petersburg-Clearwater,Hillsborough County,2020-01-31,254874
324884,Naples,FL,Naples-Immokalee-Marco Island,Collier County,2020-01-31,398002
...,...,...,...,...,...,...
328240,Captiva,FL,Cape Coral-Fort Myers,Lee County,2020-03-31,2312726
328243,Homeland,FL,Lakeland-Winter Haven,Polk County,2020-03-31,128242
328244,Okahumpka,FL,Orlando-Kissimmee-Sanford,Lake County,2020-03-31,93233
328245,Hillcrest Heights,FL,Lakeland-Winter Haven,Polk County,2020-03-31,180944


In [35]:
# Get the model ready
import numpy as np
X = np.array(FL_df_train['Date']).reshape(-1,1)
y = np.array(FL_df_train['Avg_Price']).reshape(-1,1)

In [36]:
model = LinearRegression()
model.fit(X, y)
y_pred = model.predict(X)

TypeError: invalid type promotion

In [7]:
CA_df = df.loc[df['State'] == 'CA']
print(CA_df.shape)
CA_df.sample(n=10)

(186240, 6)


Unnamed: 0,RegionName,State,Metro,CountyName,Date,Avg_Price
254705,Del Rey,CA,Fresno,Fresno County,2014-10-31,148486
281309,Stanton,CA,Los Angeles-Long Beach-Anaheim,Orange County,2016-10-31,498022
127466,Belvedere,CA,San Francisco-Oakland-Hayward,Marin County,2005-05-31,1957855
66024,Nipomo,CA,San Luis Obispo-Paso Robles-Arroyo Grande,San Luis Obispo County,2000-11-30,293151
32994,Colton,CA,Riverside-San Bernardino-Ontario,San Bernardino County,1998-06-30,94502
171859,Newark,CA,San Francisco-Oakland-Hayward,Alameda County,2008-09-30,476738
325821,Ross,CA,San Francisco-Oakland-Hayward,Marin County,2020-01-31,2902280
171606,Hayward,CA,San Francisco-Oakland-Hayward,Alameda County,2008-09-30,388390
39713,Lynwood,CA,Los Angeles-Long Beach-Anaheim,Los Angeles County,1998-12-31,142807
261704,San Francisco,CA,San Francisco-Oakland-Hayward,San Francisco County,2015-05-31,1110157


In [None]:
#Target Variable is 2020 Price. We want to see the relationship between Year and price
plt.scatter(df.Date, df.price)
plt.xlabel("Months")
plt.ylabel("Housing price USD $")
plt.show()

In [None]:
#Declaring features and targets

X = df.drop("Target_price")
y=df.Target_price

In [None]:
#instantized and fit data to model and predictions
model = LinearRegression()

model.fit(X, y)

y_pred = model.predict(X)

In [None]:
#Print the coeffiecient and intercept of the nodel

print(model.coef_)
print(model.intercept_)

In [None]:
#Print regression line

plt.scatter(X,y)
plt.plot(X, y_pred, color="red")
plt.show()