# Final Project: House Price Prediction
## Corpus Christi Team
### Step 4 (Save to and read from the database)

In [1]:
# Import libraries
import csv
import glob
import math
import os
import re
import time
import warnings

import pandas as pd
import psycopg2
import psycopg2.extras
import psycopg2.sql

from config import db as dbpwd

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

In [2]:
# PostgreSQL connection settings (the password is loaded from the local config module)
hostname = "localhost"
port_id = 5432
database = "real_estate_data_test"
username = "postgres"
pwd = dbpwd

# Connection and cursor handles; they are assigned by the cell that opens the database
conn = cur = None

# State and city written next to every listing saved by this notebook
State, City = "TX", "Houston"

In [3]:
# Define the input data path and input file name
data_path = "../03_Data_Cleaning/data/*.csv"
listing = glob.glob(data_path)

# Keep only the preprocessed files. glob returns matches in arbitrary order, so
# sort them to make the selection reproducible across runs and platforms.
candidates = sorted(file for file in listing if '03b_preprocessed_data' in os.path.basename(file))
if not candidates:
    # Fail here with a clear message instead of a NameError on `input_file` further down
    raise FileNotFoundError(f"No '03b_preprocessed_data' CSV file matches {data_path}")
if len(candidates) > 1:
    print(f'{len(candidates)} preprocessed files found; using the last one in sorted order')
input_file = candidates[-1]
print(f'The input file is {input_file}')

# Define the table name to be saved in the DB: everything from 'data_' up to the
# '.csv' extension, lower-cased. The pattern is applied to the file name only so
# that a 'data_' fragment in a parent folder can never leak into the table name.
regex = r"(data_.+)\.csv$"
match = re.search(regex, os.path.basename(input_file))
if match is None:
    # An empty table name would only surface later as an obscure SQL error
    raise ValueError(f"Cannot derive a table name from {input_file}: expected '...data_<name>.csv'")
name = match.group(1)
table_name = name.lower()
print(f'The input file will be saved to the db as a table named: {table_name}')

The input file is ../03_Data_Cleaning/data\03b_preprocessed_data_Houston_TX_Houses_merged_2023_1_8_12_2.csv
The input file will be saved to the db as a table named: data_houston_tx_houses_merged_2023_1_8_12_2


In [4]:
# Load the preprocessed CSV into a DataFrame and preview its first rows as a quick QC
df = pd.read_csv(input_file)
df.head(n=3)

Unnamed: 0,zid,Address,Lat,Lng,Price,Image,Bedrooms,Bathrooms,lotArea,constructedArea,zipCode,CountyId,taxRate,hasGarage,hasPool,hasCooling,hasView,yearBuilt,stories,parkingSpaces,annualHOI,annualHOA,schoolElemRating,schoolElemDist,schoolMidRating,schoolMidDist,schoolHighRating,schoolHighDist,priceIncreased,priceDiff,days,lotAreaSQFT,avgSchoolRating,avgSchoolDist
0,27541924,"6602 Indian Lake Dr, Missouri City, TX 77489",29.6036,-95.485405,276000,https://photos.zillowstatic.com/fp/bc5a7196226...,3.0,2,8764.272,3612.0,77489,2698,2.41,1,0,1,0,1971.0,2.0,2,1159,$400 annually,6.0,0.5,3,1.3,2,1.1,1,31000,1213,8764.272,3.666667,0.966667
1,27542111,"6727 Castleview Ln, Missouri City, TX 77489",29.600388,-95.48821,220000,https://photos.zillowstatic.com/fp/1336380a837...,3.0,2,7884.36,1270.0,77489,2698,2.41,1,0,1,0,1972.0,1.0,1,924,$144 annually,6.0,0.3,3,1.3,2,1.0,1,65100,503,7884.36,3.666667,0.866667
2,27542455,"15902 Ridgerock Rd, Missouri City, TX 77489",29.595436,-95.48588,230000,https://photos.zillowstatic.com/fp/5603ce32232...,4.0,2,7148.196,1765.0,77489,2698,2.41,1,0,1,0,1974.0,1.0,2,966,$250 annually,6.0,0.3,3,1.0,2,0.6,1,115275,5140,7148.196,3.666667,0.633333


#### Save the table to the DB

In [5]:
# Table layout as (column name, PostgreSQL type). The same list drives both the
# CREATE TABLE and the INSERT statements, so the two can never drift apart.
columns = [
    ('zid', 'INT PRIMARY KEY'),
    ('State', 'VARCHAR (2)'),
    ('City', 'VARCHAR (30)'),
    ('Address', 'VARCHAR(90)'),
    ('Lat', 'FLOAT'),
    ('Lng', 'FLOAT'),
    ('Price', 'INT NOT NULL'),
    ('Image', 'VARCHAR(300)'),
    ('Bedrooms', 'INT'),
    ('Bathrooms', 'INT'),
    ('lotArea', 'FLOAT'),
    ('constructedArea', 'FLOAT'),
    ('zipCode', 'INT NOT NULL'),
    ('CountyID', 'INT'),
    ('taxRate', 'FLOAT'),
    ('hasGarage', 'INT'),
    ('hasPool', 'INT'),
    ('hasCooling', 'INT'),
    ('hasView', 'INT'),
    ('yearBuilt', 'INT'),
    ('stories', 'INT'),
    ('parkingSpaces', 'INT'),
    ('annualHOI', 'INT'),
    ('annualHOA', 'varchar(20)'),
    ('schoolElemRating', 'INT'),
    ('schoolElemDist', 'FLOAT'),
    ('schoolMidRating', 'INT'),
    ('schoolMidDist', 'FLOAT'),
    ('schoolHighRating', 'INT'),
    ('schoolHighDist', 'FLOAT'),
    ('priceIncreased', 'INT'),
    ('priceDiff', 'FLOAT'),
    ('days', 'INT'),
    ('lotAreaSQFT', 'FLOAT'),
    ('avgSchoolRating', 'FLOAT'),
    ('avgSchoolDIST', 'FLOAT'),
]


def _row_to_values(row):
    """Convert one CSV record (a list of strings) into the values of a table row.

    The returned list follows the order of `columns`: the zid first, then the
    notebook-level `State` and `City` constants, then the remaining CSV fields.

    Raises:
        ValueError: if a numeric field cannot be parsed.
        IndexError: if the record has fewer than 34 fields.
    """
    return [
        int(row[0]),                  # zid
        State,                        # State (same value for every row)
        City,                         # City (same value for every row)
        row[1],                       # Address
        float(row[2]),                # Lat
        float(row[3]),                # Lng
        int(row[4]),                  # Price
        row[5],                       # Image
        math.ceil(float(row[6])),     # Bedrooms: written as a float in the CSV, rounded up
        int(row[7]),                  # Bathrooms
        float(row[8]),                # lotArea
        float(row[9]),                # constructedArea
        int(row[10]),                 # zipCode
        int(row[11]),                 # CountyID
        float(row[12]),               # taxRate
        int(row[13]),                 # hasGarage
        int(row[14]),                 # hasPool
        int(row[15]),                 # hasCooling
        int(row[16]),                 # hasView
        math.floor(float(row[17])),   # yearBuilt: written as a float in the CSV
        math.floor(float(row[18])),   # stories: written as a float in the CSV
        int(row[19]),                 # parkingSpaces
        int(row[20]),                 # annualHOI
        row[21],                      # annualHOA (kept as text)
        math.floor(float(row[22])),   # schoolElemRating: written as a float in the CSV
        float(row[23]),               # schoolElemDist
        int(row[24]),                 # schoolMidRating
        float(row[25]),               # schoolMidDist
        int(row[26]),                 # schoolHighRating
        float(row[27]),               # schoolHighDist
        int(row[28]),                 # priceIncreased
        float(row[29]),               # priceDiff: the column is FLOAT, so accept '31000' and '31000.0'
        int(row[30]),                 # days
        float(row[31]),               # lotAreaSQFT
        float(row[32]),               # avgSchoolRating
        float(row[33]),               # avgSchoolDist
    ]


# Reset the handles so the cleanup below never touches a connection left over
# from a previous cell run.
conn = None
cur = None

try:
    # Connect to the database
    conn = psycopg2.connect(host=hostname,
                            dbname=database,
                            user=username,
                            password=pwd,
                            port=port_id)

    # Create a cursor object: it is used to interact with the database
    cur = conn.cursor()

    # Quote the table name as an SQL identifier instead of pasting it into the
    # statement text; the name is derived from a file name.
    table_ident = psycopg2.sql.Identifier(table_name)

    # Remove the table only if it exists
    cur.execute(psycopg2.sql.SQL('DROP TABLE IF EXISTS {}').format(table_ident))

    # Create a DB table with the layout described by `columns`
    column_defs = ', '.join(f'{col_name} {col_type}' for col_name, col_type in columns)
    create_script = psycopg2.sql.SQL(
        'CREATE TABLE IF NOT EXISTS {} (' + column_defs + ')'
    ).format(table_ident)
    cur.execute(create_script)

    # Build the INSERT statement with one placeholder per column
    column_names = ', '.join(col_name for col_name, col_type in columns)
    placeholders = ', '.join(['%s'] * len(columns))
    insert_script = psycopg2.sql.SQL(
        'INSERT INTO {} (' + column_names + ') VALUES (' + placeholders + ')'
    ).format(table_ident).as_string(conn)

    # newline='' is what the csv module asks for; utf-8 matches the default
    # encoding pandas uses when the same file is read for QC.
    with open(input_file, 'r', newline='', encoding='utf-8') as csvfile:
        dataReader = csv.reader(csvfile)
        next(dataReader, None)  # skip the header row (tolerates an empty file)
        rows = [_row_to_values(row) for row in dataReader]

    # Send the rows in batches rather than one round trip per row
    psycopg2.extras.execute_batch(cur, insert_script, rows)

    # DROP, CREATE and the INSERTs are committed together
    conn.commit()
    print(f'Table {table_name} was succesfully saved to the db {database}')

except Exception as error:
    # Nothing was committed: closing the connection below discards the pending
    # changes, so a previously saved table is left as it was.
    print(f'Table {table_name} was NOT saved to the db {database}')
    print(error)
finally:
    if cur is not None:
        cur.close()
    if conn is not None:
        conn.close()

Table data_houston_tx_houses_merged_2023_1_8_12_2 was succesfully saved to the db real_estate_data_test


#### Read the table from the DB

In [6]:
# Open the connection with keyword arguments (no hand-built DSN string, so a
# password containing spaces or quotes cannot break it).
conn = psycopg2.connect(host=hostname,
                        dbname=database,
                        user=username,
                        password=pwd,
                        port=port_id)
try:
    # Quote the table name as an identifier and render the statement to plain text for pandas
    sql = psycopg2.sql.SQL("select * from {};").format(
        psycopg2.sql.Identifier(table_name)
    ).as_string(conn)
    db2df = pd.read_sql_query(sql, conn)
finally:
    # psycopg2's `with connection:` only ends the transaction and leaves the
    # connection open, so close it explicitly.
    conn.close()

db2df.head(3)

Unnamed: 0,zid,state,city,address,lat,lng,price,image,bedrooms,bathrooms,lotarea,constructedarea,zipcode,countyid,taxrate,hasgarage,haspool,hascooling,hasview,yearbuilt,stories,parkingspaces,annualhoi,annualhoa,schoolelemrating,schoolelemdist,schoolmidrating,schoolmiddist,schoolhighrating,schoolhighdist,priceincreased,pricediff,days,lotareasqft,avgschoolrating,avgschooldist
0,27541924,TX,Houston,"6602 Indian Lake Dr, Missouri City, TX 77489",29.6036,-95.485405,276000,https://photos.zillowstatic.com/fp/bc5a7196226...,3,2,8764.272,3612.0,77489,2698,2.41,1,0,1,0,1971,2,2,1159,$400 annually,6,0.5,3,1.3,2,1.1,1,31000.0,1213,8764.272,3.666667,0.966667
1,27542111,TX,Houston,"6727 Castleview Ln, Missouri City, TX 77489",29.600388,-95.48821,220000,https://photos.zillowstatic.com/fp/1336380a837...,3,2,7884.36,1270.0,77489,2698,2.41,1,0,1,0,1972,1,1,924,$144 annually,6,0.3,3,1.3,2,1.0,1,65100.0,503,7884.36,3.666667,0.866667
2,27542455,TX,Houston,"15902 Ridgerock Rd, Missouri City, TX 77489",29.595436,-95.48588,230000,https://photos.zillowstatic.com/fp/5603ce32232...,4,2,7148.196,1765.0,77489,2698,2.41,1,0,1,0,1974,1,2,966,$250 annually,6,0.3,3,1.0,2,0.6,1,115275.0,5140,7148.196,3.666667,0.633333
