In [None]:
import os
import sqlite3 as sql
from urllib.parse import quote_plus

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sqlalchemy import create_engine

## Importing data

In [None]:
# Open the census SQLite database and create a cursor for running queries against it.
# NOTE(review): sqlite3.connect creates an empty database file when the path does not
# exist, so a wrong path shows up later as missing tables rather than as an error here.
conn = sql.connect('../data/census.sqlite')
cur = conn.cursor()

In [None]:
#sqlite
# List the tables stored in the database, in alphabetical order.
# execute() hands back the cursor it ran on, so fetchall() can be chained to collect
# every returned row into 'available_tables'.
table_listing_sql = "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name;"
available_tables = cur.execute(table_listing_sql).fetchall()

In [None]:
#sqlite
print(available_tables)

In [None]:
def read_census_table(table_id):
    """Load every row of one table from the open census SQLite connection.

    Parameters
    ----------
    table_id : str
        Name of the table to read (e.g. 'B01001').

    Returns
    -------
    pandas.DataFrame
        The result of ``SELECT * FROM <table_id>;`` run on ``conn``.
    """
    # Table names cannot be bound as SQL parameters; table_id is only ever one of
    # the literal names passed below, never user input.
    return pd.read_sql(f'SELECT * FROM {table_id};', conn)


# One DataFrame per census table; the suffix after the table id names its content.
B01001_pop_age_sex = read_census_table('B01001')
B03002_pop_race_eth = read_census_table('B03002')
B11001_hhold_type = read_census_table('B11001')
B11002_pop_hhold_type = read_census_table('B11002')
B15002_sex_edu_att = read_census_table('B15002')
B17001_pov_status = read_census_table('B17001')
B19001_hhold_inc = read_census_table('B19001')
B19013_med_hhold_inc = read_census_table('B19013')
B19301_per_cap_inc = read_census_table('B19301')
B25024_types_struct = read_census_table('B25024')

In [None]:
# The census tables read above are now held in DataFrames, so release the cursor
# and the SQLite connection.
cur.close()
conn.close()

In [None]:
B01001_pop_age_sex.head()

In [None]:
B15002_sex_edu_att.head()

In [None]:
B19301_per_cap_inc.head()

In [None]:
B19013_med_hhold_inc.head()

In [None]:
#sqlalchemy
# Build the connection string for the PostgreSQL server.
# User, password, host and port are read from the standard libpq environment variables
# so that credentials need not be committed with the notebook; each fallback is the
# local default value this notebook used before.
database_name = 'housing'    # Fill this in with your housing database name on pgadmin

pg_user = os.environ.get('PGUSER', 'postgres')
pg_password = os.environ.get('PGPASSWORD', 'postgres')
pg_host = os.environ.get('PGHOST', 'localhost')
pg_port = os.environ.get('PGPORT', '5432')

# quote_plus escapes characters such as '@' or ':' that would otherwise break the URL.
connection_string = (
    f"postgresql://{quote_plus(pg_user)}:{quote_plus(pg_password)}"
    f"@{pg_host}:{pg_port}/{database_name}"
)

In [None]:
#sqlalchemy
engine = create_engine(connection_string)

In [None]:
#sqlalchemy
# Query text only: pd.read_sql in the next cell executes it. It is deliberately not run
# through engine.execute() here, which would fetch the table a second time, leave an
# unused open result, and fail on SQLAlchemy 2.0 where Engine.execute() was removed.
query = '''
SELECT *
FROM locations;
'''

In [None]:
#sqlalchemy
locations = pd.read_sql(query, con = engine)

In [None]:
locations.head()

In [None]:
barnes = pd.read_csv('../data/barnes.csv')

In [None]:
barnes

In [None]:
barnes.shape

In [None]:
lihtc = pd.read_csv('../data/LIHTC.csv')

In [None]:
pd.set_option('display.max_columns', None)
lihtc.head()

In [None]:
lihtc.loc[lihtc.FIPS2010 == 10202]

In [None]:
lihtc.FIPS2010.nunique()

In [None]:
police = pd.read_csv('../data/police_incidents.csv')

In [None]:
police.head()

In [None]:
#sqlalchemy
# Query text only: pd.read_sql in the next cell executes it. It is deliberately not run
# through engine.execute() here, which would fetch the table a second time, leave an
# unused open result, and fail on SQLAlchemy 2.0 where Engine.execute() was removed.
query = '''
SELECT *
FROM sales;
'''

In [None]:
sales = pd.read_sql(query, con = engine)

In [None]:
sales.head()

In [None]:
# apn and pin seem to correlate, owneraddress is not the lot address
sales.loc[sales.apn == '00100000100']

In [None]:
#sqlalchemy
# Query text only: pd.read_sql in the next cell executes it. It is deliberately not run
# through engine.execute() here, which would fetch the table a second time, leave an
# unused open result, and fail on SQLAlchemy 2.0 where Engine.execute() was removed.
query = '''
SELECT *
FROM details;
'''

In [None]:
details = pd.read_sql(query, con = engine)

In [None]:
pd.set_option('display.max_columns', None)
details.head()

In [None]:
# Query text only: pd.read_sql in the next cell executes it. It is deliberately not run
# through engine.execute() here, which would fetch the table a second time, leave an
# unused open result, and fail on SQLAlchemy 2.0 where Engine.execute() was removed.
query = '''
SELECT *
FROM properties;
'''

In [None]:
properties = pd.read_sql(query, con = engine)

In [None]:
properties.head()

In [None]:
# Query text only: pd.read_sql in the next cell executes it. It is deliberately not run
# through engine.execute() here, which would fetch the table a second time, leave an
# unused open result, and fail on SQLAlchemy 2.0 where Engine.execute() was removed.
query = '''
SELECT *
FROM assessment;
'''

In [None]:
assessment = pd.read_sql(query, con = engine)

In [None]:
assessment.head()

In [None]:
# Query text only: pd.read_sql in the next cell executes it. It is deliberately not run
# through engine.execute() here, which would fetch the table a second time, leave an
# unused open result, and fail on SQLAlchemy 2.0 where Engine.execute() was removed.
# NOTE(review): this SELECT repeats the properties query defined earlier in the notebook;
# `query` is re-set here because it was overwritten by the assessment query in between.
query = '''
SELECT *
FROM properties;
'''

In [None]:
properties = pd.read_sql(query, con = engine)

In [None]:
properties.head()

In [None]:
properties.shape

In [None]:
properties.tract.nunique()

In [None]:
# Mean sale price for each (tract, owndate) pair.
# TODO: extract the year, find the date of the actual sale (unclear what owndate
# represents), and use the tables with the full sales history rather than only the most
# recent sale (properties is presumably most-recent only — confirm).
properties.groupby(['tract', 'owndate'])['saleprice'].mean()

In [None]:
# Distinct (classdesc, ludesc) combinations across properties joined to assessment on apn.
# Query text only: pd.read_sql in the next cell executes it. It is deliberately not run
# through engine.execute() here, which would run the join a second time, leave an
# unused open result, and fail on SQLAlchemy 2.0 where Engine.execute() was removed.
query = '''
SELECT DISTINCT classdesc, ludesc
FROM properties INNER JOIN assessment USING (apn)
ORDER BY classdesc, ludesc;
'''

In [None]:
property_types = pd.read_sql(query, con = engine)

In [None]:
property_types.shape

In [None]:
property_types.head()

In [None]:
property_types.ludesc.unique()

## Filtered sales DataFrame prepared by Lewis

In [None]:
sales_clean = pd.read_csv('../data/sales_cleansed_data.csv')

In [None]:
sales_clean.head()

## determine if properties tract is the same as lihtc FIPS2010

In [None]:
properties.head()

In [None]:
lihtc.head()

In [None]:
lihtc.dtypes

In [None]:
# Cast PROJ_ZIP to str so it can later serve as the string merge key 'zip'.
# NOTE(review): this overwrites the column on the raw lihtc frame, and any missing zip
# becomes the text 'nan' — confirm that is acceptable.
lihtc['PROJ_ZIP'] = lihtc['PROJ_ZIP'].astype(str)

In [None]:
prop = properties[['tract', 'propaddr', 'propstreet', 'propzip', 'propcity']]

In [None]:
prop = prop.rename(columns = {'propzip': 'zip'})

In [None]:
lihtc_loc = lihtc[['PROJECT', 'PROJ_ADD', 'PROJ_ZIP', 'FIPS2010']]

In [None]:
lihtc_loc = lihtc_loc.rename(columns = {'PROJ_ZIP': 'zip', 'FIPS2010': 'tract'})

In [None]:
prop

In [None]:
# Keep only the final five characters of each tract code; this shortened tract is the
# key compared with the LIHTC tract (FIPS2010) when the two frames are merged.
prop = prop.assign(tract = prop['tract'].str.slice(start = -5))

In [None]:
prop.dtypes

In [None]:
prop.shape

In [None]:
lihtc_loc

In [None]:
lihtc_loc.dtypes

In [None]:
# Cast tract to str so it has the same type as the string tract in prop for the merge.
# NOTE(review): prop keeps the last five characters of its tract; if FIPS2010 is numeric,
# values shorter than five digits have lost their leading zeros and will not match —
# confirm against the data.
lihtc_loc['tract'] = lihtc_loc['tract'].astype(str)

In [None]:
lihtc_loc.shape

In [None]:
# Remove the trailing ".0" from each zip string (presumably left by casting a float
# PROJ_ZIP column to str — confirm against lihtc.dtypes).
# Matching the suffix explicitly, rather than slicing off the last two characters, keeps
# this cell safe to re-run and leaves values without the suffix (e.g. 'nan') untouched.
lihtc_loc = lihtc_loc.assign(zip = lihtc_loc['zip'].str.replace(r'\.0$', '', regex = True))

In [None]:
prop_lihtc = pd.merge(prop, lihtc_loc, on = ['tract', 'zip'], how = 'left')

In [None]:
# Surprising at first: the merged frame has far more rows than lihtc_loc.
# A left merge keeps every row of prop and repeats a prop row once for each lihtc_loc row
# with the same (tract, zip), so the result has at least as many rows as prop — its size
# is driven by prop, not by lihtc_loc.
prop_lihtc.shape

In [None]:
#may be easier to only use zip and tract for each df and see if they merge together smoothly
prop_lihtc

In [None]:
lihtc_loc2 = lihtc_loc[['zip', 'tract']]

In [None]:
lihtc_loc2.shape

In [None]:
# Keep one row per distinct (zip, tract) pair so repeated rows do not inflate the merge.
# Intent: use the merge to decide whether LIHTC FIPS2010 is the same identifier as the
# housing tract.
# NOTE(review): a left merge always returns at least every left row, so "merge is the
# size of the left df" does not by itself show that the pairs matched.
# drop_duplicates() compares all columns by default:
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.drop_duplicates.html
lihtc_loc2 = lihtc_loc2.drop_duplicates()

In [None]:
lihtc_loc2.shape

In [None]:
prop_2 = prop[['tract','zip']]

In [None]:
prop_2.shape

In [None]:
# Keep one row per distinct (tract, zip) pair so repeated rows do not inflate the merge.
# Intent: use the merge to decide whether LIHTC FIPS2010 is the same identifier as the
# housing tract.
# NOTE(review): a left merge always returns at least every left row, so "merge is the
# size of the left df" does not by itself show that the pairs matched.
prop_2 = prop_2.drop_duplicates()

In [None]:
prop_2.shape

In [None]:
# Open question: why do some tracts appear in multiple zip codes?
prop_2

In [None]:
# indicator=True adds a `_merge` column ('both' / 'left_only') recording whether each
# (tract, zip) pair from prop_2 was also found in lihtc_loc2. A left merge keeps every
# left row regardless, so without it the row count cannot show how many pairs matched.
prop_lihtc_2 = pd.merge(prop_2, lihtc_loc2, on = ['tract', 'zip'], how = 'left', indicator = True)

In [None]:
prop_lihtc_2.shape

In [None]:
# Working conclusion: LIHTC FIPS2010 is the same identifier as the housing tract.
# NOTE(review): a left merge returns every row of prop_2 whether or not it matched a row
# of lihtc_loc2, so the row count alone does not confirm this; count the actual matches
# (an inner merge, or merge(..., indicator=True)) before relying on it.
prop_lihtc_2
# Open question: why do some tracts appear in multiple zip codes?

In [None]:
prop_lihtc_2.tract.nunique()

In [None]:
dup_pl = prop_lihtc_2[prop_lihtc_2.duplicated('tract', keep = False)]

In [None]:
pd.set_option('display.max_rows', None)
dup_pl

In [None]:
dup_pl.dtypes

In [None]:
# Convert tract to a numeric dtype so the rows can be sorted numerically.
# A new frame is built with assign() instead of assigning into dup_pl, which is a
# filtered selection of prop_lihtc_2 and would trigger pandas' SettingWithCopyWarning.
dup_pl = dup_pl.assign(tract = pd.to_numeric(dup_pl['tract']))

In [None]:
dup_pl.sort_values('tract')

In [None]:
locations.head()

In [None]:
locations.apn.nunique()

In [None]:
locations.dtypes

In [None]:
sales_clean.head()

In [None]:
sales_clean.apn.nunique()

In [None]:
sales_clean.dtypes

In [None]:
sales.head()