In [1]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

---

## Data merging and cleaning

In [2]:
# Load the four source CSV files from the Resources folder, one DataFrame each.

# Housing records
housing_data = Path("Resources/housing_data.csv")
housing_df = pd.read_csv(housing_data)

# Income figures
income_data = Path("Resources/income_data.csv")
income_df = pd.read_csv(income_data)

# Crime figures
merged_crime_data = Path("Resources/merged_crime_data.csv")
crime_df = pd.read_csv(merged_crime_data)

# US zipcode lookup
zipcode_data = Path("Resources/us_city_zipcode_data.csv")
zipcode_df = pd.read_csv(zipcode_data)


In [3]:
# Print each frame's column labels (housing, income, crime, zipcode, in that
# order) to check which ones carry a 'zipcode' column usable as a merge key.
for frame in (housing_df, income_df, crime_df, zipcode_df):
    print(frame.columns)

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype='object')
Index(['state', 'zipcode', 'total_pop', 'total_income', 'country',
       'avg_income'],
      dtype='object')
Index(['states', 'cities', 'population', 'violent_crime', 'robbery',
       'prop_crime', 'burglary', 'vehicle_theft', 'total_crime',
       'tot_violent_crime', 'tot_prop_crim', 'arson'],
      dtype='object')
Index(['country code', 'postal code', 'place name', 'admin name1',
       'admin code1', 'admin name2', 'admin code2', 'latitude', 'longitude'],
      dtype='object')


In [4]:
# Remove the zipcode_df columns that are not needed for the merge into the
# main data; a new frame is returned and zipcode_df itself is left untouched.
zipcode_df_drop = zipcode_df.drop(
    ['country code', 'admin code1', 'admin name1', 'admin name2',
     'admin code2', 'latitude', 'longitude'],
    axis="columns",
)

zipcode_df_drop.head()

Unnamed: 0,postal code,place name
0,99547,Atka
1,99660,Saint Paul Island
2,99509,Anchorage
3,99523,Anchorage
4,99524,Anchorage


In [5]:
# Rename the zipcode lookup columns: 'postal code' becomes 'zipcode' (the
# column name the housing and income frames use) and 'place name' becomes 'city'.
zipcode_renamed_df = zipcode_df_drop.rename(
    {"postal code": "zipcode", "place name": "city"},
    axis="columns",
)
zipcode_renamed_df.head()

Unnamed: 0,zipcode,city
0,99547,Atka
1,99660,Saint Paul Island
2,99509,Anchorage
3,99523,Anchorage
4,99524,Anchorage


In [6]:
# Left-join the income data onto the housing data by 'zipcode'
# (every housing row is kept, matched or not).
housing_income_df = pd.merge(housing_df, income_df, on='zipcode', how='left')

# Left-join the zipcode -> city lookup onto that result, again by 'zipcode'.
housing_income_with_city_df = pd.merge(
    housing_income_df,
    zipcode_renamed_df,
    how='left',
    on='zipcode',
)

# Preview the combined frame
housing_income_with_city_df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,lat,long,sqft_living15,sqft_lot15,state,total_pop,total_income,country,avg_income,city
0,7229300521,20141013T000000,231300.0,2,1.0,1180,5650,1.0,0,0,...,47.5112,-122.257,1340,5650,WA,13220,899023,USA,68004.765507,Seattle
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,47.721,-122.319,1690,7639,WA,21760,1937898,USA,89057.8125,Seattle
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,47.7379,-122.233,2720,8062,WA,11700,1397727,USA,119463.846154,Kenmore
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,47.5208,-122.393,1360,5000,WA,8840,1260010,USA,142535.067873,Seattle
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,47.6168,-122.045,1800,7503,WA,12680,2992892,USA,236032.492114,Sammamish


In [7]:
# Drop the crime columns that are not needed downstream.
crime_dropped_df = crime_df.drop(
    ['total_crime', 'tot_violent_crime', 'tot_prop_crim', 'arson'],
    axis="columns",
)

# Rename the plural 'states' / 'cities' columns to singular 'state' / 'city'.
crime_renamed_df = crime_dropped_df.rename(
    {"states": "state", "cities": "city"},
    axis="columns",
)
crime_renamed_df.head()

Unnamed: 0,state,city,population,violent_crime,robbery,prop_crime,burglary,vehicle_theft
0,Pennsylvania,"Abington Township, Montgomery County",55731,197.4,70.0,1979.1,296.1,32.3
1,Oregon,Albany,51084,86.1,45.0,3092.9,438.5,184.0
2,Louisiana,Alexandria,48449,1682.2,293.1,7492.4,2010.4,379.8
3,California,Aliso Viejo,48999,87.8,12.2,847.0,208.2,26.5
4,Florida,Altamonte Springs,42296,335.7,82.8,3057.0,427.9,165.5


In [8]:
# Print the column labels of the two frames about to be merged
# (main housing/income/city data first, then the crime data).
for frame in (housing_income_with_city_df, crime_renamed_df):
    print(frame.columns)

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15', 'state', 'total_pop',
       'total_income', 'country', 'avg_income', 'city'],
      dtype='object')
Index(['state', 'city', 'population', 'violent_crime', 'robbery', 'prop_crime',
       'burglary', 'vehicle_theft'],
      dtype='object')


In [9]:
# Drop the columns of the main data that are not needed before merging it
# with the crime data. This includes 'state'; the crime frame has its own
# 'state' column.
housing_income_with_city_drop_df = housing_income_with_city_df.drop(
    ['id', 'date', 'grade',
     'sqft_above', 'sqft_basement', 'yr_renovated',
     'lat', 'long', 'sqft_living15', 'sqft_lot15',
     'total_income', 'state', 'country'],
    axis="columns",
)
housing_income_with_city_drop_df.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,yr_built,zipcode,total_pop,avg_income,city
0,231300.0,2,1.0,1180,5650,1.0,0,0,3,1955,98178,13220,68004.765507,Seattle
1,538000.0,3,2.25,2570,7242,2.0,0,0,3,1951,98125,21760,89057.8125,Seattle
2,180000.0,2,1.0,770,10000,1.0,0,0,3,1933,98028,11700,119463.846154,Kenmore
3,604000.0,4,3.0,1960,5000,1.0,0,0,5,1965,98136,8840,142535.067873,Seattle
4,510000.0,3,2.0,1680,8080,1.0,0,0,3,1987,98074,12680,236032.492114,Sammamish


In [10]:
# List the distinct city names in the main housing data, for comparison
# against the city names in the crime data.
pd.unique(housing_income_with_city_drop_df['city'])

array(['Seattle', 'Kenmore', 'Sammamish', 'Redmond', 'Federal Way',
       'Maple Valley', 'Bellevue', 'Duvall', 'Auburn', 'Mercer Island',
       'Kent', 'Issaquah', 'Renton', 'Vashon', 'Kirkland',
       'Black Diamond', 'North Bend', 'Woodinville', 'Snoqualmie',
       'Enumclaw', 'Fall City', 'Bothell', 'Carnation', 'Medina'],
      dtype=object)

In [11]:
# List the distinct city names in the crime data.
pd.unique(crime_renamed_df['city'])

array(['Abington Township, Montgomery County', 'Albany', 'Alexandria',
       'Aliso Viejo', 'Altamonte Springs', 'Altoona', 'Ames', 'Anderson',
       'Ankeny', 'Apopka', 'Apple Valley', 'Arcadia', 'Arlington',
       'Attleboro', 'Auburn', 'Azusa', 'Barnstable', 'Bartlett',
       'Beavercreek', 'Bedford', 'Bell Gardens', 'Bellevue', 'Berwyn',
       'Beverly', 'Billerica', 'Biloxi', 'Binghamton', 'Blacksburg',
       'Blaine', 'Bloomfield Township', 'Bloomfield', 'Blue Springs',
       'Boardman', 'Bountiful', 'Bowie', 'Bowling Green', 'Bradenton',
       'Brea', 'Brentwood', 'Bridgewater Township', 'Bristol Township',
       'Brookline', 'Broomfield', 'Buckeye', 'Buffalo Grove',
       'Bullhead City', 'Burlington', 'Caldwell', 'Campbell',
       'Carol Stream', 'Casa Grande', 'Casper', 'Castle Rock',
       'Cathedral City', 'Cedar Hill', 'Cedar Park', 'Ceres', 'Cerritos',
       'Chapel Hill', 'Charlottesville', 'Chesterfield Township',
       'Chesterfield', 'Chicopee', 'Clevela

In [14]:
# Merge the main housing data with crime data.
#
# The join key is the city name alone: the housing frame no longer has a
# 'state' column at this point, while crime_renamed_df lists cities from many
# states. A same-named city in another state would therefore also match,
# duplicating housing rows and attaching another city's crime figures.
# Restrict the crime data to the housing data's state before joining.
# NOTE(review): the housing rows previewed earlier are all in Washington ('WA');
# confirm that holds for the whole dataset.
HOUSING_STATE = "Washington"  # spelled as in the crime data's 'state' column
crime_in_state_df = crime_renamed_df[
    crime_renamed_df["state"].str.strip() == HOUSING_STATE
]

# Inner join: housing rows whose city has no crime record are dropped.
housing_merge_df = housing_income_with_city_drop_df.merge(
    crime_in_state_df, how="inner", on="city"
)
housing_merge_df.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,yr_built,...,total_pop,avg_income,city,state,population,violent_crime,robbery,prop_crime,burglary,vehicle_theft
0,231300.0,2,1.0,1180,5650,1.0,0,0,3,1955,...,13220,68004.765507,Seattle,Washington,721365,,210.02,,1081.98,503.21
1,538000.0,3,2.25,2570,7242,2.0,0,0,3,1951,...,21760,89057.8125,Seattle,Washington,721365,,210.02,,1081.98,503.21
2,604000.0,4,3.0,1960,5000,1.0,0,0,5,1965,...,8840,142535.067873,Seattle,Washington,721365,,210.02,,1081.98,503.21
3,510000.0,3,2.0,1680,8080,1.0,0,0,3,1987,...,12680,236032.492114,Sammamish,Washington,47158,33.9,2.1,727.3,231.1,27.6
4,1225000.0,4,4.5,5420,101930,1.0,0,0,3,2001,...,10030,207522.133599,Redmond,Washington,55770,64.6,19.7,2137.3,308.4,87.9


In [16]:
# Summarise the merged frame: row count, per-column non-null counts and dtypes.
# Use it to spot columns with missing values or an unexpected 'object' dtype
# that need cleaning before further analysis.
housing_merge_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19950 entries, 0 to 19949
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   price          19950 non-null  float64
 1   bedrooms       19950 non-null  int64  
 2   bathrooms      19950 non-null  float64
 3   sqft_living    19950 non-null  int64  
 4   sqft_lot       19950 non-null  int64  
 5   floors         19950 non-null  float64
 6   waterfront     19950 non-null  int64  
 7   view           19950 non-null  int64  
 8   condition      19950 non-null  int64  
 9   yr_built       19950 non-null  int64  
 10  zipcode        19950 non-null  int64  
 11  total_pop      19950 non-null  int64  
 12  avg_income     19950 non-null  float64
 13  city           19950 non-null  object 
 14  state          19950 non-null  object 
 15  population     19950 non-null  object 
 16  violent_crime  10973 non-null  object 
 17  robbery        19950 non-null  float64
 18  prop_c