In [68]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

---

## Data merging and cleaning

In [70]:
# Read the CSV housing_data file from the Resources folder into a Pandas DataFrame
housing_data = Path("Resources/housing_data.csv")
income_data = Path("Resources/income_data.csv")
merged_crime_data = Path("Resources/merged_crime_data.csv")
zipcode_data = Path("Resources/us_city_zipcode_data.csv")
housing_df = pd.read_csv(housing_data)
income_df = pd.read_csv(income_data)
crime_df = pd.read_csv(merged_crime_data)
zipcode_df = pd.read_csv(zipcode_data)


In [71]:
# View the columns of four dataframes to check if 'zipcode' exists for the merging
print(housing_df.columns)
print(income_df.columns)
print(crime_df.columns)
print(zipcode_df.columns)

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype='object')
Index(['state', 'zipcode', 'total_pop', 'total_income', 'country',
       'avg_income'],
      dtype='object')
Index(['states', 'cities', 'population', 'violent_crime', 'robbery',
       'prop_crime', 'burglary', 'vehicle_theft', 'total_crime',
       'tot_violent_crime', 'tot_prop_crim', 'arson'],
      dtype='object')
Index(['country code', 'postal code', 'place name', 'admin name1',
       'admin code1', 'admin name2', 'admin code2', 'latitude', 'longitude'],
      dtype='object')


In [73]:
# Drop unecessary columns in zipcode_df for the merge to main data
zipcode_df_drop = zipcode_df.drop(columns=['country code',
       'admin code1', 'admin name1', 'admin name2', 'admin code2', 'latitude', 'longitude'])

zipcode_df_drop.head()

Unnamed: 0,postal code,place name
0,99547,Atka
1,99660,Saint Paul Island
2,99509,Anchorage
3,99523,Anchorage
4,99524,Anchorage


In [74]:
# Rename colums in zipcode
zipcode_renamed_df = zipcode_df_drop.rename(columns= {
    "postal code": "zipcode",
    "place name": "city"
}
)
zipcode_renamed_df.head()

Unnamed: 0,zipcode,city
0,99547,Atka
1,99660,Saint Paul Island
2,99509,Anchorage
3,99523,Anchorage
4,99524,Anchorage


In [75]:
# Merge housing, income data
housing_income_df = housing_df.merge(income_df, how='left', on = 'zipcode')

# Merge the result with zipcode_renamed_df on 'zipcode'
housing_income_with_city_df = housing_income_df.merge(zipcode_renamed_df, on='zipcode', how='left')

# Show the result
housing_income_with_city_df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,state,total_pop,total_income,country,avg_income,city
0,7229300521,20141013T000000,231300.0,2,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650,WA,13220,899023,USA,68004.765507,Seattle
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639,WA,21760,1937898,USA,89057.8125,Seattle
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062,WA,11700,1397727,USA,119463.846154,Kenmore
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000,WA,8840,1260010,USA,142535.067873,Seattle
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503,WA,12680,2992892,USA,236032.492114,Sammamish


In [76]:
# Crop unesscessary columns in crime data
crime_dropped_df = crime_df.drop(columns=['total_crime',
       'tot_violent_crime', 'tot_prop_crim', 'arson'])

# Rename columns
crime_renamed_df = crime_dropped_df.rename(columns= {
    "states": "state",
    "cities": "city"
}
)
crime_renamed_df.head()

Unnamed: 0,state,city,population,violent_crime,robbery,prop_crime,burglary,vehicle_theft
0,Pennsylvania,"Abington Township, Montgomery County",55731,197.4,70.0,1979.1,296.1,32.3
1,Oregon,Albany,51084,86.1,45.0,3092.9,438.5,184.0
2,Louisiana,Alexandria,48449,1682.2,293.1,7492.4,2010.4,379.8
3,California,Aliso Viejo,48999,87.8,12.2,847.0,208.2,26.5
4,Florida,Altamonte Springs,42296,335.7,82.8,3057.0,427.9,165.5


In [77]:
# Show columns of the two dats
print(housing_income_with_city_df.columns)
print(crime_renamed_df.columns)

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15', 'state', 'total_pop',
       'total_income', 'country', 'avg_income', 'city'],
      dtype='object')
Index(['state', 'city', 'population', 'violent_crime', 'robbery', 'prop_crime',
       'burglary', 'vehicle_theft'],
      dtype='object')


In [78]:
# Drop unessary columns of the main data before merging
housing_income_with_city_drop_df = housing_income_with_city_df.drop(columns=['id', 'date', 'grade',
       'sqft_above', 'sqft_basement', 'yr_renovated',
       'lat', 'long', 'sqft_living15', 'sqft_lot15', 
       'total_income', 'state', 'country'])
housing_income_with_city_drop_df.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,yr_built,zipcode,total_pop,avg_income,city
0,231300.0,2,1.0,1180,5650,1.0,0,0,3,1955,98178,13220,68004.765507,Seattle
1,538000.0,3,2.25,2570,7242,2.0,0,0,3,1951,98125,21760,89057.8125,Seattle
2,180000.0,2,1.0,770,10000,1.0,0,0,3,1933,98028,11700,119463.846154,Kenmore
3,604000.0,4,3.0,1960,5000,1.0,0,0,5,1965,98136,8840,142535.067873,Seattle
4,510000.0,3,2.0,1680,8080,1.0,0,0,3,1987,98074,12680,236032.492114,Sammamish


In [79]:
# Check unique value of city in both data
housing_income_with_city_drop_df['city'].unique()

array(['Seattle', 'Kenmore', 'Sammamish', 'Redmond', 'Federal Way',
       'Maple Valley', 'Bellevue', 'Duvall', 'Auburn', 'Mercer Island',
       'Kent', 'Issaquah', 'Renton', 'Vashon', 'Kirkland',
       'Black Diamond', 'North Bend', 'Woodinville', 'Snoqualmie',
       'Enumclaw', 'Fall City', 'Bothell', 'Carnation', 'Medina'],
      dtype=object)

In [80]:
crime_renamed_df['city'].unique()

array(['Abington Township, Montgomery County', 'Albany', 'Alexandria',
       'Aliso Viejo', 'Altamonte Springs', 'Altoona', 'Ames', 'Anderson',
       'Ankeny', 'Apopka', 'Apple Valley', 'Arcadia', 'Arlington',
       'Attleboro', 'Auburn', 'Azusa', 'Barnstable', 'Bartlett',
       'Beavercreek', 'Bedford', 'Bell Gardens', 'Bellevue', 'Berwyn',
       'Beverly', 'Billerica', 'Biloxi', 'Binghamton', 'Blacksburg',
       'Blaine', 'Bloomfield Township', 'Bloomfield', 'Blue Springs',
       'Boardman', 'Bountiful', 'Bowie', 'Bowling Green', 'Bradenton',
       'Brea', 'Brentwood', 'Bridgewater Township', 'Bristol Township',
       'Brookline', 'Broomfield', 'Buckeye', 'Buffalo Grove',
       'Bullhead City', 'Burlington', 'Caldwell', 'Campbell',
       'Carol Stream', 'Casa Grande', 'Casper', 'Castle Rock',
       'Cathedral City', 'Cedar Hill', 'Cedar Park', 'Ceres', 'Cerritos',
       'Chapel Hill', 'Charlottesville', 'Chesterfield Township',
       'Chesterfield', 'Chicopee', 'Clevela

In [81]:
# Merge the main housing data with crime data
housing_merge_df = housing_income_with_city_drop_df.merge(crime_renamed_df, how="inner", on="city")
print(housing_merge_df.to_string())

           price  bedrooms  bathrooms  sqft_living  sqft_lot  floors  waterfront  view  condition  yr_built  zipcode  total_pop     avg_income         city        state population violent_crime  robbery prop_crime  burglary vehicle_theft
0       231300.0         2       1.00         1180      5650     1.0           0     0          3      1955    98178      13220   68004.765507      Seattle   Washington    721,365           NaN   210.02        NaN  1,081.98        503.21
1       538000.0         3       2.25         2570      7242     2.0           0     0          3      1951    98125      21760   89057.812500      Seattle   Washington    721,365           NaN   210.02        NaN  1,081.98        503.21
2       604000.0         4       3.00         1960      5000     1.0           0     0          5      1965    98136       8840  142535.067873      Seattle   Washington    721,365           NaN   210.02        NaN  1,081.98        503.21
3       510000.0         3       2.00         16

In [None]:
# Drop the non-beneficial columns.
drop_columns = ['id', 'date', 'condition', 'sqft_above', 'lat', 'long', 'sqft_basement', 'sqft_living15', 'sqft_lot15', 'waterfront', 'total_income', 'state', 'country']
housing_clean_df = df_merge.drop(columns=drop_columns)
housing_clean_df.head()

In [None]:
# Look at view value counts 
view_counts = housing_clean_df['view'].value_counts()
view_counts

In [None]:
# Look at price value counts 
price_counts = housing_clean_df['price'].value_counts()
price_counts

## Split the Data into Training and Testing Sets

### Create the labels set (`y`)  from the “price” column, and then create the features (`X`) DataFrame from the remaining columns.

In [None]:
# Separate the data into labels and features

# Separate the y variable, the labels
y = housing_clean_df["view"]

# Separate the X variable, the features
X = housing_clean_df.drop("view", axis=1)
feature_names = X.columns
X.head()

In [None]:
# Review the y variable Series
print(y[:-5])
print(y[:5])

In [None]:
# Review the y variable Series
y.info()

In [None]:
# Review the X variable DataFrame
X.describe()

### Split the data into training and testing datasets by using `train_test_split`.

In [None]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

---

## Machine Learning Model 1: Create a Logistic Regression Model with the Original Data

In [None]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
classifier = LogisticRegression(solver='lbfgs', random_state=1)

# Fit the model using training data
classifier.fit(X_train, y_train)

In [None]:
# Make a prediction using the testing data
predictions = classifier.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

In [None]:
# Generate a confusion matrix for the model
cm = confusion_matrix(y_test, predictions)
unique_classes = sorted(list(set(y_test)))
cm_df = pd.DataFrame(
    cm, index=unique_classes, columns=unique_classes
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [None]:
# Print the classification report for the model
print("** Logistic Regression Model")
print("* Confusion Matrix")
display(cm_df)
print(f"* Accuracy Score : {acc_score}")
print(f"""
* Classification Report""")
print(classification_report(y_test, predictions))

---

## Machine Learning Model 2: Decision Tree Model

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn import tree
# Creating StandardScaler instance
scaler = StandardScaler()

In [None]:
# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)

In [None]:
# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# Creating the decision tree classifier instance
model_tree = tree.DecisionTreeClassifier()

In [None]:
# Fitting the model
model_tree = model_tree.fit(X_train_scaled, y_train)

In [None]:
# Making predictions using the testing data
predictions_tree = model_tree.predict(X_test_scaled)

In [None]:
# Generate a confusion matrix for the model
cm_tree = confusion_matrix(y_test, predictions_tree)
cm_tree_df = pd.DataFrame(
    cm_tree, index=unique_classes, columns=unique_classes
)

# Calculating the accuracy score
acc_tree_score = accuracy_score(y_test, predictions_tree)

In [None]:
# Print the classification report for the model
print("** Decision Tree Model")
print("* Confusion Matrix")
display(cm_tree_df)
print(f"* Accuracy Score : {acc_tree_score}")
print(f"""
* Classification Report""")
print(classification_report(y_test, predictions_tree))

---

## Machine Learning Model 3: K-Nearest Neighbors (KNN) Model

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Instantiate the model with k = 3 neighbors
model_knn = KNeighborsClassifier(n_neighbors=3)

In [None]:
# Train the model
model_knn.fit(X_train_scaled, y_train)

In [None]:
# Create predictions
predictions_knn = model_knn.predict(X_test_scaled)

# Review the predictions
predictions_knn

In [None]:
# Generate a confusion matrix for the model
cm_knn = confusion_matrix(predictions_knn, y_test)
cm_knn_df = pd.DataFrame(
    cm_knn, index=unique_classes, columns=unique_classes
)

# Calculating the accuracy score
acc_knn_score = accuracy_score(y_test, predictions_knn)

In [None]:
# Print the classification report for the model
print("** K-Nearest Neighbors (KNN) Model")
print("* Confusion Matrix")
display(cm_knn_df)
print(f"* Accuracy Score : {acc_knn_score}")
print(f"""
* Classification Report""")
print(classification_report(y_test, predictions_knn))

---

## Machine Learning Model 4: Random Forest Model

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Create a random forest classifier
model_rf = RandomForestClassifier(n_estimators=500, random_state=78)

In [None]:
# Fitting the model
model_rf = model_rf.fit(X_train_scaled, y_train)

In [None]:
# Making predictions using the testing data
predictions_rf = model_rf.predict(X_test_scaled)

In [None]:
# Generate a confusion matrix for the model
cm_rf = confusion_matrix(predictions_knn, y_test)
cm_rf_df = pd.DataFrame(
    cm_rf, index=unique_classes, columns=unique_classes
)

# Calculating the accuracy score
acc_rf_score = accuracy_score(y_test, predictions_rf)

In [None]:
# Print the classification report for the model
print("** Random Forest Model")
print("* Confusion Matrix")
display(cm_rf_df)
print(f"* Accuracy Score : {acc_rf_score}")
print(f"""
* Classification Report""")
print(classification_report(y_test, predictions_rf))