# <center>Predicting The Housing Market</center>

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from matplotlib import style
style.use('fivethirtyeight')

from statistics import mean
from sklearn.model_selection import train_test_split
from sklearn import svm, preprocessing, metrics
# svm: support vector machine estimators
# preprocessing: feature scaling utilities; standardizing features (zero mean, unit variance) generally helps improve ML classification accuracy
# model_selection (formerly cross_validation): used to make the training and test sets

# Problem type: predicting a category (up or down) from labeled data with fewer than 100k samples, so start with a linear SVC (and try another classifier if that doesn't work)

### Display Correlation of all Economic Indicators
- HPI % change of all 50 states
- HPI % change of entire US
- 30 Year Mortgage Rate
- SP500
- GDP
- Unemployment Rate

In [2]:
# Load the pickled frame of HPI and economic-indicator columns.
# NOTE: unpickling executes arbitrary code, so only read pickles from a trusted source.
housing_data = pd.read_pickle('HPI.pickle')

# Print the pairwise correlation matrix first, then the raw frame itself.
print(housing_data.corr(), housing_data, sep='\n')

                         AL        AK        AZ        AR        CA        CO  \
AL                 1.000000  0.953055  0.943167  0.995648  0.945827  0.953546   
AK                 0.953055  1.000000  0.921707  0.972279  0.930804  0.955068   
AZ                 0.943167  0.921707  1.000000  0.943109  0.980229  0.927803   
AR                 0.995648  0.972279  0.943109  1.000000  0.949950  0.965465   
CA                 0.945827  0.930804  0.980229  0.949950  1.000000  0.946625   
CO                 0.953546  0.955068  0.927803  0.965465  0.946625  1.000000   
CT                 0.949618  0.883130  0.908444  0.940044  0.926301  0.858594   
DE                 0.983000  0.940160  0.946743  0.977479  0.960965  0.917398   
FL                 0.935939  0.913918  0.995166  0.935899  0.988249  0.922831   
GA                 0.978678  0.913759  0.956739  0.971133  0.959304  0.958139   
HI                 0.948447  0.952775  0.930594  0.955454  0.959957  0.938196   
ID                 0.978576 

### Create Labels
- The labels are assigned to the feature "sets" (current HPI, GDP, etc)
- Label is a "1" when the next period's US HPI value is greater than the current one ("HPI Increased"), or a "0" otherwise ("HPI did not Increase")

In [3]:
def create_labels(cur_hpi, fut_hpi):
    """Return the binary class label for one observation.

    Args:
        cur_hpi: The current value.
        fut_hpi: The future value to compare against ``cur_hpi``.

    Returns:
        1 if ``fut_hpi`` is strictly greater than ``cur_hpi``, otherwise 0
        (ties, and comparisons involving NaN, yield 0).
    """
    return 1 if fut_hpi > cur_hpi else 0

### Normalize data

In [4]:
# Replace every column with its percent change from the previous row (month),
# which puts the differently scaled indicators on a comparable footing.
housing_data = housing_data.pct_change()

# Clean the frame: turn -inf/+inf into NaN so the dropna below removes them
# together with the ordinary missing values.
housing_data.replace([np.inf, -np.inf], np.nan, inplace=True)

# Next row's US value aligned with the current row. The last row has no
# successor, so it becomes NaN here and is removed by dropna.
housing_data['US_HPI_future'] = housing_data['United States'].shift(-1)

housing_data.dropna(inplace=True)

# 1 when the next US value is greater than the current one, otherwise 0.
# NOTE(review): both columns hold percent changes at this point, so the label
# compares growth rates rather than HPI levels -- confirm this is intended.
housing_data['label'] = list(map(create_labels, housing_data['United States'], housing_data['US_HPI_future']))


# X is our feature matrix: every column except the label and the look-ahead
# column it was derived from. `columns=` is used because passing the axis
# positionally (`drop(labels, 1)`) is rejected by pandas 2.x.
X = np.array(housing_data.drop(columns=['label', 'US_HPI_future']))
print(X)

# Standardize each feature column to zero mean and unit variance
# (this does not bound the values to a fixed -1..+1 range).
X = preprocessing.scale(X)
# classification target (label)
y = np.array(housing_data['label'])

# Train on 80% of the data and test on the remaining 20%. A fixed random_state
# makes the shuffle, and therefore the reported accuracy, reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

[[ 0.3516155   0.37702811  0.13268818 ...  0.28571429  0.
   0.44100323]
 [ 0.09695445  0.29402538 -0.04647298 ... -0.22222222  0.
   0.10060654]
 [-0.10972323  0.22933377 -0.22997208 ... -0.28571429  1.62204438
  -0.05259375]
 ...
 [ 0.0175453   0.00783857  0.01177506 ...  0.02439024  0.02012908
  -0.02532518]
 [ 0.01644411  0.00777907  0.01013488 ...  0.02380952  0.
   0.0159117 ]
 [ 0.01225673  0.00549528  0.00970873 ... -0.04651163  0.
   0.02062149]]


### Specify and Train Classifier

In [5]:
# Support vector classifier with a linear kernel; all other settings are left at their defaults.
clf = svm.SVC(kernel='linear')

# Fit on the training split. Kept as the cell's last expression so the
# fitted estimator is echoed as the cell output.
clf.fit(X=X_train, y=y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

### Call Predict
`clf.predict` returns an array of predicted labels for the test data set.


In [6]:
# Predicted class label for every row of the held-out test split.
y_predict = clf.predict(X=X_test)

### Test Classifier for Accuracy Percentage

In [7]:
# Fraction of test rows whose predicted label matches the true label.
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_predict)
print(accuracy)

0.7596153846153846
