In [5]:
import reader
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge, LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ParameterGrid

#### Loading the dataset

In [None]:
data=reader.get_all_data()
data.head()

#### Getting features and label and performing few transformations

In [11]:
from collections import OrderedDict
class Row(object):
    
    def __init__(self):
        self.education_type = None
        self.income_type = None
        self.crime_regularization = None
        self.LinearRegMSE = None
        self.RidgeMSE = None
        self.DecisionTreeMSE = None
        self.KNeighbourMSE = None
        self.SVR_MSE = None
        self.RandomForestMSE = None
        self.BoostingMSE = None

    def toDict(self):
        return {'education_type' : self.education_type,
                'income_type' : self.income_type,
                'crime_regularization': self.crime_regularization,
                'LinearRegMSE':  self.LinearRegMSE,
                'RidgeMSE' : self.RidgeMSE,
                'DecisionTreeMSE' : self.DecisionTreeMSE,
                'KNeighbourMSE': self.KNeighbourMSE,
                'SVR_MSE' : self.SVR_MSE,
                'RandomForestMSE': self.RandomForestMSE,
                'BoostingMSE' :self.BoostingMSE
               }

#### Applying models

In [12]:
import warnings
warnings.filterwarnings("ignore")

param_grid = {"education_type" :["dropout", "degree"], "income_type" :["median", "deviation"], "crime_type":["log", "sqrt_log"]}
result = pd.DataFrame()

for param in list(ParameterGrid(param_grid)):
    row = Row()
    row.education_type = "High School Dropout Percent" if param['education_type'] == 'dropout' else "Percent with Any Degree"
    row.income_type = "Median Income" if param['income_type'] == "median" else "Income Standard Deviation"
    row.crime_regularization = "Square Root of Log" if param['crime_type'] == 'log' else "Fourth Root of Log"
    X = reader.get_features(param['education_type'], param['income_type'])
    y = reader.get_label('total', param['crime_type'])
    y=np.power(y,1/2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    
    
    ## Scale input data
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train.to_numpy())
    X_test = scaler.fit_transform(X_test.to_numpy())
    
    ## Liner Regression
    linreg = LinearRegression()
    linreg.fit(X_train, y_train)
    y_pred=linreg.predict(X_test)
    row.LinearRegMSE = metrics.mean_squared_error(y_test, y_pred)
    
    ## Ridge Regression
    ridgereg = Ridge(alpha=1.0)
    ridgereg=ridgereg.fit(X_train,y_train)
    y_pred=ridgereg.predict(X_test)
    row.RidgeMSE = metrics.mean_squared_error(y_test, y_pred)
    
    ## Decision Tree
    regr = DecisionTreeRegressor(max_depth=2)
    regr.fit(X_train, y_train)
    y_pred = regr.predict(X_test)
    row.DecisionTreeMSE = metrics.mean_squared_error(y_test, y_pred)
    
    ## Random Forest Tree
    regr = RandomForestRegressor(max_depth=2)
    regr.fit(X_train, y_train)
    y_pred = regr.predict(X_test)
    row.RandomForestMSE = metrics.mean_squared_error(y_test, y_pred)
    
    ### Boosting
    params = {'n_estimators': 100, 'max_depth': 2}
    clf = GradientBoostingRegressor(**params)
    clf.fit(X_train, y_train)
    row.BoostingMSE = metrics.mean_squared_error(y_test, y_pred)
    
    ### KNN
    neigh = KNeighborsRegressor(n_neighbors=3)
    neigh.fit(X_train, y_train) 
    y_pred=neigh.predict(X_test)
    row.KNeighbourMSE = metrics.mean_squared_error(y_test, y_pred)
    
    ### SVR
    svr = SVR(gamma='auto')
    svr = svr.fit(X_train, y_train.values.ravel())
    y_pred=svr.predict(X_test)
    row.SVR_MSE = metrics.mean_squared_error(y_test, y_pred)
   
    
    result = result.append(row.toDict(), ignore_index=True)

result


Unnamed: 0,BoostingMSE,DecisionTreeMSE,KNeighbourMSE,LinearRegMSE,RandomForestMSE,RidgeMSE,SVR_MSE,crime_regularization,education_type,income_type
0,0.136757,0.142844,0.187286,0.135341,0.136757,0.135341,0.133981,Square Root of Log,High School Dropout Percent,Median Income
1,0.140936,0.150008,0.196002,0.145034,0.140936,0.145033,0.140423,Square Root of Log,High School Dropout Percent,Income Standard Deviation
2,0.138107,0.141787,0.188993,0.134803,0.138107,0.134802,0.142382,Square Root of Log,Percent with Any Degree,Median Income
3,0.137959,0.140847,0.189016,0.137201,0.137959,0.137202,0.136058,Square Root of Log,Percent with Any Degree,Income Standard Deviation
4,0.020918,0.022114,0.029812,0.020799,0.020918,0.020799,0.020618,Fourth Root of Log,High School Dropout Percent,Median Income
5,0.022135,0.021925,0.030625,0.022015,0.022135,0.022014,0.021241,Fourth Root of Log,High School Dropout Percent,Income Standard Deviation
6,0.02131,0.021721,0.029206,0.02092,0.02131,0.02092,0.021868,Fourth Root of Log,Percent with Any Degree,Median Income
7,0.021465,0.0217,0.030082,0.021203,0.021465,0.021203,0.021044,Fourth Root of Log,Percent with Any Degree,Income Standard Deviation


#### SVR Support Vectors

In [14]:
X = reader.get_features("dropout", "median")
y = reader.get_label('total', "sqrt_log")
y=np.power(y,1/2)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train.to_numpy())
X_test = scaler.fit_transform(X_test.to_numpy())


svr = SVR(gamma='auto')
svr = svr.fit(X_train, y_train.values.ravel())
y_pred=svr.predict(X_test)

print(metrics.mean_squared_error(y_test, y_pred))
supportVectors = len(svr.support_vectors_)
SV=svr.support_vectors_
print("Number of support vectors : ",supportVectors)
print(" ")
print("Support Vectors : \n", SV)

0.02061790664666147
Number of support vectors :  745
 
Support Vectors : 
 [[ 0.02521343 -0.20740018]
 [ 0.59036416 -1.07106894]
 [-1.57823747  0.33439075]
 ...
 [-0.85537026  0.08756263]
 [ 0.49836288 -1.34122952]
 [ 2.49610499 -0.26986506]]


#### Linear Regression

In [15]:
X = reader.get_features("dropout", "median")
y = reader.get_label('total', "sqrt_log")
y=np.power(y,1/2)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train.to_numpy())
X_test = scaler.fit_transform(X_test.to_numpy())

linreg = LinearRegression()
linreg.fit(X_train, y_train)
y_pred=linreg.predict(X_test)
print(metrics.mean_squared_error(y_test, y_pred))

linreg.coef_

0.020799078988974547


array([[0.02535872, 0.04726192]])

#### Linear SVR

In [16]:
from sklearn.svm import LinearSVR

X = reader.get_features("dropout", "median")
y = reader.get_label('total', "sqrt_log")
y=np.power(y,1/2)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train.to_numpy())
X_test = scaler.fit_transform(X_test.to_numpy())


svr = LinearSVR(C=0.0005)
svr = svr.fit(X_train, y_train.values.ravel())
y_pred=svr.predict(X_test)

print(metrics.mean_squared_error(y_test, y_pred))
SV=svr.coef_
print("Weights : \n", SV)

0.23326286901169918
Number of support vectors :  2
 
Support Vectors : 
 [0.00183921 0.0076809 ]
