In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import r2_score

In [2]:
train_data = pd.read_csv("Standard Metropolitan Areas Data - train_data.csv")
train_data.head()

Unnamed: 0,land_area,percent_city,percent_senior,physicians,hospital_beds,graduates,labor,income,region,crime_rate
0,1384,78.1,12.3,25627,69678,50.1,4083.9,72100,1,75.55
1,3719,43.9,9.4,13326,43292,53.9,3305.9,54542,2,56.03
2,3553,37.4,10.7,9724,33731,50.6,2066.3,33216,1,41.32
3,3916,29.9,8.8,6402,24167,52.2,1966.7,32906,2,67.38
4,2480,31.5,10.5,8502,16751,66.1,1514.5,26573,4,80.19


In [3]:
train_data.shape

(99, 10)

In [4]:
train_data.isnull().sum()

land_area         0
percent_city      0
percent_senior    0
physicians        0
hospital_beds     0
graduates         0
labor             0
income            0
region            0
crime_rate        0
dtype: int64

In [5]:
# creating one hot encoder object with categorical feature 0 
# indicating the first column 
# onehotencoder = OneHotEncoder() 
# data = onehotencoder.fit_transform(train_data).toarray()

encoded_columns = pd.get_dummies(train_data['region'])
train_data = train_data.join(encoded_columns).drop('region', axis=1)

In [6]:
train_data

Unnamed: 0,land_area,percent_city,percent_senior,physicians,hospital_beds,graduates,labor,income,crime_rate,1,2,3,4
0,1384,78.1,12.3,25627,69678,50.1,4083.9,72100,75.55,1,0,0,0
1,3719,43.9,9.4,13326,43292,53.9,3305.9,54542,56.03,0,1,0,0
2,3553,37.4,10.7,9724,33731,50.6,2066.3,33216,41.32,1,0,0,0
3,3916,29.9,8.8,6402,24167,52.2,1966.7,32906,67.38,0,1,0,0
4,2480,31.5,10.5,8502,16751,66.1,1514.5,26573,80.19,0,0,0,1
5,2815,23.1,6.7,7340,16941,68.3,1541.9,25663,58.48,0,0,1,0
6,8360,46.3,8.2,4047,14347,53.6,1321.2,18350,72.25,0,0,1,0
7,6794,60.1,6.3,4562,14333,51.7,1272.7,18221,64.88,0,0,1,0
8,3049,19.5,12.1,4005,21149,53.4,967.5,15826,30.51,1,0,0,0
9,4647,31.5,9.2,3916,12815,65.1,1032.2,14542,55.30,0,1,0,0


In [7]:
# #Feature Selection
# y_df = train_data[['crime_rate']]
# # x_df = train_data[['graduates']]
# x_df = train_data.iloc[:,:-1]
# # y_train = y_train.values.reshape(1,-1)
# # x_train = x_train.reshape((1,-1))

In [8]:
x_df = train_data.drop("crime_rate",1)   #Feature Matrix
y_df = train_data["crime_rate"]

In [9]:
print("X Shape: {}".format(x_df.shape))
print("Y Shape: {}".format(y_df.shape))

X Shape: (99, 12)
Y Shape: (99,)


In [10]:
# x_train2 = x_train.values.reshape((-1,1))
y_df = y_df.values.reshape((-1,1))

In [11]:
print("X Shape: {}".format(x_df.shape))
print("Y Shape: {}".format(y_df.shape))

X Shape: (99, 12)
Y Shape: (99, 1)


In [12]:
X_train, X_test, y_train, y_test = train_test_split(x_df, y_df, test_size=0.2)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(79, 12) (79, 1)
(20, 12) (20, 1)


In [13]:
test_data = pd.read_csv("Standard Metropolitan Areas Data - model_prediction.csv")
test_data.head()

Unnamed: 0,land_area,percent_city,percent_senior,physicians,hospital_beds,graduates,labor,income,region,crime_rate
0,3650,34.6,11.1,2358,6224,62.9,539.3,7792,4,
1,4883,72.4,7.3,873,2236,64.9,185.2,2353,4,
2,1435,13.4,11.7,342,1076,44.7,156.8,2165,1,
3,2042,24.5,16.5,4071,10039,51.9,681.7,10166,3,
4,1630,41.9,10.7,724,4395,50.0,198.0,2445,3,


In [14]:
test_data.shape

(42, 10)

In [15]:
# plt.scatter(X_train, y_train)
# plt.ylabel("crime rate")
# plt.xlabel("no. of graduates")
# plt.show()

In [16]:
#Linear Regression
lin_reg = LinearRegression()
# lin_regr.fit(train_data[['graduates']], train_data[['crime_rate']])
linreg_model = lin_reg.fit(X_train, y_train)

In [17]:
print('Coefficients: \n', lin_reg.coef_)
print('intercept: \n', lin_reg.intercept_)

Coefficients: 
 [[ 1.29161810e-04 -1.30485363e-02  9.38978038e-02 -1.18497301e-03
  -2.55914750e-04  2.18310903e-01  1.04150181e-02  1.96421394e-04
  -1.48884079e+01 -2.55336059e+00  3.31369241e+00  1.41280761e+01]]
intercept: 
 [41.78151521]


In [18]:
#formula obtained for the trained model
def graph(formula, x_range):
    x = np.array(x_range)
    y = eval(formula)
    plt.plot(x, y)

In [19]:
#plotting the prediction line 
# graph('lin_regr.coef_*x + lin_regr.intercept_', range(20, 80))
print(lin_reg.score(X_train, y_train))

0.5492060297023915


In [20]:
print("Score:", linreg_model.score(X_test, y_test))

Score: -0.10509096498433768


In [21]:
# predictions = lin_reg.predict(X_test)
# predictions

In [22]:
sc_X = StandardScaler()
sc_y = StandardScaler()
scX_train = sc_X.fit_transform(X_train)
scy_train = sc_y.fit_transform(y_train)
scX_test = sc_X.fit_transform(X_test)
scy_test = sc_y.fit_transform(y_test)

In [23]:
#Support Vector Regression
svr = SVR(kernel = 'rbf')
svr_model = svr.fit(scX_train, scy_train)

  y = column_or_1d(y, warn=True)


In [24]:
print("Score:", svr_model.score(scX_test, scy_test))

Score: 0.45419824309295703
