In [1]:
import reader
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge, LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns

ImportError: No module named 'pandas'

#### Loading the dataset

In [None]:
# Apply log1p and then a square root (power 1/2) to everything the reader returns.
log_scaled = np.log1p(reader.get_all_data())
data = np.power(log_scaled, 1 / 2)
data.head()

### Data Visualization

#### Plotting histograms to see the data distribution of each feature

In [None]:
# Histogram grid for the columns of `data`: 10 bins each, x tick labels rotated by -45 degrees.
data.hist(bins=10, xrot=-45, figsize=(15, 15))
plt.show()

#### Finding correlation between all the features

In [None]:
# Pairwise correlation between columns; 'pearson' is the pandas default, spelled out here.
corrmat = data.corr(method='pearson')
corrmat

#### Plotting the correlation matrix using a heatmap

In [None]:
# Recompute the correlation matrix so this cell can run without the previous one.
corrmat = data.corr(method='pearson')

# Draw the matrix as a heatmap on an explicitly sized 9x8 figure.
f, ax = plt.subplots(figsize=(9, 8))
sns.heatmap(corrmat, cmap="YlGnBu", linewidths=0.1, ax=ax)

#### Plotting the correlation between features using a scatter plot matrix

In [None]:
# Grid of pairwise scatter plots over the columns of `data`.
pd.plotting.scatter_matrix(frame=data, figsize=(22, 22))
plt.show()

#### Plotting correlations between similar types of features

In [None]:
# Property crime on the x axis against violent crime on the y axis.
plt.scatter(x=data['Property Crime'], y=data['Violent Crime'])
plt.title('Correlation between Property Crime and Violent Crime')
plt.xlabel('Property Crime')
plt.ylabel('Violent Crime')
plt.show()

In [None]:
# Pull the two education columns out first so the plotting call stays short.
dropout_pct = data['Percent High School Dropouts']
no_degree_pct = data['Percent No Degree']

plt.scatter(dropout_pct, no_degree_pct)
plt.title('Correlation between percentage of people who dropped out of high school and people without degree')
plt.xlabel('Percentage of High School dropouts')
plt.ylabel('Percentage of people without degree')
plt.show()

In [None]:
# Mean household income (x) against median household income (y).
plt.scatter(
    data['Mean Income (Household)'],
    data['Median Income (Household)'],
)
plt.ylabel('Median Income per household')
plt.xlabel('Mean Income per household')
plt.title('Correlation between mean and median income per household')
plt.show()

In [None]:
# Mean household income (x) against the standard deviation of household income (y).
plt.scatter(data['Mean Income (Household)'], data['Income Standard Deviation (Household)'])
plt.xlabel('Mean Income per household')
plt.ylabel('Standard Deviation in Income per household')
# Title typo fixed ('standarad' -> 'standard') and wording made grammatical.
plt.title('Correlation between mean and standard deviation of income per household')
plt.show()

#### Plotting correlation between Per Capita Income, Education Level and Total Crime 

In [None]:
# Presumably imported for its side effect of making the '3d' projection available
# on older matplotlib releases; the name itself is not used below.
from mpl_toolkits import mplot3d
ax = plt.axes(projection='3d')

# Data for a three-dimensional line: a unit-radius helix running along z in [0, 115].
# NOTE(review): this helix is not derived from `data` — confirm it is an intended
# visual reference and not a leftover from example code.
zline = np.linspace(0, 115, 10000)
xline = np.sin(zline)
yline = np.cos(zline)
ax.plot3D(xline, yline, zline, 'gray')

# Data for three-dimensional scattered points; colour follows the z value.
zdata = data['Percent High School Dropouts']
xdata = data['Total Crime']
ydata = data['Per Capita Income']
ax.scatter3D(xdata, ydata, zdata, c=zdata, cmap='viridis', linewidth=10);
ax.set_xlabel('Total Crime')
ax.set_ylabel('Per Capita Income')
# The z axis shows the dropout column, so label it as such (it was mislabelled
# 'Percent No Degree', which is the column plotted by the next cell).
ax.set_zlabel('Percent High School Dropouts');

In [None]:
# Helix coordinates for the gray reference line (z spans 0..115 in 10000 steps).
zline = np.linspace(0, 115, 10000)
xline, yline = np.sin(zline), np.cos(zline)

# Columns shown as scattered points: crime on x, income on y, no-degree share on z.
xdata, ydata, zdata = (
    data['Total Crime'],
    data['Per Capita Income'],
    data['Percent No Degree'],
)

ax = plt.axes(projection='3d')
ax.plot3D(xline, yline, zline, 'gray')
# Points are coloured by their z value.
ax.scatter3D(xdata, ydata, zdata, c=zdata, cmap='viridis', linewidth=10);
ax.set_zlabel('Percent No Degree')
ax.set_ylabel('Per Capita Income')
ax.set_xlabel('Total Crime');

#### Getting the features and label and applying a few transformations

In [None]:
# Features requested with the 'dropout' and 'mean' options.
X = reader.get_features('dropout', 'mean')
# 'violent' label, transformed with log1p and then a square root (power 1/2).
y = np.power(np.log1p(reader.get_label('violent')), 1 / 2)

In [None]:
from collections import OrderedDict
class Row(object):
    """One line of the results table: a feature configuration plus one MSE slot per model.

    Every attribute starts as ``None`` and is filled in by the caller.
    """

    # Attribute names, in the order they are created and exported.
    _FIELDS = (
        'education_type',
        'income_type',
        'LinearRegMSE',
        'RidgeMSE',
        'DicisionTreeMSE',
        'KNeighbourMSE',
        'SVR_MSE',
        'RandomForestMSE',
        'BoostingMSE',
    )

    def __init__(self):
        """Create every field as an instance attribute set to ``None``."""
        for field_name in self._FIELDS:
            setattr(self, field_name, None)

    def toDict(self):
        """Return the current field values as a plain dict keyed by field name."""
        return {field_name: getattr(self, field_name) for field_name in self._FIELDS}

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ParameterGrid

import warnings
# NOTE(review): this silences every warning for the rest of the kernel session,
# not only the ones raised by the model fits below.
warnings.filterwarnings("ignore")

# Every combination of education feature and income feature is evaluated once.
param_grid = {"education_type" :["dropout", "degreeless"], "income_type" :["mean", "median", "percapita", "deviation"]}

# One dict per configuration; the frame is built once after the loop because
# DataFrame.append copies the frame on every call and no longer exists in pandas >= 2.0.
records = []

for param in ParameterGrid(param_grid):
    row = Row()
    row.education_type = param['education_type']
    row.income_type = param['income_type']
    X = reader.get_features(param['education_type'], param['income_type'])
    y = reader.get_label('violent')
    # Same label transform as used earlier in the notebook: log1p, then square root.
    y = np.power(np.log1p(y),1/2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    ## Scale input data
    # Fit the scaler on the training split only and reuse its statistics for the
    # test split; re-fitting on the test data would scale the two splits differently.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train.to_numpy())
    X_test = scaler.transform(X_test.to_numpy())

    ## Linear Regression
    linreg = LinearRegression()
    linreg.fit(X_train, y_train)
    y_pred=linreg.predict(X_test)
    row.LinearRegMSE = metrics.mean_squared_error(y_test, y_pred)

    ## Ridge Regression
    ridgereg = Ridge(alpha=1.0)
    ridgereg=ridgereg.fit(X_train,y_train)
    y_pred=ridgereg.predict(X_test)
    row.RidgeMSE = metrics.mean_squared_error(y_test, y_pred)

    ## Decision Tree (seeded so repeated runs give the same table)
    regr = DecisionTreeRegressor(max_depth=2, random_state=0)
    regr.fit(X_train, y_train)
    y_pred = regr.predict(X_test)
    row.DicisionTreeMSE = metrics.mean_squared_error(y_test, y_pred)

    ## Random Forest (seeded so repeated runs give the same table)
    regr = RandomForestRegressor(max_depth=2, random_state=0)
    regr.fit(X_train, y_train)
    y_pred = regr.predict(X_test)
    row.RandomForestMSE = metrics.mean_squared_error(y_test, y_pred)

    ### Boosting
    params = {'n_estimators': 100, 'max_depth': 2, 'random_state': 0}
    clf = GradientBoostingRegressor(**params)
    clf.fit(X_train, y_train)
    # Predict with the boosted model itself; without this line the score below
    # would be computed from the random-forest predictions left in y_pred.
    y_pred = clf.predict(X_test)
    row.BoostingMSE = metrics.mean_squared_error(y_test, y_pred)

    ### KNN
    neigh = KNeighborsRegressor(n_neighbors=3)
    neigh.fit(X_train, y_train)
    y_pred=neigh.predict(X_test)
    row.KNeighbourMSE = metrics.mean_squared_error(y_test, y_pred)

    ### SVR
    svr = SVR(gamma='auto')
    # The target is flattened to 1-D for SVR.
    svr = svr.fit(X_train, y_train.values.ravel())
    y_pred=svr.predict(X_test)
    row.SVR_MSE = metrics.mean_squared_error(y_test, y_pred)

    records.append(row.toDict())

# One row per (education_type, income_type) pair, one MSE column per model.
result = pd.DataFrame(records)

result