In [4]:
import numpy as np
from scipy.linalg import svd
import seaborn as sns
import pandas as pd
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error 
from sklearn.model_selection import GridSearchCV

In [5]:
# To start, I need to have a look at the data itself. I do this by adding the column names to a dataframe.
column_names = ['state', 'county', 'community', 'communityname', 'fold', 'population', 'householdsize', 'racepctblack', 'racePctWhite', 'racePctAsian', 'racePctHisp', 'agePct12t21', 'agePct12t29', 'agePct16t24', 'agePct65up', 'numbUrban', 'pctUrban', 'medIncome', 'pctWWage', 'pctWFarmSelf', 'pctWInvInc', 'pctWSocSec', 'pctWPubAsst', 'pctWRetire', 'medFamInc', 'perCapInc', 'whitePerCap', 'blackPerCap', 'indianPerCap', 'AsianPerCap', 'OtherPerCap', 'HispPerCap', 'NumUnderPov', 'PctPopUnderPov', 'PctLess9thGrade', 'PctNotHSGrad', 'PctBSorMore', 'PctUnemployed', 'PctEmploy', 'PctEmplManu', 'PctEmplProfServ', 'PctOccupManu', 'PctOccupMgmtProf', 'MalePctDivorce', 'MalePctNevMarr', 'FemalePctDiv', 'TotalPctDiv', 'PersPerFam', 'PctFam2Par', 'PctKids2Par', 'PctYoungKids2Par', 'PctTeen2Par', 'PctWorkMomYoungKids', 'PctWorkMom', 'NumIlleg', 'PctIlleg', 'NumImmig', 'PctImmigRecent', 'PctImmigRec5', 'PctImmigRec8', 'PctImmigRec10', 'PctRecentImmig', 'PctRecImmig5', 'PctRecImmig8', 'PctRecImmig10', 'PctSpeakEnglOnly', 'PctNotSpeakEnglWell', 'PctLargHouseFam', 'PctLargHouseOccup', 'PersPerOccupHous', 'PersPerOwnOccHous', 'PersPerRentOccHous', 'PctPersOwnOccup', 'PctPersDenseHous', 'PctHousLess3BR', 'MedNumBR', 'HousVacant', 'PctHousOccup', 'PctHousOwnOcc', 'PctVacantBoarded', 'PctVacMore6Mos', 'MedYrHousBuilt', 'PctHousNoPhone', 'PctWOFullPlumb', 'OwnOccLowQuart', 'OwnOccMedVal', 'OwnOccHiQuart', 'RentLowQ', 'RentMedian', 'RentHighQ', 'MedRent', 'MedRentPctHousInc', 'MedOwnCostPctInc', 'MedOwnCostPctIncNoMtg', 'NumInShelters', 'NumStreet', 'PctForeignBorn', 'PctBornSameState', 'PctSameHouse85', 'PctSameCity85', 'PctSameState85', 'LemasSwornFT', 'LemasSwFTPerPop', 'LemasSwFTFieldOps', 'LemasSwFTFieldPerPop', 'LemasTotalReq', 'LemasTotReqPerPop', 'PolicReqPerOffic', 'PolicPerPop', 'RacialMatchCommPol', 'PctPolicWhite', 'PctPolicBlack', 'PctPolicHisp', 'PctPolicAsian', 'PctPolicMinor', 'OfficAssgnDrugUnits', 'NumKindsDrugsSeiz', 'PolicAveOTWorked', 'LandArea', 'PopDens', 'PctUsePubTrans', 'PolicCars', 'PolicOperBudg', 'LemasPctPolicOnPatr', 'LemasGangUnitDeploy', 'LemasPctOfficDrugUn', 'PolicBudgPerPop', 'ViolentCrimesPerPop']
data = pd.read_csv('data/communities.data', sep=',')
data.columns = column_names
data = data.replace('?', np.nan) #Replace '?' for NaN values so I can use built-in pandas functions.

In [6]:
data.head(10)

Unnamed: 0,state,county,community,communityname,fold,population,householdsize,racepctblack,racePctWhite,racePctAsian,...,LandArea,PopDens,PctUsePubTrans,PolicCars,PolicOperBudg,LemasPctPolicOnPatr,LemasGangUnitDeploy,LemasPctOfficDrugUn,PolicBudgPerPop,ViolentCrimesPerPop
0,53,,,Tukwilacity,1,0.0,0.16,0.12,0.74,0.45,...,0.02,0.12,0.45,,,,,0.0,,0.67
1,24,,,Aberdeentown,1,0.0,0.42,0.49,0.56,0.17,...,0.01,0.21,0.02,,,,,0.0,,0.43
2,34,5.0,81440.0,Willingborotownship,1,0.04,0.77,1.0,0.08,0.12,...,0.02,0.39,0.28,,,,,0.0,,0.12
3,42,95.0,6096.0,Bethlehemtownship,1,0.01,0.55,0.02,0.95,0.09,...,0.04,0.09,0.02,,,,,0.0,,0.03
4,6,,,SouthPasadenacity,1,0.02,0.28,0.06,0.54,1.0,...,0.01,0.58,0.1,,,,,0.0,,0.14
5,44,7.0,41500.0,Lincolntown,1,0.01,0.39,0.0,0.98,0.06,...,0.05,0.08,0.06,,,,,0.0,,0.03
6,6,,,Selmacity,1,0.01,0.74,0.03,0.46,0.2,...,0.01,0.33,0.0,,,,,0.0,,0.55
7,21,,,Hendersoncity,1,0.03,0.34,0.2,0.84,0.02,...,0.04,0.17,0.04,,,,,0.0,,0.53
8,29,,,Claytoncity,1,0.01,0.4,0.06,0.87,0.3,...,0.0,0.47,0.11,,,,,0.0,,0.15
9,6,,,DalyCitycity,1,0.13,0.71,0.15,0.07,1.0,...,0.02,1.0,1.0,,,,,0.0,,0.24


In [7]:
#Looking at the data to get a sense of what cleaning is needed.
#To start, the dataset includes a data description that notes which columns are predictive, so I will drop those columns.
dropped = ['state', 'county', 'community', 'communityname', 'fold']
data2 = data.drop(axis = 1, columns = dropped)
data2.info(verbose = True, show_counts = True)
data.info(verbose = True, show_counts = True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1993 entries, 0 to 1992
Data columns (total 123 columns):
 #    Column                 Non-Null Count  Dtype  
---   ------                 --------------  -----  
 0    population             1993 non-null   float64
 1    householdsize          1993 non-null   float64
 2    racepctblack           1993 non-null   float64
 3    racePctWhite           1993 non-null   float64
 4    racePctAsian           1993 non-null   float64
 5    racePctHisp            1993 non-null   float64
 6    agePct12t21            1993 non-null   float64
 7    agePct12t29            1993 non-null   float64
 8    agePct16t24            1993 non-null   float64
 9    agePct65up             1993 non-null   float64
 10   numbUrban              1993 non-null   float64
 11   pctUrban               1993 non-null   float64
 12   medIncome              1993 non-null   float64
 13   pctWWage               1993 non-null   float64
 14   pctWFarmSelf           1993 non-null  

In [8]:
# Since the columns missing data are missing more than 50% of the data from the other columns, 
#    I will drop them for now to ensure that I have enough data for a robust analysis.
dropped = ['OtherPerCap', 'LemasSwornFT', 'LemasSwFTPerPop', 'LemasSwFTFieldOps', 'LemasSwFTFieldPerPop', 'LemasTotalReq', 'LemasTotReqPerPop', 'PolicReqPerOffic', 'PolicPerPop', 'RacialMatchCommPol', 'PctPolicWhite', 'PctPolicBlack', 'PctPolicHisp', 'PctPolicAsian', 'PctPolicMinor', 'OfficAssgnDrugUnits', 'NumKindsDrugsSeiz', 'PolicAveOTWorked', 'PolicCars', 'PolicOperBudg', 'LemasPctPolicOnPatr', 'LemasGangUnitDeploy', 'PolicBudgPerPop']
data3 = data2.drop(axis = 1, columns = dropped)
data3.info(verbose = True, show_counts = True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1993 entries, 0 to 1992
Data columns (total 100 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   population             1993 non-null   float64
 1   householdsize          1993 non-null   float64
 2   racepctblack           1993 non-null   float64
 3   racePctWhite           1993 non-null   float64
 4   racePctAsian           1993 non-null   float64
 5   racePctHisp            1993 non-null   float64
 6   agePct12t21            1993 non-null   float64
 7   agePct12t29            1993 non-null   float64
 8   agePct16t24            1993 non-null   float64
 9   agePct65up             1993 non-null   float64
 10  numbUrban              1993 non-null   float64
 11  pctUrban               1993 non-null   float64
 12  medIncome              1993 non-null   float64
 13  pctWWage               1993 non-null   float64
 14  pctWFarmSelf           1993 non-null   float64
 15  pct

In [9]:
# I want to know initially if there are any features that are highly correlated to my target feature
corr_matrix = data3.corr()
corr_matrix['ViolentCrimesPerPop'].sort_values().drop(columns = ['ViolentCrimesPerPop'], axis = 0)

PctKids2Par           -0.738445
PctFam2Par            -0.706713
racePctWhite          -0.684787
PctYoungKids2Par      -0.666094
PctTeen2Par           -0.661598
                         ...   
FemalePctDiv           0.556472
pctWPubAsst            0.574690
racepctblack           0.631279
PctIlleg               0.737965
ViolentCrimesPerPop    1.000000
Name: ViolentCrimesPerPop, Length: 100, dtype: float64

In [10]:
# Now all of my columns are numeric, and they have enough data for analysis.
# Nest I need to separate the target value from the other columns and start my analysis.

In [11]:
X = data3.drop(['ViolentCrimesPerPop'], axis = 1)
y = data3['ViolentCrimesPerPop']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

In [12]:
# To start, I want a baseline for my regression, so I fit a simple linear regression with standard settings.
model = LinearRegression().fit(X_train, y_train)

In [14]:
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print('Train score: ' + str(train_score))
print('Test score:  ' + str(test_score))
# From this simple regression, I can see that accuracy is decent for real world data that is hard to understand.
# Training data is significantly lower than train data, so overfitting is an issue.

Train score: 0.691832128893537
Test score:  0.6765794475472465


In [15]:
#Performing a Lasso regression and analyzing the results.
lasso_pipe = Pipeline([('poly_features', PolynomialFeatures()), ('lasso', Lasso())])
parameters = {'poly_features__degree':[1, 2, 3]}
lasso_grid = GridSearchCV(lasso_pipe, param_grid = parameters).fit(X_train, y_train)

In [16]:
#Get the best score across the parameters.
lasso_best_score = lasso_grid.best_score_

In [17]:
#Performing a Ridge regression and analyzing the results.
ridge_pipe = Pipeline([('poly_features', PolynomialFeatures()), ('ridge', Ridge())])
parameters = {'poly_features__degree':[1, 2, 3]}
ridge_grid = GridSearchCV(ridge_pipe, param_grid = parameters).fit(X_train, y_train)

In [18]:
#Get the best score across the parameters.
ridge_best_score = lasso_grid.best_score_

In [19]:
print("Lasso best score: " + str(lasso_best_score))
print("Ridge best score: " + str(ridge_best_score))

Lasso best score: -0.003449826517250676
Ridge best score: -0.003449826517250676
