In [None]:
import pandas as pd

def load_crime_dataset(path='crimeDataset.txt'):
  """Load the crime dataset and split it into features and target.

  The file is read as comma-separated text in which '?' marks a missing
  value. Only a fixed positional subset of columns is kept (5, 6, 11-25,
  32-102 and 145), and every row that has a missing value in any kept
  column is dropped.

  Parameters
  ----------
  path : str or path-like, default 'crimeDataset.txt'
      Location of the data file. The default matches the previously
      hard-coded file name, so existing zero-argument calls still work.

  Returns
  -------
  tuple of (pandas.DataFrame, pandas.Series)
      ``X_crime`` holds every kept column except the last one;
      ``y_crime`` is the 'ViolentCrimesPerPop' column.
      NOTE(review): this presumes column 145 of the file is
      'ViolentCrimesPerPop' -- confirm against the data file.
  """
  crime = pd.read_csv(path, sep=',', na_values='?')
  columns_to_keep = [5, 6] + list(range(11, 26)) + list(range(32, 103)) + [145]
  crime = crime.iloc[:, columns_to_keep].dropna()

  # The last kept column (145) is the target; everything before it is a
  # feature. Slicing relative to the end avoids a hard-coded feature count
  # that would have to be kept in sync with columns_to_keep by hand.
  X_crime = crime.iloc[:, :-1]
  y_crime = crime['ViolentCrimesPerPop']
  return (X_crime, y_crime)

# Load the feature matrix and target once for the cells below.
X_crime, y_crime = load_crime_dataset()

In [None]:
from sklearn.model_selection import train_test_split

# Split into train and test partitions; random_state is pinned so the
# partition is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
  X_crime, y_crime, random_state=0
)

In [None]:
# Linear regression fitted on the unscaled training features
from sklearn.linear_model import LinearRegression

reg = LinearRegression()
linreg = reg.fit(X_train, y_train)

# Learned parameters: one weight per feature plus the intercept.
print("linear model coeff (w): \n ", linreg.coef_)
print("linear model intercept (b): ", linreg.intercept_)

# Visual gap between the parameter dump and the scores.
print("\n\n\n")

# R-squared on the data the model was fitted on vs. the held-out data.
print(f'R-squard score (training):  {linreg.score(X_train, y_train):.3f}')
print(f'R-squard score (testing):  {linreg.score(X_test, y_test):.3f}')

linear model coeff (w): 
  [ 1.61892346e-03 -9.43009110e+01  1.36067510e+01 -3.13380670e+01
 -8.15482726e-02 -1.69455128e+01 -2.42730375e-03  1.53013232e+00
 -1.39193248e-02 -7.72112833e+00  2.28112354e+01 -5.65708295e+00
  9.34751364e+00  2.06969566e-01 -7.43413626e+00  9.65856476e-03
  4.38030290e-03  4.79754625e-03 -4.46469212e+00 -1.60907140e+01
  8.82778012e+00 -5.06734503e-01 -1.42198055e+00  8.17551991e+00
 -3.87048268e+00 -3.54209213e+00  4.48758304e+00  9.30645715e+00
  1.73644996e+02  1.18220766e+01  1.51120836e+02 -3.29613007e+02
 -1.35343395e+02  6.95380108e-01 -2.38369008e+01  2.77038981e+00
  3.82248925e-01  4.38813358e+00 -1.06410851e+01 -4.92294176e-03
  4.14031827e+01 -1.16206866e-03  1.18568968e+00  1.75418465e+00
 -3.68283678e+00  1.59679443e+00 -8.42180230e+00 -3.79703897e+01
  4.74076990e+01 -2.50768374e+01 -2.88246410e-01 -3.65633234e+01
  1.89516080e+01 -4.53336736e+01  6.82698598e+02  1.04478671e+02
 -3.28575414e+02 -3.14364068e+01  2.74053494e+01  5.12336432e+0

In [None]:
# Ridge regression model fitted on the unscaled training features
from sklearn.linear_model import Ridge

# Same split call (and same random_state) as the earlier split cell.
X_train, X_test, y_train, y_test = train_test_split(
  X_crime, y_crime, random_state=0
)

linridge = Ridge(alpha=20.0).fit(X_train, y_train)

print('Crime dataset fitted by Ridge Regression \n')
print(f'Ridge regression linear model intercept: {linridge.intercept_}')
print(f'Ridge regression linear model coeff: \n{linridge.coef_}')
# R-squared on the training data vs. the held-out data.
print(f'\nR-squard score (training):  {linridge.score(X_train, y_train):.3f}')
print(f'R-squard score (testing):  {linridge.score(X_test, y_test):.3f}')


Crime dataset fitted by Ridge Regression 

Ridge regression linear model intercept: -3352.423035846185
Ridge regression linear model coeff: 
[ 1.95091438e-03  2.19322667e+01  9.56286607e+00 -3.59178973e+01
  6.36465325e+00 -1.96885471e+01 -2.80715856e-03  1.66254486e+00
 -6.61426604e-03 -6.95450680e+00  1.71944731e+01 -5.62819154e+00
  8.83525114e+00  6.79085746e-01 -7.33614221e+00  6.70389803e-03
  9.78505502e-04  5.01202169e-03 -4.89870524e+00 -1.79270062e+01
  9.17572382e+00 -1.24454193e+00  1.21845360e+00  1.03233089e+01
 -3.78037278e+00 -3.73428973e+00  4.74595305e+00  8.42696855e+00
  3.09250005e+01  1.18644167e+01 -2.05183675e+00 -3.82210450e+01
  1.85081589e+01  1.52510829e+00 -2.20086608e+01  2.46283912e+00
  3.29328703e-01  4.02228467e+00 -1.12903533e+01 -4.69567413e-03
  4.27046505e+01 -1.22507167e-03  1.40795790e+00  9.35041855e-01
 -3.00464253e+00  1.12390514e+00 -1.82487653e+01 -1.54653407e+01
  2.41917002e+01 -1.32497562e+01 -4.20113118e-01 -3.59710660e+01
  1.29786751e+

In [None]:
# Ridge regression model fitted on min-max scaled features
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# The scaler is fitted on the training split only; the test split is
# transformed with the already-fitted scaler.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Note: this rebinds `linridge`, replacing the model from the unscaled cell.
linridge = Ridge(alpha=20.0).fit(X_train_scaled, y_train)

print('#scaled Crime dataset fitted by Ridge Regression \n')
print(f'Ridge regression linear model intercept: {linridge.intercept_}')
print(f'Ridge regression linear model coeff: \n{linridge.coef_}')
print(f'\nR-squard score (training):  {linridge.score(X_train_scaled, y_train):.3f}')
print(f'R-squard score (testing):  {linridge.score(X_test_scaled, y_test):.3f}')
# Count of coefficients that are not exactly zero.
print(f'Number of non-zero feature: {np.count_nonzero(linridge.coef_)}')

#scaled Crime dataset fitted by Ridge Regression 

Ridge regression linear model intercept: 933.3906385044155
Ridge regression linear model coeff: 
[  88.68827454   16.48947987  -50.30285445  -82.90507574  -65.89507244
   -2.27674244   87.74108514  150.94862182   18.8802613   -31.05554992
  -43.13536109 -189.44266328   -4.52658099  107.97866804  -76.53358414
    2.86032762   34.95230077   90.13523036   52.46428263  -62.10898424
  115.01780357    2.66942023    6.94331369   -5.66646499 -101.55269144
  -36.9087526    -8.7053343    29.11999068  171.25963057   99.36919476
   75.06611841  123.63522539   95.24316483 -330.61044265 -442.30179004
 -284.49744001 -258.37150609   17.66431072 -101.70717151  110.64762887
  523.13611718   24.8208959     4.86533322  -30.46775619   -3.51753937
   50.57947231   10.84840601   18.27680946   44.11189865   58.33588176
   67.08698975  -57.93524659  116.1446052    53.81163718   49.01607711
   -7.62262031   55.14288543  -52.08878272  123.39291017   77.12562171


In [None]:
# Refit ridge regression on the scaled features for a range of alpha values
# and report, for each one, how many coefficients exceed 1.0 in magnitude
# together with the training and test R-squared.
print('Ridge regression: effect of lambda regularization parameter \n')
# NOTE(review): scikit-learn's Ridge documentation advises against alpha=0
# for numerical reasons; it is kept here as the first value of the sweep.
for this_alpha in [0, 1, 10, 20, 50, 100, 1000]:
  # The regularization parameter lambda is called alpha in sklearn
  linridge = Ridge(alpha=this_alpha).fit(X_train_scaled, y_train)
  r2_train = linridge.score(X_train_scaled, y_train)
  r2_test = linridge.score(X_test_scaled, y_test)
  num_coeff_bigger = np.sum(np.abs(linridge.coef_) > 1.0)
  # The message is split with implicit string concatenation. The previous
  # single literal contained '\ ', an invalid escape sequence that printed
  # a stray backslash and raises a warning on current Python versions.
  print(('Alpha = {:.2f}\nnum abs(coeff) > 1.0: {}, '
         'r-squared training: {:.2f}, r-squared test: {:.2f}\n')
        .format(this_alpha, num_coeff_bigger, r2_train, r2_test))

Ridge regression: effect of lambda regularization parameter 

Alpha = 0.00
num abs(coeff) > 1.0: 87, \ r-squared training: 0.67, r-squared test: 0.49

Alpha = 1.00
num abs(coeff) > 1.0: 87, \ r-squared training: 0.66, r-squared test: 0.56

Alpha = 10.00
num abs(coeff) > 1.0: 87, \ r-squared training: 0.63, r-squared test: 0.59

Alpha = 20.00
num abs(coeff) > 1.0: 88, \ r-squared training: 0.61, r-squared test: 0.60

Alpha = 50.00
num abs(coeff) > 1.0: 86, \ r-squared training: 0.58, r-squared test: 0.58

Alpha = 100.00
num abs(coeff) > 1.0: 87, \ r-squared training: 0.55, r-squared test: 0.55

Alpha = 1000.00
num abs(coeff) > 1.0: 84, \ r-squared training: 0.31, r-squared test: 0.30



In [None]:
# Lasso regression model fitted on the min-max scaled features

from sklearn.linear_model import Lasso

linlasso = Lasso(alpha=2.0, max_iter=10000).fit(X_train_scaled, y_train)

print('Crime dataset')
print(f'lasso regression linear model intercept: {linlasso.intercept_}')
print(f'lasso regression linear model coeff: \n{linlasso.coef_}')
# Count of coefficients that are not exactly zero.
print(f'\nNumber of non-zero feature: {np.count_nonzero(linlasso.coef_)}')
# R-squared on the training data vs. the held-out data.
print(f'R-squard score (training):  {linlasso.score(X_train_scaled, y_train):.3f}')
print(f'R-squard score (testing):  {linlasso.score(X_test_scaled, y_test):.3f}')

Crime dataset
lasso regression linear model intercept: 1186.612061998579
lasso regression linear model coeff: 
[    0.             0.            -0.          -168.18346054
    -0.            -0.             0.           119.6938194
     0.            -0.             0.          -169.67564456
    -0.             0.            -0.             0.
     0.             0.            -0.            -0.
     0.            -0.             0.             0.
   -57.52991966    -0.            -0.             0.
   259.32889226    -0.             0.             0.
     0.            -0.         -1188.7396867     -0.
    -0.            -0.          -231.42347299     0.
  1488.36512229     0.            -0.            -0.
    -0.             0.             0.             0.
     0.             0.            -0.             0.
    20.14419415     0.             0.             0.
     0.             0.           339.04468804     0.
     0.           459.53799903    -0.             0.
   122.69221826   