In [5]:
# importing libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import urllib.request
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV

### Getting column names

In [6]:
# getting column names
# Lines of the ".names" file that start with '@attribute' carry the column
# name as their second whitespace-separated token; collect them in file order.
names_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/communities/communities.names"
col_names = []

# The context manager guarantees the HTTP response is closed after parsing.
with urllib.request.urlopen(names_url) as communities_names:
    for line in communities_names:
        # The response yields raw bytes; decode before doing text matching.
        line = line.decode('utf-8')
        if line.startswith('@attribute'):
            # split() without an argument tolerates repeated spaces/tabs,
            # which split(" ") would turn into empty-string tokens.
            col_names.append(line.split()[1])

In [7]:
# Load the raw table. It is read without a header row (header=None), so the
# names collected in `col_names` are applied as the column labels.
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/communities/communities.data"
data = pd.read_csv(
    url,
    names=col_names,
    header=None,
)
data

Unnamed: 0,state,county,community,communityname,fold,population,householdsize,racepctblack,racePctWhite,racePctAsian,...,LandArea,PopDens,PctUsePubTrans,PolicCars,PolicOperBudg,LemasPctPolicOnPatr,LemasGangUnitDeploy,LemasPctOfficDrugUn,PolicBudgPerPop,ViolentCrimesPerPop
0,8,?,?,Lakewoodcity,1,0.19,0.33,0.02,0.90,0.12,...,0.12,0.26,0.20,0.06,0.04,0.9,0.5,0.32,0.14,0.20
1,53,?,?,Tukwilacity,1,0.00,0.16,0.12,0.74,0.45,...,0.02,0.12,0.45,?,?,?,?,0.00,?,0.67
2,24,?,?,Aberdeentown,1,0.00,0.42,0.49,0.56,0.17,...,0.01,0.21,0.02,?,?,?,?,0.00,?,0.43
3,34,5,81440,Willingborotownship,1,0.04,0.77,1.00,0.08,0.12,...,0.02,0.39,0.28,?,?,?,?,0.00,?,0.12
4,42,95,6096,Bethlehemtownship,1,0.01,0.55,0.02,0.95,0.09,...,0.04,0.09,0.02,?,?,?,?,0.00,?,0.03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1989,12,?,?,TempleTerracecity,10,0.01,0.40,0.10,0.87,0.12,...,0.01,0.28,0.05,?,?,?,?,0.00,?,0.09
1990,6,?,?,Seasidecity,10,0.05,0.96,0.46,0.28,0.83,...,0.02,0.37,0.20,?,?,?,?,0.00,?,0.45
1991,9,9,80070,Waterburytown,10,0.16,0.37,0.25,0.69,0.04,...,0.08,0.32,0.18,0.08,0.06,0.78,0,0.91,0.28,0.23
1992,25,17,72600,Walthamcity,10,0.08,0.51,0.06,0.87,0.22,...,0.03,0.38,0.33,0.02,0.02,0.79,0,0.22,0.18,0.19


### dropping unnecessary non-predictive features

In [8]:
# Identifier / bookkeeping columns that are removed before modelling.
non_predictive_columns = ['state', 'county', 'communityname', 'fold']
data = data.drop(columns=non_predictive_columns)

### dropping columns with missing (NaN) values

In [9]:
# Replace the '?' missing-value marker with NaN, then tabulate how many
# values are missing per column (only columns with at least one are listed).
# Nothing is dropped in this cell; it only reports the counts.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
data = data.replace('?', np.nan)
nan_values_data = pd.DataFrame({'amount': data.isnull().sum()})
nan_values_data = nan_values_data[nan_values_data.amount > 0]

nan_values_data

Unnamed: 0,amount
community,1177
OtherPerCap,1
LemasSwornFT,1675
LemasSwFTPerPop,1675
LemasSwFTFieldOps,1675
LemasSwFTFieldPerPop,1675
LemasTotalReq,1675
LemasTotReqPerPop,1675
PolicReqPerOffic,1675
PolicPerPop,1675


In [10]:
# Keep only fully populated columns: any column that contains at least one
# NaN is removed, regardless of how many values are missing.
data = data.dropna(axis="columns", how="any")
data

Unnamed: 0,population,householdsize,racepctblack,racePctWhite,racePctAsian,racePctHisp,agePct12t21,agePct12t29,agePct16t24,agePct65up,...,PctForeignBorn,PctBornSameState,PctSameHouse85,PctSameCity85,PctSameState85,LandArea,PopDens,PctUsePubTrans,LemasPctOfficDrugUn,ViolentCrimesPerPop
0,0.19,0.33,0.02,0.90,0.12,0.17,0.34,0.47,0.29,0.32,...,0.12,0.42,0.50,0.51,0.64,0.12,0.26,0.20,0.32,0.20
1,0.00,0.16,0.12,0.74,0.45,0.07,0.26,0.59,0.35,0.27,...,0.21,0.50,0.34,0.60,0.52,0.02,0.12,0.45,0.00,0.67
2,0.00,0.42,0.49,0.56,0.17,0.04,0.39,0.47,0.28,0.32,...,0.14,0.49,0.54,0.67,0.56,0.01,0.21,0.02,0.00,0.43
3,0.04,0.77,1.00,0.08,0.12,0.10,0.51,0.50,0.34,0.21,...,0.19,0.30,0.73,0.64,0.65,0.02,0.39,0.28,0.00,0.12
4,0.01,0.55,0.02,0.95,0.09,0.05,0.38,0.38,0.23,0.36,...,0.11,0.72,0.64,0.61,0.53,0.04,0.09,0.02,0.00,0.03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1989,0.01,0.40,0.10,0.87,0.12,0.16,0.43,0.51,0.35,0.30,...,0.22,0.28,0.34,0.48,0.39,0.01,0.28,0.05,0.00,0.09
1990,0.05,0.96,0.46,0.28,0.83,0.32,0.69,0.86,0.73,0.14,...,0.53,0.25,0.17,0.10,0.00,0.02,0.37,0.20,0.00,0.45
1991,0.16,0.37,0.25,0.69,0.04,0.25,0.35,0.50,0.31,0.54,...,0.25,0.68,0.61,0.79,0.76,0.08,0.32,0.18,0.91,0.23
1992,0.08,0.51,0.06,0.87,0.22,0.10,0.58,0.74,0.63,0.41,...,0.45,0.64,0.54,0.59,0.52,0.03,0.38,0.33,0.22,0.19


## Linear regression

In [11]:
# Separate the regression target from the feature matrix.
target_column = "ViolentCrimesPerPop"
X = data.drop(columns=[target_column])
y = data[target_column]

In [12]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=39,
)

In [13]:
# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits so the test set does not influence it.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [14]:
# Baseline: unregularised linear regression fitted on the scaled training data.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

LinearRegression()

In [23]:
# Evaluate the baseline on the held-out split; RMSE is the square root of
# the mean squared error.
y_predicted = model.predict(X_test)
mse_linear_reg = mean_squared_error(y_true=y_test, y_pred=y_predicted)
rmse_linear_reg = np.sqrt(mse_linear_reg)
print("RMSE: ", rmse_linear_reg)

RMSE:  0.13607303089495876


## Ridge Regression

In [16]:
# Ridge regression with its constructor defaults (alpha defaults to 1.0).
ridge_model = Ridge()
ridge_model.fit(X=X_train, y=y_train)

Ridge()

In [17]:
# Test-set RMSE of the default-alpha Ridge model.
y_predicted_ridge = ridge_model.predict(X_test)
rmse_ridge_default = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_predicted_ridge))
print("RMSE: ", rmse_ridge_default)

RMSE:  0.13501705847229364


Above we created a Ridge model with the default alpha value of 1.0. Now let's find the optimal value of the alpha parameter using RidgeCV.

In [18]:
# Candidate regularisation strengths; RidgeCV picks the one with the best
# cross-validated (negated) mean squared error on the training data.
candidate_alphas = [0.001, 0.01, 0.1, 1.0, 10.0, 50.0, 100.0, 200.0]
ridge_cv_model = RidgeCV(
    alphas=candidate_alphas,
    scoring="neg_mean_squared_error",
)
ridge_cv_model.fit(X_train, y_train)

RidgeCV(alphas=array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 5.e+01, 1.e+02, 2.e+02]),
        scoring='neg_mean_squared_error')

In [19]:
# Report the alpha selected by the cross-validated search.
best_alpha = ridge_cv_model.alpha_
print("The optimal value of alpha is: ", best_alpha)

The optimal value of alpha is:  50.0


In [24]:
# Refit Ridge with the alpha selected by RidgeCV and score it on the test
# split. Note this rebinds `ridge_model` and `y_predicted_ridge`.
ridge_model = Ridge(alpha=ridge_cv_model.alpha_)
ridge_model.fit(X_train, y_train)
y_predicted_ridge = ridge_model.predict(X_test)
mse_ridge_reg = mean_squared_error(y_true=y_test, y_pred=y_predicted_ridge)
rmse_ridge_reg = np.sqrt(mse_ridge_reg)
print("RMSE: ", rmse_ridge_reg)

RMSE:  0.13326160334076625


In [25]:
# Side-by-side summary of the two test-set RMSE values computed above.
summary = f"As a result: \n RMSE in Linear Regression is {rmse_linear_reg}\n RMSE in Ridge Regression is {rmse_ridge_reg}"
print(summary)

As a result: 
 RMSE in Linear Regression is 0.13607303089495876
 RMSE in Ridge Regression is 0.13326160334076625
