In [1]:
#Importing libs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
# Load the raw data file; '?' entries are read as missing values
crime = pd.read_csv('CommViolPredUnnormalizedData.txt', na_values='?')

# Positional indices of the columns retained for modelling (89 in total);
# index 145 is listed last, so it becomes the final retained column
columns_to_keep = [5, 6, *range(11, 26), *range(32, 103), 145]

# Keep only those columns, then discard every row that still has a missing value
crime = crime.iloc[:, columns_to_keep].dropna()

# Features: the first 88 retained columns; target: the 'ViolentCrimesPerPop' column
X_crime = crime.iloc[:, :88]
y_crime = crime['ViolentCrimesPerPop']

In [6]:
#Importing important modules
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge

In [7]:
# Partition the crime features/target into train and test subsets (fixed seed for repeatability)
(X_train, X_test,
 y_train, y_test) = train_test_split(X_crime, y_crime, random_state=0)

### Ridge Regression

In [15]:
# Ridge regression with alpha=20, trained directly on the unscaled training features
rid = Ridge(alpha=20.0).fit(X_train, y_train)
# Last expression of the cell: predictions for the unscaled test features
rid.predict(X_test)

array([ 3.90204891e+02,  1.47281014e+03,  5.61021084e+02,  1.03074513e+03,
        5.78884352e+02,  3.46080107e+01,  2.18086186e+02,  2.30657337e+02,
        3.73426843e+02,  6.26480404e+02,  3.12791034e+02,  1.65741598e+03,
        2.17954187e+02,  8.06845018e+02,  4.68547837e+02,  1.30241159e+02,
        9.59917713e+02,  9.21976129e+01,  9.22291624e+02,  2.45546791e+03,
        4.09847031e+02,  3.07642207e+03,  6.72940332e+02,  5.44614228e+02,
        4.06997361e+02,  1.19968336e+03, -6.34216107e+01,  2.27990280e+02,
        2.53994786e+02,  8.33207262e+02,  2.23430051e+02,  1.44001971e+03,
        3.54626094e+02,  1.38205676e+02,  7.73296001e+02,  8.65980492e+02,
        5.46675295e+02, -5.29741945e+01,  8.51057463e+02,  1.37898382e+03,
        8.36241479e+02,  2.36242201e+02,  4.39850700e+02,  7.25518764e+03,
        3.93887413e+02,  2.06599092e+02, -5.07160300e+01,  4.86741564e+02,
        1.81288560e+03,  1.35659023e+03,  2.03796319e+02,  1.88424115e+02,
        6.66083348e+02,  

In [16]:
# Score of the unscaled Ridge model on the held-out test split
ridge_raw_r2 = rid.score(X_test, y_test)
print("R2 Score : ", ridge_raw_r2)

R2 Score :  0.4940490145966786


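The score obtained is similar to what plain linear regression would give. Ridge regression has not been very effective here because the features were not preprocessed: the penalty acts on the size of the coefficients, so features measured on very different scales are penalised unevenly.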

Preprocessing — scaling, normalizing or standardizing the data, depending on what the model requires — is therefore needed.

### Ridge Regression with Normalization

In [17]:
#Importing important module
from sklearn.preprocessing import MinMaxScaler

In [18]:
# Min-max scaler created with default settings; it is fitted on the training features in a later cell
scaler = MinMaxScaler()

In [19]:
# Learn the scaling parameters from the training features only, then apply them to that same data
X_train_scaled = scaler.fit(X_train).transform(X_train)

By fitting, the scaler object learns the parameters required for scaling (for `MinMaxScaler`, each feature's minimum and maximum). Then, by transforming, the scaler uses those parameters to rescale every value of each feature.

In [20]:
# Reuse the already-fitted scaler on the test features (transform only, no refit)
X_test_scaled = scaler.transform(X_test)

The test set must be scaled with the parameters learnt from the training set, so it is only transformed. If `fit_transform` were used instead, the scaler would learn new parameters from the test data and apply those, so the training and test features would no longer be on the same scale.

In [21]:
# Same Ridge configuration (alpha=20), now trained on the min-max-scaled training features
rid = Ridge(alpha=20.0).fit(X_train_scaled, y_train)
# Last expression of the cell: predictions for the scaled test features
rid.predict(X_test_scaled)

array([ 4.45900044e+02,  1.35415068e+03,  5.88590501e+02,  1.06502611e+03,
        7.47180461e+02,  1.24403353e+02,  2.38069489e+02,  1.64707826e+02,
        4.59128000e+02,  5.36210791e+02,  1.68114702e+02,  1.48564446e+03,
        2.03639594e+02,  8.35142771e+02,  6.20864431e+02,  1.83346018e+02,
        9.11852149e+02,  1.86550834e+02,  9.22609358e+02,  2.07136865e+03,
        4.00968514e+02,  2.36547698e+03,  7.81612682e+02,  6.75606475e+02,
        3.85973354e+02,  1.26585624e+03,  4.18417149e+01,  3.22484905e+02,
        3.05209335e+02,  7.63649179e+02,  3.17226218e+01,  1.30311772e+03,
        2.75979586e+02,  1.37942487e+02,  7.09441838e+02,  8.83830468e+02,
        5.55854605e+02, -2.63695221e+00,  1.04822848e+03,  1.20841248e+03,
        1.05263918e+03,  2.51055122e+02,  4.43845515e+02,  3.96129908e+03,
        4.02841711e+02,  3.62447036e+02,  9.26556220e+01,  4.48057656e+02,
        1.46807210e+03,  1.37070914e+03,  1.85138108e+02,  2.15458945e+02,
        6.65721200e+02,  

In [22]:
# Score of the Ridge model trained on scaled features, evaluated on the scaled test split
ridge_scaled_r2 = rid.score(X_test_scaled, y_test)
print("R2 Score : ", ridge_scaled_r2)

R2 Score :  0.5986066019999294


The score has improved substantially: R² rose from about 0.49 without scaling to about 0.60 with min-max scaling.

### Lasso Regression with Normalization

In [23]:
from sklearn.linear_model import Lasso

In [27]:
# Lasso regression with alpha=2, trained on the min-max-scaled training features
lass = Lasso(alpha=2.0).fit(X_train_scaled, y_train)
# Last expression of the cell: predictions for the scaled test features
lass.predict(X_test_scaled)

array([ 4.40617026e+02,  1.48434745e+03,  6.63967421e+02,  9.97602065e+02,
        6.80199615e+02,  1.46974743e+02,  2.63983743e+02,  1.63966026e+02,
        4.28635408e+02,  5.09308793e+02,  1.22463614e+02,  1.61856020e+03,
        2.00905513e+02,  8.23603404e+02,  6.34423670e+02,  2.37044830e+02,
        9.59348419e+02,  1.55110516e+02,  9.26234944e+02,  2.32252753e+03,
        4.36129719e+02,  2.65999873e+03,  5.91564132e+02,  6.12355294e+02,
        4.77408483e+02,  1.22081882e+03,  6.02936584e+01,  2.72079898e+02,
        3.54152096e+02,  8.90593034e+02,  1.21575998e+02,  1.40990857e+03,
        2.66066004e+02,  1.21279319e+02,  6.72876636e+02,  8.54633228e+02,
        4.38824111e+02,  3.63112027e+01,  9.36909884e+02,  1.27829373e+03,
        9.12315393e+02,  2.36813742e+02,  4.08957777e+02,  3.82832787e+03,
        3.19922994e+02,  3.27974565e+02,  1.37626150e+02,  3.79523061e+02,
        1.69430272e+03,  1.27903565e+03,  9.80235219e+01,  1.89181300e+02,
        5.88004287e+02,  

In [28]:
# Score of the Lasso model on the scaled test split
lasso_scaled_r2 = lass.score(X_test_scaled, y_test)
print("R2 Score : ", lasso_scaled_r2)

R2 Score :  0.6237725857015401


Alpha can be tuned further to improve the score.

### Polynomial Regression

In [34]:
# Synthetic Friedman #1 regression data for the polynomial-feature experiment:
# 100 samples, 7 features, fixed seed so the data is the same on every run
from sklearn.datasets import make_friedman1

X_F1, y_F1 = make_friedman1(n_samples=100, n_features=7, random_state=0)

In [31]:
#Important modules are imported
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

In [35]:
# Partition the raw Friedman data into train and test subsets (fixed seed);
# this rebinds the split variables previously holding the crime data
(X_train, X_test,
 y_train, y_test) = train_test_split(X_F1, y_F1, random_state=0)

In [32]:
# Plain linear regression estimator, created with default settings
lin = LinearRegression()

In [36]:
# Baseline: fit plain linear regression on the raw Friedman features, then report both scores
lin.fit(X_train, y_train)
baseline_test_r2 = lin.score(X_test, y_test)
baseline_train_r2 = lin.score(X_train, y_train)
print("Test Score:", baseline_test_r2)
print("Train Score:", baseline_train_r2)

Test Score: 0.7221339576925412
Train Score: 0.7223750207373035


Next, polynomial features are used to let the linear model fit a more flexible curve.

In [38]:
# Degree-2 polynomial feature expander — a preprocessing step, not a model in itself
poly = PolynomialFeatures(degree = 2)

In [50]:
# Expand the full Friedman feature matrix into degree-2 polynomial features.
# NOTE(review): the expander is fitted on all rows, before the train/test split — presumably
# harmless for PolynomialFeatures, but confirm it learns nothing data-dependent.
X_F1_poly = poly.fit_transform(X_F1)

In [51]:
# Re-split using the polynomial-expanded features; the same seed is used as for the raw split
poly_split = train_test_split(X_F1_poly, y_F1, random_state=0)
X_train, X_test, y_train, y_test = poly_split

In [52]:
# Refit the same LinearRegression object on the current (polynomial-feature) split and report both scores
lin.fit(X_train, y_train)
poly_test_r2 = lin.score(X_test, y_test)
poly_train_r2 = lin.score(X_train, y_train)
print("Test Score:", poly_test_r2)
print("Train Score:", poly_train_r2)

Test Score: 0.8046437550509982
Train Score: 0.9685996830172934


The model is overfitting: the train score (about 0.97) is much higher than the test score (about 0.80). Regularization is therefore required.

In [55]:
# Ridge regression estimator with default hyper-parameters (rebinds `rid` from the earlier cells)
rid = Ridge()

In [56]:
# Fit the regularised model on the current training split and report both scores
rid.fit(X_train, y_train)
ridge_poly_test_r2 = rid.score(X_test, y_test)
ridge_poly_train_r2 = rid.score(X_train, y_train)
print("Test Score:", ridge_poly_test_r2)
print("Train Score:", ridge_poly_train_r2)

Test Score: 0.8251115209759516
Train Score: 0.8257620613919463
