# Lab | Comparing regression models

For this lab, we will be using the same dataset we used in the previous labs. We recommend using the same notebook since you will be reusing the same variables you previous created and used in labs.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('we_fn_use_c_marketing_customer_value_analysis.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9134 entries, 0 to 9133
Data columns (total 24 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Customer                       9134 non-null   object 
 1   State                          9134 non-null   object 
 2   Customer Lifetime Value        9134 non-null   float64
 3   Response                       9134 non-null   object 
 4   Coverage                       9134 non-null   object 
 5   Education                      9134 non-null   object 
 6   Effective To Date              9134 non-null   object 
 7   EmploymentStatus               9134 non-null   object 
 8   Gender                         9134 non-null   object 
 9   Income                         9134 non-null   int64  
 10  Location Code                  9134 non-null   object 
 11  Marital Status                 9134 non-null   object 
 12  Monthly Premium Auto           9134 non-null   i

In [3]:
data = data.set_index('Customer')

### 1. In this final lab, we will model our data. Import sklearn train_test_split and separate the data.


In [4]:
y = data['Total Claim Amount']
X = data.drop(['Total Claim Amount'],axis=1)

In [5]:
numericals = X.select_dtypes(np.number)

In [6]:
transformer = StandardScaler().fit(numericals)
x_standardized = transformer.transform(numericals)

In [7]:
categoricals = X.select_dtypes(exclude=np.number)

In [8]:
encoder = OneHotEncoder(handle_unknown='error', drop='first').fit(categoricals)
encoded = encoder.transform(categoricals).toarray()

In [9]:
X = np.concatenate((x_standardized, encoded), axis=1)

array([[-0.76287773,  0.61282744, -0.70392465, ...,  1.        ,
         1.        ,  0.        ],
       [-0.14924546, -1.23961684,  0.02269103, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.71063622,  0.36570978,  0.42959581, ...,  1.        ,
         1.        ,  0.        ],
       ...,
       [ 0.02313483, -1.23961684, -0.23889062, ...,  0.        ,
         1.        ,  0.        ],
       [-0.06993547, -0.51735647,  0.08082028, ...,  0.        ,
         0.        ,  0.        ],
       [-0.78495478, -1.23961684, -0.47140763, ...,  1.        ,
         1.        ,  0.        ]])

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

### 2. Try a simple linear regression with all the data to see whether we are getting good results.

In [11]:
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression()

In [12]:
predictions = model.predict(X_test)

In [13]:
r2_score(y_test, predictions), mean_absolute_error(y_test, predictions), mean_squared_error(y_test, predictions, squared=False)

(0.7675811515892831, 96.47905444399554, 136.06126801796373)

### 3. Great! Now define a function that takes a list of models and train (and tests) them so we can try a lot of them without repeating code.

In [12]:
def regression_model ():
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

    if LR:
        model = LinearRegression()
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        return (r2_score(y_test, predictions))

    if KNN:
        model = KNeighborsRegressor(n_neighbors=4)
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        return model.score(X_test, y_test)

    if MLP:
        model = MLPRegressor()
        model.fit(X_train, y_train)
        expected_y  = y_test
        predicted_y = model.predict(X_test)
        return metrics.r2_score(expected_y, predicted_y) 


In [None]:
#STUCK ..functions still major weak spot for me

In [14]:
regression_model(LR)

TypeError: regression_model() takes 0 positional arguments but 1 was given

### 4. Use the function to check LinearRegressor and KNeighborsRegressor.

### 5. You can check also the MLPRegressor for this task!

### 6. Check and discuss the results.