<a href="https://colab.research.google.com/github/Neoneto/CodingDojo_Week5/blob/main/Abalone_Linear_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# Abalone Linear Regression Exercise (Core)
Submitted by Kenneth Alaba

## Pre-requisites

In [1]:
# import libraries
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler


In [2]:
# Load the data (this cell only works inside Google Colab)

## Make Google Drive available under /content/drive
from google.colab import drive
drive.mount('/content/drive')

## Path of the data file on the mounted Drive
filename = '/content/drive/My Drive/Coding Dojo/05 Week 5: Intro to Machine Learning/abalone.data'

## Read the file into df; header=None means the first line is treated as data
## and the columns receive integer labels instead of names
df = pd.read_csv(filename, header=None)

# Preview the first few rows
df.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,0,1,2,3,4,5,6,7,8
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [3]:
# Give the integer-labelled columns descriptive names.
# enumerate() pairs each name with its positional label (0..8), producing the
# same {old_label: new_name} mapping that rename() expects.
df.rename(
    columns=dict(enumerate([
        'sex',
        'length',
        'diameter',
        'height',
        'whole_weight',
        'shucked_weight',
        'viscera_weight',
        'shell_weight',
        'rings',
    ])),
    inplace=True,
)

df.head()

Unnamed: 0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [4]:
# Print each column's dtype and non-null count to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   sex             4177 non-null   object 
 1   length          4177 non-null   float64
 2   diameter        4177 non-null   float64
 3   height          4177 non-null   float64
 4   whole_weight    4177 non-null   float64
 5   shucked_weight  4177 non-null   float64
 6   viscera_weight  4177 non-null   float64
 7   shell_weight    4177 non-null   float64
 8   rings           4177 non-null   int64  
dtypes: float64(7), int64(1), object(1)
memory usage: 293.8+ KB


There are no missing values. Only the `sex` column is nominal (categorical); all other columns are numerical.

## Train Test Split

In [5]:
# `rings` is the target; every remaining column is used as a feature
y = df['rings']
X = df.drop(columns='rings')

In [6]:
# Preview the first rows of the target series.
y.head()

0    15
1     7
2     9
3    10
4     7
Name: rings, dtype: int64

In [7]:
# Hold out a test set; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

## Create a Pipeline

In [8]:
# Create a pipeline: column-wise preprocessing followed by linear regression


## Column transformation

### Select numerical and categorical columns by dtype
num_selector = make_column_selector(dtype_include='number')
cat_selector = make_column_selector(dtype_include='object')

### Instantiate transformers
mean_imputer = SimpleImputer(strategy='mean')
freq_imputer = SimpleImputer(strategy='most_frequent')
scaler = StandardScaler()

# OneHotEncoder's `sparse` keyword was renamed to `sparse_output` in
# scikit-learn 1.2 and removed in 1.4, so `sparse=False` raises TypeError on
# current versions. Prefer the new keyword and fall back to the old one only
# when the installed version does not know it.
try:
    ohe_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    ohe_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

### Make a pipeline for each column type
# numerical: mean-impute, then standardize
num_pipe = make_pipeline(mean_imputer, scaler)
# categorical: impute with the most frequent value, then one-hot encode
cat_pipe = make_pipeline(freq_imputer, ohe_encoder)

### Match each pipeline to the columns it applies to
num_tuple = (num_pipe, num_selector)
cat_tuple = (cat_pipe, cat_selector)

### Make the column transformer
column_transformer = make_column_transformer(num_tuple, cat_tuple)


## Linear regression

# instantiate the linear regression model
lin_reg = LinearRegression()


# instantiate the full pipeline: 1) column transformations, 2) linear regression model
pipe = make_pipeline(column_transformer, lin_reg)


# Pipeline adapted from http://learn.codingdojo.com/m/213/7153/67078

## Fit the pipeline

In [9]:
# Fit the preprocessing steps and the regression model on the training split.
pipe.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('pipeline-1',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer()),
                                                                  ('standardscaler',
                                                                   StandardScaler())]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7f2b8a9516d0>),
                                                 ('pipeline-2',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehotencoder',
                                                                   OneHotEncode

## Evaluate the model

In [10]:
# R^2 of the fitted pipeline on the training set
r2_train = r2_score(
    y_true=y_train,
    y_pred=pipe.predict(X_train),
)
print(f'Train R^2 value: {r2_train:.3f}')

Train R^2 value: 0.534


In [11]:
# R^2 of the fitted pipeline on the held-out test set
r2_test = r2_score(
    y_true=y_test,
    y_pred=pipe.predict(X_test),
)
print(f'Test R^2 value: {r2_test:.3f}')

Test R^2 value: 0.545
