In [1]:
from google.cloud import storage
import pandas as pd

## Download Data

In [2]:
BUCKET = "machinelearning-hw"
FILENAME = "fish_participant.csv"
client = storage.Client()
bucket = client.get_bucket(BUCKET)
blob = bucket.blob(FILENAME)
blob.download_to_filename(FILENAME)

In [3]:
df = pd.read_csv(FILENAME)
df.head()

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width
0,Bream,430.0,26.5,29.0,34.0,12.444,5.134
1,Perch,110.0,20.0,22.0,23.5,5.5225,3.995
2,Roach,160.0,20.5,22.5,25.3,7.0334,3.8203
3,Parkki,60.0,14.3,15.5,17.4,6.5772,2.3142
4,Bream,700.0,30.4,33.0,38.3,14.8604,5.2854


## Model LinearRegression

In [4]:
from sklearn.linear_model import LinearRegression

In [5]:
y = df["Weight"].values
X = df[["Length1", "Length2", "Length3", "Height", "Width"]].values

In [6]:
lr = LinearRegression()
lr.fit(X, y)
lr.predict([[20.5, 29.0, 31.0, 10.4, 5.1]])

array([-17.17927306])

## Model Multi-layer Perceptron

In [7]:
from sklearn.neural_network import MLPRegressor

In [8]:
y = df["Weight"].values
X = df[["Length1", "Length2", "Length3", "Height", "Width"]].values

In [9]:
mlpr = MLPRegressor()
mlpr.fit(X, y)
mlpr.predict([[20.5, 29.0, 31.0, 10.4, 5.1]])



array([207.59587254])

## Split Data into Train and Test

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
y = df["Weight"].values
X = df[["Length1", "Length2", "Length3", "Height", "Width"]].values

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

#### 1st Re_Model LinearRegression

In [13]:
lr = LinearRegression()
lr.fit(X_train, y_train)
lr.predict([[20.5, 29.0, 31.0, 10.4, 5.1]])

array([141.19647059])

#### 1st Re_Model Multi-layer Perceptron

In [14]:
mlpr = MLPRegressor()
mlpr.fit(X_train, y_train)
mlpr.predict([[20.5, 29.0, 31.0, 10.4, 5.1]])



array([275.81546894])

## MSE with SplitData

In [15]:
from sklearn.metrics import mean_squared_error

#### 2nd Re_Model LinearRegression

In [16]:
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
mean_squared_error(y_test, y_pred)

10301.13282874212

#### 2nd Re_Model Multi-layer Perceptron

In [17]:
mlpr = MLPRegressor()
mlpr.fit(X_train, y_train)
y_pred = mlpr.predict(X_test)
mean_squared_error(y_test, y_pred)



94698.42792701657

## Cross_validate

In [18]:
from sklearn.model_selection import cross_validate

#### Validate LinearRegression

In [19]:
lr = LinearRegression()
cross_validate(lr, X, y, scoring="neg_mean_squared_error")["test_score"].mean()

-15855.981402209049

#### Validate Multi-layer Perceptron

In [20]:
mlpr = MLPRegressor()
cross_validate(mlpr, X, y, scoring="neg_mean_squared_error")["test_score"].mean()



-81335.96314515262

## StandardScaler and Pipeline

###### Standardize features by removing the mean and scaling to unit variance

In [21]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [22]:
y = df["Weight"].values
X = df[["Length1", "Length2", "Length3", "Height", "Width"]].values

#### Scaled LinearRegression

In [23]:
lr_scaled = make_pipeline(
    StandardScaler(),
    LinearRegression()
)
cross_validate(lr_scaled, X, y, scoring="neg_mean_squared_error")["test_score"].mean()

-15855.981402209063

#### Scaled Multi-layer Perceptron

In [24]:
mlpr_scaled = make_pipeline(
    StandardScaler(),
    MLPRegressor()
)
cross_validate(mlpr_scaled, X, y, scoring="neg_mean_squared_error")["test_score"].mean()



-255933.58489729642

## Decision Tree

In [25]:
from sklearn.tree import DecisionTreeRegressor

In [26]:
dt = DecisionTreeRegressor()
cross_validate(dt, X, y, scoring="neg_mean_squared_error")["test_score"].mean()

-11251.538660079052

## Random Forest

In [27]:
from sklearn.ensemble import RandomForestRegressor

In [28]:
rf = RandomForestRegressor()
cross_validate(rf, X, y, scoring="neg_mean_squared_error")["test_score"].mean()

-6770.035036628457

#### PCA

In [29]:
from sklearn.decomposition import PCA

In [30]:
pca_regression = make_pipeline(
    PCA(3),
    LinearRegression()
)
cross_validate(pca_regression, X, y, scoring="neg_mean_squared_error")["test_score"].mean()

-15921.73021267204

## Lasso Regression

In [31]:
from sklearn.linear_model import LassoCV

In [32]:
lasso =LassoCV()
cross_validate(lasso, X, y, scoring="neg_mean_squared_error")["test_score"].mean()

  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive

-18433.042749348726

### Scaled Lasso Regression

In [33]:
lasso_scaled = make_pipeline(
    StandardScaler(),
    LassoCV()
)
cross_validate(lasso_scaled, X, y, scoring="neg_mean_squared_error")["test_score"].mean()

  positive)
  positive)
  positive)
  positive)


-16007.066690301543

## Null Model

###### To see what the worst model/situation is

In [34]:
from sklearn.dummy import DummyRegressor

In [35]:
dr = DummyRegressor()
cross_validate(dr, X, y, scoring="neg_mean_squared_error")["test_score"].mean()

-114060.48235442504

## Retrain Model with All Data

In [36]:
from joblib import dump, load

In [37]:
# Random Forest with Cross_validation
rf = RandomForestRegressor()
rf.fit(X,y)
dump(rf, "rfmodel.joblib")
rf.predict([[18.7, 20.35, 22.8, 8.9928, 3.3928]])

array([131.05])

In [38]:
# Scaled LinearRegression with Cross_validation
lr_scaled = make_pipeline(
    StandardScaler(),
    LinearRegression()
)
lr_scaled.fit(X,y)
dump(lr_scaled, "scaledlrmodel.joblib")
rf.predict([[26.5, 29.0, 34.0, 12.44, 5.134]])

array([417.29])

In [39]:
# Scaled Lasso with Cross_validation
lasso_scaled = make_pipeline(
    StandardScaler(),
    LassoCV()
)
lasso_scaled.fit(X,y)
dump(lasso_scaled, "Scaledlassomodel.joblib")
lasso_scaled.predict([[26.5, 29.0, 34.0, 12.44, 5.134]])

  positive)


array([464.94926112])

In [40]:
# Lasso with Cross_validation
lasso =LassoCV()
lasso.fit(X,y)
dump(lasso, "lassomodel.joblib")
lasso.predict([[26.5, 29.0, 34.0, 12.44, 5.134]])

  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  positive)


array([462.04633897])

In [41]:
# PCA 
pca_regression = make_pipeline(
    PCA(3),
    LinearRegression()
)
pca_regression.fit(X,y)
dump(pca_regression, "pcamodel.joblib")
pca_regression.predict([[26.5, 29.0, 34.0, 12.44, 5.134]])

array([464.98142411])

In [42]:
# Decision Tree
dt = DecisionTreeRegressor()
dt.fit(X,y)
dump(dt, "dtmodel.joblib")
dt.predict([[18.7, 20.35, 22.8, 8.9928, 3.3928]])

array([150.])

In [43]:
# Scaled mlpr
mlpr_scaled = make_pipeline(
    StandardScaler(),
    MLPRegressor()
)
mlpr_scaled.fit(X,y)
dump(mlpr_scaled, "scaledmlprmodel.joblib")
mlpr_scaled.predict([[26.5, 29.0, 34.0, 12.44, 5.134]])



array([11.44157841])

In [44]:
# mlpr
mlpr = MLPRegressor()
mlpr.fit(X,y)
dump(mlpr, "mlprmodel.joblib")
mlpr.predict([[26.5, 29.0, 34.0, 12.44, 5.134]])



array([289.23739379])

In [45]:
# LR
lr = LinearRegression()
lr.fit(X,y)
dump(lr, "lrmodel.joblib")
lr.predict([[26.5, 29.0, 34.0, 12.44, 5.134]])

array([445.37319157])