In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read Advertising.csv into a DataFrame.
df = pd.read_csv('UNZIP_FOR_NOTEBOOKS_FINAL/DATA/Advertising.csv')

In [3]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


## Train | Test Split Procedure 

0. Clean and adjust data as necessary for X and y
1. Split Data in Train/Test for both X and y
2. Fit/Train Scaler on Training X Data
3. Scale X Train and X Test Data (using the scaler fit on the training data)
4. Create Model
5. Fit/Train Model on X Train Data
6. Evaluate Model on X Test Data (by creating predictions and comparing to y_test)
7. Adjust Parameters as Necessary and repeat steps 5 and 6

In [4]:
# Target is the `sales` column; the features are every remaining column.
y = df['sales']
x = df.drop(columns='sales')

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 30% of the rows as the test set (seeded so the split is repeatable).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=101
)

# Fit the scaler on the training features only, then apply that same fitted
# scaler to both splits.
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)



In [6]:
from sklearn.linear_model import Ridge

# Ridge regression with alpha=1, trained on the scaled training split.
model = Ridge(alpha=1).fit(x_train, y_train)

# Predictions for the held-out test features.
y_pred = model.predict(x_test)

In [7]:
from sklearn.metrics import mean_squared_error

# scikit-learn metrics take (y_true, y_pred) in that order. MSE is symmetric,
# so the value is unchanged, but the conventional order keeps this call
# correct if it is later swapped for an asymmetric metric.
mean_squared_error(y_test, y_pred)

2.319021579428752

## Train | Validation | Test Split Procedure 

The final test set is often also called a "hold-out" set, since you should not adjust parameters based on it, but instead use it *only* for reporting final expected performance.

0. Clean and adjust data as necessary for X and y
1. Split Data in Train/Validation/Test for both X and y
2. Fit/Train Scaler on Training X Data
3. Scale X Validation and X Test Data (using the scaler fit on the training data)
4. Create Model
5. Fit/Train Model on X Train Data
6. Evaluate Model on X Validation Data (by creating predictions and comparing to y_val)
7. Adjust Parameters as Necessary and repeat steps 5 and 6
8. Get final metrics on Test set (not allowed to go back and adjust after this!)

In [8]:
# Re-create the feature matrix and the target from the full DataFrame.
x = df.drop(labels='sales', axis='columns')
y = df['sales']

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# First split: 70% for training, 30% set aside.
x_train, x_oth, y_train, y_oth = train_test_split(
    x, y, test_size=0.3, random_state=101
)

# Second split: divide the set-aside portion in half -> validation and test.
x_val, x_test, y_val, y_test = train_test_split(
    x_oth, y_oth, test_size=0.5, random_state=101
)

In [11]:
from sklearn.preprocessing import StandardScaler

In [12]:
# Fit the scaler on the training features only, then reuse that one fitted
# scaler for all three splits.
scaler = StandardScaler().fit(x_train)
x_train, x_val, x_test = [
    scaler.transform(split) for split in (x_train, x_val, x_test)
]

In [13]:
from sklearn.linear_model import Ridge

In [14]:
# Train Ridge (alpha=1) on the scaled training split.
model = Ridge(alpha=1)
model.fit(X=x_train, y=y_train)

# Predict on the validation split; the test split is not touched here.
y_pred_val = model.predict(X=x_val)

In [15]:
from sklearn.metrics import mean_squared_error

# Validation-set MSE. Arguments follow scikit-learn's (y_true, y_pred) order;
# MSE is symmetric, so the printed value is the same as with the swapped order.
validation_error = mean_squared_error(y_val, y_pred_val)
print(validation_error)

2.383783075056986


In [16]:
# Final evaluation on the held-out test split.
final_predict = model.predict(x_test)
# (y_true, y_pred) order, matching scikit-learn's metric signature.
final_test_error = mean_squared_error(y_test, final_predict)
print(final_test_error)

2.2542600838005176


## Cross Validation with cross_val_score

In [17]:
# Reload the data, then separate the target (`sales`) from the features.
df = pd.read_csv('UNZIP_FOR_NOTEBOOKS_FINAL/DATA/Advertising.csv')
y = df['sales']
x = df.drop(columns=['sales'])

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 70/30 train/test split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=101
)

# Fit the scaler on the training features (and scale them in the same call),
# then apply the fitted scaler to the test features.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [None]:
# mean_squared_error is used for the final test-set check at the end of this
# section; importing it here keeps the section runnable on its own.
# (This cell previously held a dangling `from`, which is a SyntaxError.)
from sklearn.metrics import mean_squared_error

In [34]:
from sklearn.linear_model import Ridge

# Unfitted Ridge estimator (alpha=1) to hand to cross-validation.
model = Ridge(alpha=1)

In [35]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation on the training split, scored with negated MSE.
scores = cross_val_score(
    estimator=model,
    X=x_train,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=5,
)
scores

array([-3.15513238, -1.58086982, -5.40455562, -2.21654481, -4.36709384])

In [36]:
# Average the per-fold scores and drop the sign to report a positive MSE.
abs(np.mean(scores))

3.344839296530695

In [37]:
# The model must be fit again here: the cross_val_score call above only
# served as an evaluation step, so train on the full training split before
# predicting on the test set.
model.fit(x_train, y_train)
y_pred_final = model.predict(x_test)
# (y_true, y_pred) order, matching scikit-learn's metric signature; MSE is
# symmetric, so the printed value is unchanged.
print(mean_squared_error(y_test, y_pred_final))

2.319021579428752


## Cross Validation with cross_validate

In [38]:
# Features and target
y = df['sales']
X = df.drop(columns='sales')

# 70/30 train/test split (fixed seed)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=101
)

# Standardize using a scaler fit on the training features only
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [39]:
from sklearn.model_selection import cross_validate

# 10-fold cross-validation on the training split, collecting two metrics in
# one pass. The features passed in are `X_train` (capital X) from this
# section's split, not the lowercase `x_train` left over from the previous
# section, so the cell no longer depends on stale state.
model = Ridge(alpha=1)
scores = cross_validate(
    model,
    X_train,
    y_train,
    scoring=['neg_mean_squared_error', 'neg_mean_absolute_error'],
    cv=10,
)

In [41]:
# Put the cross_validate results into a DataFrame for display.
scores = pd.DataFrame(data=scores)
scores

Unnamed: 0,fit_time,score_time,test_neg_mean_squared_error,test_neg_mean_absolute_error
0,0.003262,0.002574,-2.962508,-1.457174
1,0.00231,0.003234,-3.057378,-1.555308
2,0.002923,0.002518,-2.17374,-1.23877
3,0.002203,0.003171,-0.833034,-0.768938
4,0.001963,0.002003,-3.464018,-1.434489
5,0.002069,0.003001,-8.232647,-1.494316
6,0.002503,0.002187,-1.905864,-1.081362
7,0.001001,0.00268,-2.765048,-1.250011
8,0.0,0.0,-4.989505,-1.580971
9,0.005513,0.002175,-2.846438,-1.223326


In [42]:
# Column-wise average of the results table.
scores.mean(axis=0)

fit_time                        0.002375
score_time                      0.002354
test_neg_mean_squared_error    -3.323018
test_neg_mean_absolute_error   -1.308467
dtype: float64

In [43]:
# Need to fit the model first! Fit on `X_train` so that training and
# prediction both use this section's split (previously the fit used the
# lowercase `x_train` from an earlier section while predicting on `X_test`).
model.fit(X_train, y_train)
y_final_test_pred = model.predict(X_test)
# (y_true, y_pred) order, matching scikit-learn's metric signature.
print(mean_squared_error(y_test, y_final_test_pred))

2.319021579428752
