# Scikit-Learn

In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np


df = pd.read_csv("../data/Advertising.csv", index_col=0)

df.head()


Unnamed: 0,TV,Radio,Newspaper,Sales
1,230.1,37.8,69.2,22.1
2,44.5,39.3,45.1,10.4
3,17.2,45.9,69.3,9.3
4,151.5,41.3,58.5,18.5
5,180.8,10.8,58.4,12.9


In [4]:
number_of_samples, number_of_features = df.shape[0], df.shape[1]-1 #-1 because sales is label and not a feature

number_of_samples, number_of_features


(200, 3)

In [7]:
X, y = df.drop('Sales', axis='columns'), df['Sales']
X.head()

Unnamed: 0,TV,Radio,Newspaper
1,230.1,37.8,69.2
2,44.5,39.3,45.1
3,17.2,45.9,69.3
4,151.5,41.3,58.5
5,180.8,10.8,58.4


In [10]:
y.head() # Vår label. konvention inom ml community att använda X och y.

1    22.1
2    10.4
3     9.3
4    18.5
5    12.9
Name: Sales, dtype: float64

## Sklearn - steps
Detta är ett recept som funkar för flera algoritmer

Typical steps
 *   1. train|test split, sometimes train|val|test split if you have large amounts of data
 *   2. scaling? sometimes required?
 *   - min-max scaling
 *   - standardization
 *   - ...
 *   - scale the training data, scale test data to the training data --> avoiding data leakage
 *       # Kan vara skala i statisik, t ex när man transformera till z för att standardisera data
 *   3. Fit algorithm to training data
 *       # Träning, alltså weights and biases. (Kan behövas datakraft från olika cloud kluster)
 *   4. Predict test data
 *       # Man har sina parametrar sen gör man prediction
 *   5. Evaluate
    

## Train|test split

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((140, 3), (60, 3), (140,), (60,))

## Feature scaling

Normalization (min-max feautre scaling)
###  $X' = \frac{X-X_{\min}}{X_{max}-X_{min}}$

(Vi tar vårt data, subtraherar minsta värdet. Sedan tar vi dividerat på största och minsta värdet (För varje feature gör vi detta)
    Man får ut en matris)


###     Feautre standardization
*    ### $X' = \frac{X-\mu}{\sigma}$

Denna känner vi igen från Z-transformering

In [14]:
from sklearn.preprocessing import MinMaxScaler

# Skapar en scaler instance(object)
scaler = MinMaxScaler()
scaler.fit(X_train)  # Important - use this for training data # Handlar om att transformera/skala eller träna
# När man gör fit så anpassar parametrar på ett sätt

# Transformerar min data så jag kan utföra min beräkning där jag får ett värde mellan 0-1
scaled_X_train = scaler.transform(X_train) #transform är normaliseringsberäkningen  # När vi kör transform är det för vi vill ha mellan 0-1
scaled_X_test = scaler.transform(X_test)

print(f"{scaled_X_train.min()=}")
print(f"{scaled_X_train.max()=}")
print(f"{scaled_X_test.min()=}")
print(f"{scaled_X_test.max()=}")
# Note scaled_X_test.min != 0, scaled_X_test.max != 1

# 0 <= Scaled_X_train <= 1
# .005964214711729622 <= scaled_X_test <= 1.1302186878727631

# I outputen så är X_train mellan 0-1

scaled_X_train.min()=0.0
scaled_X_train.max()=1.0
scaled_X_test.min()=0.005964214711729622
scaled_X_test.max()=1.1302186878727631


In [15]:
scaled_X_test # Ger en numpy array

array([[0.54988164, 0.63709677, 0.52286282],
       [0.65843761, 0.96169355, 0.52286282],
       [0.98816368, 0.57056452, 0.42644135],
       [0.03719986, 0.74395161, 0.44632207],
       [0.74264457, 0.98790323, 0.02882704],
       [0.25160636, 0.70564516, 0.52087475],
       [0.73080825, 0.88508065, 0.26739563],
       [0.16672303, 0.23387097, 0.17992048],
       [0.74974636, 0.06854839, 0.12723658],
       [0.58978695, 0.45362903, 0.31013917],
       [0.10415962, 0.49596774, 0.01888668],
       [0.18769023, 0.11491935, 0.29224652],
       [0.79066622, 0.06854839, 0.83996024],
       [0.01589449, 0.60282258, 0.09045726],
       [0.46939466, 0.04233871, 0.26143141],
       [0.5732161 , 0.15725806, 0.34691849],
       [0.02231992, 0.56653226, 0.40854871],
       [0.66587758, 0.46975806, 0.13817097],
       [0.25228272, 0.40927419, 0.32007952],
       [0.80047345, 0.55443548, 0.10636183],
       [0.77375719, 0.65120968, 0.73459245],
       [0.22691917, 0.73790323, 1.13021869],
       [0.

In [16]:
scaled_X_test.shape, scaled_X_train.shape

((60, 3), (140, 3))

## Linear regression

In [20]:
from sklearn.linear_model import LinearRegression

model_OLS = LinearRegression()
model_OLS.fit(scaled_X_train, y_train)  
# Fit on training data 
# Trivia är att scaled är viktig för gradient descent sen.

print(f"Parameters {model_OLS.coef_}")
# beta1, beta2, beta 3
print(f"Intercept {model_OLS.intercept_}")
# Beta 0



Parameters [13.02832938  9.88465985  0.69237469]
Intercept 2.741855324852814


## Stochastic gradient descent

In [25]:
from sklearn.linear_model import SGDRegressor

model_SGD = SGDRegressor(loss="squared_error", max_iter=1000)
model_SGD.fit(scaled_X_train, y_train)

print(f"Parameters {model_SGD.coef_}")
print(f"Intercept {model_SGD.intercept_}")

Parameters [11.94828864  8.98630699  1.34761791]
Intercept [3.59039629]


## Manual prediction

In [36]:
test_sample_features = scaled_X_test[0].reshape(1, -1)  # Första dimensionen som ett, andra dimensionen ska vi beräkna
test_sample_label = y_test.values[0]
test_sample_features, test_sample_label

(array([[0.54988164, 0.63709677, 0.52286282]]), 16.9)

In [39]:
test_sample_features.shape

(1, 3)

In [38]:
model_OLS.predict(test_sample_features)[0]

16.565396297434837

In [40]:
model_SGD.predict(test_sample_features)[0]

16.590307304079793

In [30]:
X_test.iloc[0].to_numpy() # Dessa är egentligen numrerna vi har från början. Vi måste köra den genom skalaren för att de ska bli samma storlek

array([163.3,  31.6,  52.9])

## Evaluation


In [43]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# 1. predict on test data
y_pred_OLS = model_OLS.predict(scaled_X_test)
y_pred_SGD = model_SGD.predict(scaled_X_test)

print(f"{y_pred_OLS[:5]}")
print(f"{y_pred_SGD[:5]}")
# Plockar de fem första motsvarande för sgd

array([16.5653963 , 21.18822792, 21.55107058, 10.88923816, 22.20231988])

In [44]:
y_pred_SGD[:5]

array([16.5903073 , 20.80429161, 21.09920906, 11.3217202 , 21.38017749])

In [45]:
y_test[:5].values

array([16.9, 22.4, 21.4,  7.3, 24.7])

In [51]:
mae_OLS = mean_absolute_error(y_test, y_pred_OLS)
mae_SGD = mean_absolute_error(y_test, y_pred_SGD)



mse_OLS = mean_absolute_error(y_test, y_pred_OLS)
mse_SGD = mean_absolute_error(y_test, y_pred_SGD)

rmse_OLS = np.sqrt(mse_OLS)
rmse_SGD = np.sqrt(mse_SGD)

print(f"{mae_OLS=} \t\t {mse_OLS=:.4f} \t {rmse_OLS=:.4F}")
print(f"{mae_SGD=} \t\t {mse_SGD=:.4f} \t {rmse_SGD=:.4F}")


mae_OLS=1.511669222454909 		 mse_OLS=1.5117 	 rmse_OLS=1.2295
mae_SGD=1.5227972082545738 		 mse_SGD=1.5228 	 rmse_SGD=1.2340
