In [2]:
import pandas as pd
import seaborn as sns

In [3]:
# Load seaborn's "penguins" example dataset into a DataFrame (one row per penguin).
p_df = sns.load_dataset("penguins")

In [4]:
p_df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [5]:
# Distinct islands present in the data.
p_df["island"].unique()

array(['Torgersen', 'Biscoe', 'Dream'], dtype=object)

In [6]:
# Remove rows containing any missing value, then remove exact duplicate rows.
p_df = p_df.dropna().drop_duplicates()

In [7]:
# Goal: build a model that predicts a penguin's body mass (g) from its flipper length and bill depth.

X-y split

In [8]:
# Define the feature matrix X and the target y.
# y is selected with a list so it stays a one-column DataFrame rather than a Series.
feature_columns = ["flipper_length_mm", "bill_depth_mm"]
target_columns = ["body_mass_g"]
X = p_df[feature_columns]
y = p_df[target_columns]

Train-test split

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# Hold out a test set using scikit-learn's default test fraction;
# the fixed seed keeps the split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [11]:
y_test

Unnamed: 0,body_mass_g
30,3250.0
317,4875.0
79,4000.0
201,3675.0
63,4050.0
...,...
247,5650.0
122,3450.0
146,4250.0
182,3200.0


### Model training


In [12]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split.
lm = LinearRegression()
lm.fit(X_train, y_train)
model = lm  # fit() returns the estimator itself, so `model` and `lm` are the same object

In [13]:
model.coef_

array([[52.20343571, 13.10815533]])

In [46]:
model.intercept_

array([-6509.21617362])

Predict mass of random penguin

In [14]:
# Draw a single penguin from the test features. The fixed seed makes the pick
# (and the prediction shown below) repeatable on every Restart & Run All.
random_penguin = X_test.sample(n=1, random_state=42)

In [15]:
random_penguin

Unnamed: 0,flipper_length_mm,bill_depth_mm
132,193.0,18.5


In [47]:
model.predict(random_penguin)

array([[3808.54779143]])

Create predictions for test set

In [17]:
y_pred = model.predict(X_test)

In [18]:
# Wrap the prediction array in a DataFrame so it can be joined column-wise with y_test.
# No column name is given, so the single column gets the default label 0.
y_pred = pd.DataFrame(y_pred)

In [19]:
y_pred

Unnamed: 0,0
0,3001.901576
1,5271.325621
2,3920.819556
3,4053.835184
4,3752.411909
...,...
79,4911.144833
80,2901.427151
81,3653.248300
82,3470.421682


In [20]:
# y_test still carries the row labels from the original frame (30, 317, ...), while
# y_pred is indexed 0..n-1. Reset to a fresh 0..n-1 index so the two line up row by row.
y_test = y_test.reset_index(drop=True)

In [21]:
y_test

Unnamed: 0,body_mass_g
0,3250.0
1,4875.0
2,4000.0
3,3675.0
4,4050.0
...,...
79,5650.0
80,3450.0
81,4250.0
82,3200.0


In [29]:
# Place actual and predicted values side by side (rows are matched on the index).
residuals_df = pd.concat(
    [y_test, y_pred],
    axis="columns",
)


In [30]:
residuals_df

Unnamed: 0,body_mass_g,0
0,3250.0,3001.901576
1,4875.0,5271.325621
2,4000.0,3920.819556
3,3675.0,4053.835184
4,4050.0,3752.411909
...,...,...
79,5650.0,4911.144833
80,3450.0,2901.427151
81,4250.0,3653.248300
82,3200.0,3470.421682


In [33]:
# Give both columns descriptive names; the predictions column carries the default label 0.
column_labels = {"body_mass_g": "y_test", 0: "y_pred"}
residuals_df = residuals_df.rename(columns=column_labels)

In [34]:
residuals_df

Unnamed: 0,y_test,y_pred
0,3250.0,3001.901576
1,4875.0,5271.325621
2,4000.0,3920.819556
3,3675.0,4053.835184
4,4050.0,3752.411909
...,...,...
79,5650.0,4911.144833
80,3450.0,2901.427151
81,4250.0,3653.248300
82,3200.0,3470.421682


In [37]:
# Residual = actual - predicted; a positive value means the model under-predicted.
residuals_df["residuals"] = residuals_df["y_test"].sub(residuals_df["y_pred"])

In [38]:
residuals_df.head()

Unnamed: 0,y_test,y_pred,residuals
0,3250.0,3001.901576,248.098424
1,4875.0,5271.325621,-396.325621
2,4000.0,3920.819556,79.180444
3,3675.0,4053.835184,-378.835184
4,4050.0,3752.411909,297.588091


Calculating Metrics

In [39]:
# Mean of the signed residuals (actual - predicted). Positive and negative errors
# cancel here, so this measures average bias rather than typical error size.
mean_error = residuals_df["residuals"].mean()

In [40]:
mean_error

0.42567127691797524

In [48]:
from sklearn.metrics import mean_squared_error as mse, mean_absolute_error as mae

In [49]:
mse(y_test,y_pred)

135012.9173072701

In [50]:
mae(y_test,y_pred)

295.239293718512

In [51]:
# Root mean squared error, in the same units as body_mass_g.
# The square root is taken directly instead of passing `squared=False`, a keyword
# that newer scikit-learn releases deprecated and then removed.
rmse = mse(y_test, y_pred) ** 0.5

In [52]:
rmse

367.4410392257105