In [None]:
import pandas as pd
import seaborn as sns

## Explore the data

Suppose we want to predict `acceleration` from the other numerical characteristics of a car.

In [None]:
# Load the auto-mpg dataset; the relative path is resolved against the
# notebook's working directory.
df = pd.read_csv("../datasets/auto-mpg.csv")

In [None]:
df.shape

In [None]:
# Remove the "car name" column; every remaining column is kept for modelling.
df = df.drop(columns=["car name"])

In [None]:
df.head()

In [None]:
df.dtypes

Horsepower seems not to be a numeric column. What is going on?

In [None]:
# Print every horsepower value that cannot be converted to an integer.
# Only conversion failures are caught: a bare `except:` would also swallow
# KeyboardInterrupt and unrelated mistakes (e.g. a typo inside the loop body).
for hp in df.horsepower:
    try:
        int(hp)
    except (TypeError, ValueError, OverflowError):
        print(hp)

In [None]:
# Drop the rows whose horsepower is the "?" placeholder.
# .copy() makes the result an independent frame, so assigning to its columns
# afterwards does not raise pandas' SettingWithCopyWarning.
df = df[df.horsepower != "?"].copy()

In [None]:
df.shape

In [None]:
# Convert horsepower to float.
# assign() builds a new frame instead of setting the column through attribute
# access on a filtered frame, which can trigger SettingWithCopyWarning.
# The column keeps its original position.
df = df.assign(horsepower=df.horsepower.astype(float))

In [None]:
df.dtypes

In [None]:
df.shape

In [None]:
df.head()

## Build model with whole dataset

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
lr = LinearRegression()

In [None]:
# Target is the acceleration column; features are all the other columns.
y = df["acceleration"]
X = df.drop(columns=["acceleration"])

In [None]:
X[:5]

In [None]:
y[:5]

In [None]:
lr.fit(X=X, y=y)

In [None]:
y[:1]

In [None]:
lr.predict(X[:1])

In [None]:
y[:5]

In [None]:
lr.predict(X[:5]).round(1)

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
mean_squared_error(y_true=y, y_pred=lr.predict(X))

In [None]:
# score() on a LinearRegression returns R^2 (the coefficient of determination).
# Here it is computed on the same data the model was fitted on.
lr.score(X, y)

In [None]:
# Actual vs. predicted acceleration (predictions rounded to 1 decimal), first rows only.
(
    pd.DataFrame({"real": y})
    .assign(pred=lr.predict(X).round(1))
    .head()
)

In [None]:
# Predicted vs. actual acceleration on the data the model was fitted on.
# Both axes are labeled so the figure can be read on its own: the predictions
# are a plain array, so seaborn would otherwise leave the y axis unlabeled.
ax = sns.scatterplot(x=y, y=lr.predict(X))
ax.set(
    xlabel="actual acceleration",
    ylabel="predicted acceleration",
    title="Whole dataset",
);

## Train-test split procedure

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
df.shape

In [None]:
# Hold out 20% of the rows as a test set (test_size=0.2); the fixed
# random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11)

In [None]:
X_train.shape

In [None]:
X_test.shape

In [None]:
lr2 = LinearRegression()

In [None]:
lr2.fit(X_train, y_train)

In [None]:
lr2.score(X_train, y_train)

In [None]:
# Predicted vs. actual acceleration on the training split.
# Both axes are labeled so the figure can be read on its own: the predictions
# are a plain array, so seaborn would otherwise leave the y axis unlabeled.
ax = sns.scatterplot(x=y_train, y=lr2.predict(X_train))
ax.set(
    xlabel="actual acceleration",
    ylabel="predicted acceleration",
    title="Training set",
);

In [None]:
X_test.shape

In [None]:
X_test.head()

In [None]:
lr2.predict(X_test.head()).round(1)

In [None]:
y_test[:5]

In [None]:
lr2.score(X_test, y_test)

This R² score (a goodness-of-fit measure, not an error) is our estimate of how well the model will perform on OTHER, UNSEEN cars.

In [None]:
# Predicted vs. actual acceleration on the held-out test split.
# Both axes are labeled so the figure can be read on its own: the predictions
# are a plain array, so seaborn would otherwise leave the y axis unlabeled.
ax = sns.scatterplot(x=y_test, y=lr2.predict(X_test))
ax.set(
    xlabel="actual acceleration",
    ylabel="predicted acceleration",
    title="Test set",
);

EXTRA: use CROSS-VALIDATION to evaluate the model over several different train-test splits.