In [1]:
import pandas as pd

In [11]:
# Load the auto-mpg CSV; the path is relative to the notebook's working directory.
df = pd.read_csv("../datasets/auto-mpg.csv")

In [12]:
df.shape

(398, 9)

Suppose we want to predict `acceleration` given the other numerical characteristics of a car

In [17]:
# Drop the "car name" column; the model below only uses the numerical columns.
# errors="ignore" keeps this cell idempotent: re-running it once the column is
# already gone would otherwise raise a KeyError.
df = df.drop(columns="car name", errors="ignore")

In [18]:
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
0,18.0,8,307.0,130,3504,12.0,70,1
1,15.0,8,350.0,165,3693,11.5,70,1
2,18.0,8,318.0,150,3436,11.0,70,1
3,16.0,8,304.0,150,3433,12.0,70,1
4,17.0,8,302.0,140,3449,10.5,70,1


Reminder: it is better to ask for forgiveness than for permission (EAFP).

In [39]:
def to_mayus(palabra):
    """Return ``palabra`` converted to upper case.

    No type checking is done: the argument's own ``upper()`` method is
    called, so an object without one (e.g. an ``int``) raises
    ``AttributeError``.
    """
    en_mayusculas = palabra.upper()
    return en_mayusculas

In [40]:
to_mayus("hola")

'HOLA'

In [41]:
# An int has no .upper() method, so this call fails. The error is the point of
# the demonstration, so catch it and display the message instead of letting the
# exception stop a "Restart & Run All".
try:
    to_mayus(4)
except AttributeError as err:
    print(f"{type(err).__name__}: {err}")

AttributeError: 'int' object has no attribute 'upper'

In [42]:
# ask permission (LBYL): check the type before calling the method
def to_mayus(palabra):
    """Upper-case ``palabra`` when it is a ``str``; otherwise return ``None``."""
    return palabra.upper() if isinstance(palabra, str) else None

In [46]:
# Same result as the "ask permission" version for the inputs tried here.
# ask forgiveness (EAFP): just try the call and handle the failure
def to_mayus(palabra):
    """Return ``palabra.upper()``, or ``None`` if ``palabra`` has no ``upper`` method.

    Only ``AttributeError`` is caught. A bare ``except:`` would also swallow
    ``KeyboardInterrupt``, ``SystemExit`` and unrelated bugs, silently turning
    them into ``None``.
    """
    try:
        return palabra.upper()
    except AttributeError:
        return None

In [47]:
to_mayus("hola")

'HOLA'

In [48]:
# The int has no .upper(), so the function returns None and the cell shows no output.
to_mayus(4)

In [49]:
# Print every horsepower entry that cannot be parsed as an integer.
# int() signals an unparseable value with ValueError (e.g. the string "?") or
# TypeError (e.g. None); catching only those avoids masking unrelated errors
# the way a bare `except:` would.
for hp in df.horsepower:
    try:
        int(hp)
    except (ValueError, TypeError):
        print(hp)

?
?
?
?
?
?


In [54]:
# Keep only the rows whose horsepower is not the "?" placeholder
df = df.loc[df["horsepower"].ne("?")]

In [55]:
df.shape

(392, 8)

In [56]:
# Convert horsepower from strings to floats. `df` is a filtered selection of
# the original frame at this point, and assigning a column on it in place can
# trigger pandas' SettingWithCopyWarning; assign() returns a new frame instead
# (the column keeps its position).
df = df.assign(horsepower=df.horsepower.astype(float))

In [57]:
df.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower      float64
weight            int64
acceleration    float64
model year        int64
origin            int64
dtype: object

In [58]:
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
0,18.0,8,307.0,130.0,3504,12.0,70,1
1,15.0,8,350.0,165.0,3693,11.5,70,1
2,18.0,8,318.0,150.0,3436,11.0,70,1
3,16.0,8,304.0,150.0,3433,12.0,70,1
4,17.0,8,302.0,140.0,3449,10.5,70,1


In [15]:
from sklearn.linear_model import LinearRegression

In [16]:
lr = LinearRegression()

In [73]:
# Features: every remaining column except the one being predicted
predictors = df.drop(columns="acceleration")
# Target: the acceleration column
target = df["acceleration"]

In [74]:
predictors[:5]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,model year,origin
0,18.0,8,307.0,130.0,3504,70,1
1,15.0,8,350.0,165.0,3693,70,1
2,18.0,8,318.0,150.0,3436,70,1
3,16.0,8,304.0,150.0,3433,70,1
4,17.0,8,302.0,140.0,3449,70,1


In [75]:
target[:5]

0    12.0
1    11.5
2    11.0
3    12.0
4    10.5
Name: acceleration, dtype: float64

In [76]:
lr.fit(X=predictors, y=target)

LinearRegression()

In [90]:
predictors[:1]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,model year,origin
0,18.0,8,307.0,130.0,3504,70,1


In [93]:
lr.predict(predictors[:1])

array([14.00670318])

In [91]:
lr.predict(predictors[:5])

array([14.00670318, 11.21709765, 11.9784462 , 12.04025895, 12.98764533])

In [94]:
# r2 (coefficient of determination) is the default score for LinearRegression.
# Note: this is computed on the same data the model was fitted on.
lr.score(predictors, target)

0.619822478208412

In [95]:
from sklearn.metrics import mean_squared_error

In [97]:
# Mean squared error of the model's predictions on the same rows it was fitted on
mean_squared_error(y_true=target, y_pred=lr.predict(predictors))

2.8862752656774058

In [98]:
from sklearn.model_selection import train_test_split

In [99]:
df.shape

(392, 8)

In [101]:
# Hold out 20% of the rows for testing. random_state fixes the shuffle so the
# split (and every score computed from it) is the same on each run; without it
# the numbers change every time the notebook is re-executed.
X_train, X_test, y_train, y_test = train_test_split(
    predictors, target, test_size=0.2, random_state=42
)

In [102]:
X_train.shape

(313, 7)

In [104]:
X_test.shape

(79, 7)

In [107]:
lr2 = LinearRegression()

In [108]:
lr2.fit(X_train, y_train)

LinearRegression()

In [109]:
lr2.score(X_train, y_train)

0.6281968961067281

In [111]:
X_test.shape

(79, 7)

In [110]:
lr2.predict(X_test)

array([15.55660622, 14.05697944, 13.5995442 , 16.49621584, 13.07400523,
       17.04563036, 16.44262571, 15.69479251, 16.46906091, 17.2692634 ,
       17.92809981, 17.13726649, 18.16796471, 16.63747376, 15.27882328,
       14.92780932, 18.15612037, 17.83138113, 15.98025108, 16.2747384 ,
       17.31873432,  9.56892557, 14.48570371, 15.34151368, 15.96703829,
       16.40219959, 16.15883062, 16.61089109, 13.9743187 , 15.52457688,
       14.00271577, 14.86887391, 13.82957565, 12.08205617, 17.51831434,
       16.60626929, 17.23755839,  8.64160566, 16.9642764 , 17.04056377,
       13.84813335, 18.01832142, 15.24595969, 17.16629535, 13.1008856 ,
       16.28070346, 17.0817729 , 16.74662227, 17.49404663, 15.93242758,
       16.64742524, 16.85136424, 16.83725368, 12.39197617, 15.6572548 ,
       14.80787326, 17.34545661, 16.83480319, 16.4881528 , 16.94275041,
       14.15949636, 13.6920892 , 15.62403913, 14.34765545, 13.40403129,
       16.94736631, 15.01717401, 16.42905102, 13.94519   , 12.29

In [113]:
y_test[:5]

241    14.5
0      12.0
221    12.5
321    15.2
4      10.5
Name: acceleration, dtype: float64

In [114]:
lr2.score(X_test, y_test)

0.5607210405224281

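This R² score estimates how well the model will perform when tested on OTHER UNSEEN cars. Note that it is a goodness-of-fit score (higher is better), not an error; as expected, it is lower than the score obtained on the training data.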