In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler


fmt = '{:<13} {:}'
DIRECTORY = os.getcwd()

for root, dirs, files in os.walk(DIRECTORY):
    for file in files:
        if file.endswith(".csv"):
            csv = file

df = pd.read_csv(csv)
df.head(5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [13]:
import matplotlib.pyplot as plt
acceleration = df['acceleration']
weight = df['weight']
mpg = df['mpg']
model_year = df['model year']

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

train_set, test_set = train_test_split(df, 
                        test_size=0.2, random_state=123)
print('Train size: ', len(train_set), 'Test size: ', len(test_set))

X = train_set[['acceleration', 'weight']]
y = train_set['weight']

X_test = test_set[['acceleration', 'weight']]
y_test = test_set['weight']

lr_model = LinearRegression()
lr_model.fit(X,y)

y_pred = lr_model.predict(X)
print('Results for linear regression on training data')
print('  Default settings')
print('Internal parameters:')
print('   Bias is ', lr_model.intercept_)
print('   Coefficients', lr_model.coef_)
print('   Score', lr_model.score(X,y))

print('MAE is  ', mean_absolute_error(y, y_pred))
print('RMSE is ', np.sqrt(mean_squared_error(y, y_pred)))
print('MSE is ', mean_squared_error(y, y_pred))
print('R^2    ', r2_score(y,y_pred))


y_test_pred = lr_model.predict(X_test)
print()
print('Results for linear regression on test data')

print('MAE is  ', mean_absolute_error(y_test, y_test_pred))
print('RMSE is ', np.sqrt(mean_squared_error(y_test, y_test_pred)))
print('MSE is ', mean_squared_error(y_test, y_test_pred))
print('R^2    ', r2_score(y_test,y_test_pred))

Train size:  318 Test size:  80
Results for linear regression on training data
  Default settings
Internal parameters:
   Bias is  0.0
   Coefficients [-2.96116483e-14  1.00000000e+00]
   Score 1.0
MAE is   2.5382910308914273e-13
RMSE is  4.172740312397479e-13
MSE is  1.7411761714707008e-25
R^2     1.0

Results for linear regression on test data
MAE is   2.1884716261411086e-13
RMSE is  3.6574665369119914e-13
MSE is  1.3377061468630994e-25
R^2     1.0


In [4]:
X = train_set[['height']]
y = train_set['weight']

X_test = test_set[['height']]
y_test = test_set['weight']

imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
poly4 = PolynomialFeatures(degree=4, include_bias=False)
scale = StandardScaler()

lr_model = LinearRegression()

stages = [
    ('imp_mean', imp_mean),
    ('poly4', poly4),
    ('scale', scale),
    ('lr_model', lr_model),
]
pipe_model = Pipeline(stages)

pipe_model.fit(X,y)

y_pred = pipe_model.predict(X)
print('Results for pipeline linear regression on training data')
#print(' Default settings')
#print('Internal parameters:')
print(' Bias is ', pipe_model.predict([[0]]))
#print(' Coefficients', pipe_model.coef_)
print(' Score', pipe_model.score(X,y))
print('MAE is  ', mean_absolute_error(y, y_pred))
print('RMSE is ', np.sqrt(mean_squared_error(y, y_pred)))
print('MSE is ', mean_squared_error(y, y_pred))
print('R^2    ', r2_score(y,y_pred))

y_test_pred = pipe_model.predict(X_test)
print()
print('Results for pipeline linear regression on test data')
print('MAE is  ', mean_absolute_error(y_test, y_test_pred))
print('RMSE is ', np.sqrt(mean_squared_error(y_test, y_test_pred)))
print('MSE is ', mean_squared_error(y_test, y_test_pred))
print('R^2    ', r2_score(y_test,y_test_pred))


KeyError: "None of [Index(['height'], dtype='object')] are in the [columns]"

In [None]:
y_pred = pipe_model.predict(X)
print('\nResults for pipeline linear regression on training data:')
print(fmt.format('MAE:', round(mean_absolute_error(y, y_pred), 2)))
print(fmt.format('RMSE:', round(np.sqrt(mean_squared_error(y, y_pred)),2)))
print(fmt.format('MSE:', round(mean_squared_error(y, y_pred), 2)))
print(fmt.format('R^2:', round((r2_score(y,y_pred)*100), 2)))

y_test_pred = pipe_model.predict(X_test)
print('\nResults for pipeline linear regression on test data:')
print(fmt.format('MAE:', round(mean_absolute_error(y_test, y_test_pred), 2)))
print(fmt.format('RMSE:', round(np.sqrt(mean_squared_error(y_test, y_test_pred)),2)))
print(fmt.format('MSE:', round(mean_squared_error(y_test, y_test_pred), 2)))
print(fmt.format('R^2:', round((r2_score(y_test,y_test_pred)*100), 2)))