In [34]:
import os

import numpy as np
import pandas as pd
from pandas import read_csv
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from pandas.plotting import scatter_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [7]:
# Column names for the Auto MPG CSV, in file order; passed to read_csv via
# `names=` when the data is loaded.
autompg_headers = [
    'mpg',
    'cylinders',
    'displacement',
    'horsepower',
    'weight',
    'acceleration',
    'model_year',
    'origin',
    'car_name',
]

In [10]:
# Location of the raw Auto MPG CSV. The original absolute path stays the
# default so existing setups keep working; set the AUTOMPG_CSV_PATH
# environment variable to point the notebook at a copy elsewhere.
AUTOMPG_CSV_PATH = os.environ.get('AUTOMPG_CSV_PATH', 'C:/dataset/auto_mpg_data.csv')

# The column names are supplied explicitly through `names=`.
autompg_data = read_csv(AUTOMPG_CSV_PATH, names=autompg_headers)

In [11]:
# Rows of the freshly loaded frame that contain at least one NaN cell.
rows_with_nan_raw = autompg_data.isna().any(axis=1)
check_missing_data = autompg_data[rows_with_nan_raw]

In [12]:
# Force horsepower to a numeric column; errors='coerce' turns any entry that
# cannot be parsed as a number into NaN instead of raising.
horsepower_numeric = pd.to_numeric(autompg_data['horsepower'], errors='coerce')
autompg_data['horsepower'] = horsepower_numeric

# Re-check for incomplete rows now that unparseable values have become NaN.
rows_with_nan = autompg_data.isna().any(axis=1)
missing_data = autompg_data[rows_with_nan]

In [13]:
# Fill every NaN with the most frequent value of its column. The imputer is
# fitted on, and applied to, the whole frame in one step.
imputer = SimpleImputer(strategy='most_frequent')
new_data = imputer.fit_transform(autompg_data)

In [14]:
# Rebuild a labelled frame from the imputer output. Because the input mixes
# numeric columns with the string column car_name, the imputed values are
# expected to come back as generic objects; infer_objects() restores proper
# numeric dtypes (and is a no-op if the dtypes are already correct).
autompg_data = pd.DataFrame(data=new_data, columns=autompg_headers).infer_objects()

# Numeric predictor columns. car_name is deliberately left out: it is free
# text that a linear regression cannot consume, and the positional feature
# slice used for modelling (columns 1..7) does not include it either.
train_headers = ['cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model_year', 'origin']

# Regression target.
target_header = ['mpg']

In [15]:
# Name-based selection of predictors and target (both remain DataFrames).
# NOTE: the following cells reassign X and y with positional slices, so
# these two values are short-lived.
X = autompg_data.loc[:, train_headers]
y = autompg_data.loc[:, target_header]

In [16]:
# Feature matrix as a NumPy array: columns at positions 1 through 7
# (everything between the first column and the last one).
feature_columns = autompg_data.iloc[:, 1:8]
X = feature_columns.values

In [17]:
# Target vector as a 1-D NumPy array: the first column of the frame.
target_column = autompg_data.iloc[:, 0]
y = target_column.values

In [18]:
# 70/30 train/test split; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [20]:
# Report the (rows, columns) shape of each split.
for split_label, split_array in (
    ('Total of Training dataset', X_train),
    ('Total of Test dataset', X_test),
):
    print(split_label, split_array.shape)

Total of Training dataset (278, 7)
Total of Test dataset (120, 7)


In [21]:
# Ordinary least-squares linear model; fit_intercept=True is the library
# default, spelled out here for readability.
lr_regress = LinearRegression(fit_intercept=True)

In [22]:
# Fit the model on the training split; fit() returns the estimator, which the
# notebook displays as the cell output.
lr_regress.fit(X=X_train, y=y_train)

LinearRegression()

In [31]:
# Predict the target for the held-out rows and print the raw predictions.
y_pred = lr_regress.predict(X=X_test)
print(y_pred)

[21.35631026 26.4130424  20.1108578  25.55252705 24.30988933 15.72647328
 28.55919173 34.86967754 16.92024105 10.72225108 30.54233464 16.61792727
 22.35433985 26.21346721 36.78716663 22.4609351  10.75487685 20.51715336
  8.88875185 33.52440759 26.67727992 31.02164727 21.17110807 25.88906002
 26.14410783 28.78377878 32.91050731 33.02523622 15.19634693 30.33555308
 27.41881633 10.81882473 20.39544913 29.08021692 25.31332439 13.70240068
 27.0925503   9.04779814 31.81101585 24.37101273 24.74448442 24.92841424
 20.79680786 32.62389939 26.23263757 22.24191958 21.20604938 11.71134882
 29.09864405 19.13244458 25.09203451 27.00347606 16.75729609 12.07840734
 28.96849297 25.71203581 10.19646714 13.03309166 31.08597596 36.05025529
 35.02475308 36.11119004 17.8501593  27.69232611 20.21319044 32.01598228
 26.84739493 26.59461606 29.97053984 12.49131085 20.09308177 23.18697057
 14.3927472  26.76746287 11.77924906 26.0319751  26.92306883 16.37764452
 11.74725013 23.19375959 21.01668254 19.96280162 35

In [32]:
# In-sample predictions on the training rows (not referenced again in the
# cells shown here).
y_pred_train = lr_regress.predict(X=X_train)

In [35]:
# Error metrics of the test-set predictions against the true test targets.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is derived from the MSE computed above
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Fitted parameters first, then the error metrics rounded to two decimals,
# and finally the unrounded R2 score.
print('\nSlope: ', lr_regress.coef_)
print('Intercept: ', lr_regress.intercept_)
for report_template, metric_value in (
    ('Mean Absolute Error: {:.2f}', mae),
    ('Mean Squared Error: {:.2f}', mse),
    ('Root Mean Squared Error: {:.2f}', rmse),
):
    print(report_template.format(metric_value))
print('R2 Score: ', r2)


Slope:  [-0.39912226  0.02232034 -0.014257   -0.0071813   0.08704964  0.79702551
  1.15658328]
Intercept:  -19.449513538151333
Mean Absolute Error: 2.31
Mean Squared Error: 8.97
Root Mean Squared Error: 2.99
R2 Score:  0.8466472315667264
