In [1]:
# autoMPG: regression experiments on the UCI Auto MPG data file referenced by `url` below.
import pandas as pd
import numpy as np
from sklearnex import patch_sklearn, unpatch_sklearn
# patch_sklearn() is deliberately called before `import sklearn`.
# NOTE(review): sklearnex presumably needs the patch applied before scikit-learn
# estimators are imported -- keep this order unless confirmed otherwise.
patch_sklearn()
import sklearn
import matplotlib.pyplot as plt
import matplotlib as mpl
# Set matplotlib's font family to 'Malgun Gothic'.
# NOTE(review): presumably chosen so Korean text renders in figures; confirm the
# font is installed on the machine running the notebook.
mpl.rc('font', family='Malgun Gothic')
import seaborn as sns

# Remote location of the raw Auto MPG data file read in the next cell.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'


Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


### 데이터셋 로딩 및 전처리


In [2]:

# The data file has no header row and is whitespace-delimited, so the column
# names are supplied at read time. The separator is a raw string: '\s' inside a
# normal string literal is an invalid escape sequence and triggers a warning on
# current Python versions.
df = pd.read_csv(
    url,
    header=None,
    sep=r'\s+',
    names=[
        'mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
        'acceleration', 'model_year', 'origin', 'car_name',
    ],
)
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino


In [3]:
# Treat origin as a categorical label rather than a number.
df['origin'] = df['origin'].astype('category')

# Expand two-digit model years to four digits with one vectorized parse instead
# of calling pd.to_datetime once per row. The guard keeps the cell safe to
# re-run: values that are already four-digit years are left untouched (parsing
# them again with '%y' would raise).
if df['model_year'].max() < 100:
    df['model_year'] = (
        pd.to_datetime(df['model_year'].astype(str), format='%y')
        .dt.year
        .astype('int64')  # keep the same integer dtype the column had before
    )
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   mpg           398 non-null    float64 
 1   cylinders     398 non-null    int64   
 2   displacement  398 non-null    float64 
 3   horsepower    398 non-null    object  
 4   weight        398 non-null    float64 
 5   acceleration  398 non-null    float64 
 6   model_year    398 non-null    int64   
 7   origin        398 non-null    category
 8   car_name      398 non-null    object  
dtypes: category(1), float64(4), int64(2), object(2)
memory usage: 25.5+ KB


In [4]:
# horsepower was read as object dtype; coerce it to numbers in a single
# vectorized call so any non-numeric entries become NaN.
df['horsepower'] = pd.to_numeric(df['horsepower'], errors='coerce')

# Drop every row that contains a missing value and renumber the index from 0,
# as one chained expression instead of successive in-place mutations.
df = df.dropna(axis=0).reset_index(drop=True)
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name
0,18.0,8,307.0,130.0,3504.0,12.0,1970,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,1970,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,1970,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,1970,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,1970,1,ford torino
...,...,...,...,...,...,...,...,...,...
387,27.0,4,140.0,86.0,2790.0,15.6,1982,1,ford mustang gl
388,44.0,4,97.0,52.0,2130.0,24.6,1982,2,vw pickup
389,32.0,4,135.0,84.0,2295.0,11.6,1982,1,dodge rampage
390,28.0,4,120.0,79.0,2625.0,18.6,1982,1,ford ranger


In [5]:
# Correlation of every remaining column with mpg, after removing the
# car_name, origin and model_year columns from the frame.
corrdf = df.drop(columns=['car_name', 'origin', 'model_year'])
corrdf.corr().loc[:, 'mpg']

mpg             1.000000
cylinders      -0.777618
displacement   -0.805127
horsepower     -0.778427
weight         -0.832244
acceleration    0.423329
Name: mpg, dtype: float64

In [6]:
# mpg is the value to predict.
target = df['mpg']

# Everything except the target, car_name, origin and model_year is used as a
# model input (acceleration stays in the feature set).
features = df.drop(columns=['mpg', 'car_name', 'origin', 'model_year'])
features.tail()

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration
387,4,140.0,86.0,2790.0,15.6
388,4,97.0,52.0,2130.0,24.6
389,4,135.0,84.0,2295.0,11.6
390,4,120.0,79.0,2625.0,18.6
391,4,119.0,82.0,2720.0,19.4


In [7]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split

# One seed shared by the train/test split and the stochastic estimator so that
# a fresh "Restart & Run All" reproduces the same scores.
RANDOM_STATE = 59

# `saved` is only referenced by commented-out search code further down; the two
# score lists are appended to by the single-pipeline evaluation cell.
saved = 0
scores_test = []
scores_train = []

# Hold out 20% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

# Candidate first-stage scalers as (pipeline step name, transformer) pairs.
scalerList = [
    ('RBscaler', RobustScaler()),
    ('stdScaler', StandardScaler()),
    # The default n_quantiles (1000) is larger than the training set, which
    # makes scikit-learn warn and fall back to the sample count on every fit;
    # cap it explicitly instead.
    ('scalerZ', QuantileTransformer(
        n_quantiles=min(1000, len(X_train)),
        output_distribution='normal',
    )),
]

# Candidate final-stage regressors as (pipeline step name, estimator) pairs.
# NOTE(review): earlier notes in this cell mentioned n_neighbors=5 for KNN while
# the code uses 3 -- confirm which value is intended.
# NOTE(review): the saved run stopped at the random-forest step with a
# "node array from the pickle has an incompatible dtype" ValueError; this looks
# like a scikit-learn / scikit-learn-intelex version mismatch -- confirm the
# environment before trusting results for that estimator.
regressionList = [
    ('KNN', KNeighborsRegressor(n_neighbors=3)),
    ('linear', LinearRegression()),
    # Seeded so repeated runs build the same forest.
    ('randomForest', RandomForestRegressor(n_estimators=750, random_state=RANDOM_STATE)),
    ('SVM', SVR(C=399)),
]

def trainData(scaler, regression):
    """Fit a scaler -> interaction-features -> regressor pipeline and report its test R2.

    Parameters
    ----------
    scaler : tuple
        ``(name, transformer)`` pair used as the first pipeline step.
    regression : tuple
        ``(name, estimator)`` pair used as the final pipeline step.

    Returns
    -------
    float
        R2 score of the fitted pipeline on ``X_test`` / ``y_test``. Callers that
        ignore the return value behave exactly as before.

    Notes
    -----
    Reads the notebook-level ``X_train``, ``X_test``, ``y_train`` and ``y_test``.
    The estimator objects inside ``scaler`` and ``regression`` are fitted in
    place by the pipeline.
    """
    # Only the metric that is actually reported is imported.
    from sklearn.metrics import r2_score

    model = Pipeline([
        scaler,
        # Degree-3 interaction terms only (no pure powers, no bias column).
        ('poly', PolynomialFeatures(
            degree=3,
            interaction_only=True,
            include_bias=False,
        )),
        regression,
    ])
    model.fit(X_train, y_train)

    score = r2_score(y_test, model.predict(X_test))
    # Same single-line report as before: "<scaler> <regressor> R2 Score: <value>".
    print(scaler[0], regression[0], end=' ')
    print(f'R2 Score: {score:.4f}')
    return score
        
# Evaluate every regressor with every scaler, regressors in the outer position
# so results are grouped by regressor.
for regression, scaler in ((reg, scl) for reg in regressionList for scl in scalerList):
    trainData(scaler, regression)

RBscaler KNN R2 Score: 0.8617
stdScaler KNN R2 Score: 0.8562
scalerZ KNN R2 Score: 0.8879
RBscaler linear R2 Score: 0.8914
stdScaler linear R2 Score: 0.8914
scalerZ linear R2 Score: 0.8745




ValueError: node array from the pickle has an incompatible dtype:
- expected: {'names': ['left_child', 'right_child', 'feature', 'threshold', 'impurity', 'n_node_samples', 'weighted_n_node_samples', 'missing_go_to_left'], 'formats': ['<i8', '<i8', '<i8', '<f8', '<f8', '<i8', '<f8', 'u1'], 'offsets': [0, 8, 16, 24, 32, 40, 48, 56], 'itemsize': 64}
- got     : [('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]

In [None]:

# Fit and evaluate one chosen configuration in more detail: standard scaling,
# degree-3 interaction features, then ordinary linear regression.
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

model = Pipeline(
    [
        ('stdScaler', StandardScaler()),
        ('poly', PolynomialFeatures(
            degree=3,
            interaction_only=True,
            include_bias=False,
            )
        ),
        ('linear', LinearRegression()),
    ]
)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)
score = r2_score(y_test, y_pred)

# The value reported here is the *root* mean squared error, so it is labelled
# as such. It is computed with np.sqrt instead of `squared=False`, which recent
# scikit-learn releases deprecate and then remove.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'Root Mean Squared Error: {rmse:.2f}')
print(f'Mean Absolute Error: {mean_absolute_error(y_test, y_pred):.2f}')
print(f'R2 Score: {score:.4f}')

# Record train/test scores in the notebook-level lists for later comparison.
# Re-running this cell appends another pair each time.
scores_train.append(model.score(X_train, y_train))
scores_test.append(model.score(X_test, y_test))


In [None]:
# plt.plot(range(len(scores_train)), scores_train, label='Training Score')
# plt.plot(range(len(scores_test)), scores_test, label='Test Score')
# plt.legend()
# plt.show()



In [None]:
# for idx, data in enumerate(scores_train):
#     tmp = scores_test[idx] - data
#     print(tmp, idx)