In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, minmax_scale
import multiprocessing 
from sklearn.model_selection import GridSearchCV

In [4]:
# 데이터 불러오기
names=['buying','maint','doors','persons','lug_boot','safety','class']
data = pd.read_csv('car.data')

In [5]:
data.columns = names

In [42]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1727 entries, 0 to 1726
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    1727 non-null   object
 1   maint     1727 non-null   object
 2   doors     1727 non-null   object
 3   persons   1727 non-null   object
 4   lug_boot  1727 non-null   object
 5   safety    1727 non-null   object
 6   class     1727 non-null   object
dtypes: object(7)
memory usage: 94.6+ KB


In [55]:
data['class'].value_counts()

unacc    1209
acc       384
good       69
vgood      65
Name: class, dtype: int64

In [7]:
# 데이터 구분
X_f = data.drop(['class'],axis =1)
y_f = data['class']

In [8]:
X_f.shape

(1727, 6)

In [9]:
X_f_d = pd.get_dummies(X_f)

In [10]:
X_f_d.shape

(1727, 21)

In [11]:
y_f.shape

(1727,)

In [12]:
# 종속변수 및 독립변수 지정
X = X_f_d.values
y = y_f.values

In [13]:
# 훈련, 평가 세트 분리
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [27]:
# 로지스틱 회귀를 이용한 분류
model = LogisticRegression(multi_class='auto', C = 1000, random_state=1, max_iter=1000)

In [28]:
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(C=1000, max_iter=1000, random_state=1)

In [30]:
print('훈련 데이터 점수: {}'.format(model.score(X_train, y_train)))
print('평가 데이터 점수: {}'.format(model.score(X_test, y_test)))

훈련 데이터 점수: 0.949034749034749
평가 데이터 점수: 0.9351851851851852


In [31]:
param_grid = [{'penalty' : ["l1", "l2"],
               'C' : [0.01,0.1,1.0,10,100,1000]}]
              
gs = GridSearchCV(estimator=LogisticRegression(), param_grid=param_grid,
                 scoring='accuracy', cv=12, n_jobs=multiprocessing.cpu_count())
gs.fit(X,y)

        nan 0.85860609        nan 0.86904704        nan 0.87370905]
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


GridSearchCV(cv=12, estimator=LogisticRegression(), n_jobs=12,
             param_grid=[{'C': [0.01, 0.1, 1.0, 10, 100, 1000],
                          'penalty': ['l1', 'l2']}],
             scoring='accuracy')

In [32]:
gs.best_estimator_

LogisticRegression(C=1000)

In [None]:
# SVC를 이용한 분류

In [33]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [34]:
model = SVC()
model.fit(X_train, y_train)

SVC()

In [35]:
print("훈련 데이터 점수 : {}".format(model.score(X_train, y_train)))
print("평가 데이터 점수 : {}".format(model.score(X_test, y_test)))

훈련 데이터 점수 : 0.9915057915057915
평가 데이터 점수 : 0.9745370370370371


In [36]:
# GriGridSearchCV를 활용해 적합한 모델 kernel 찾기
from sklearn.pipeline import make_pipeline, Pipeline
pipe = Pipeline([('scaler', StandardScaler()),
                 ('model', SVC(kernel='rbf'))])

param_grid = [{'model__kernel' : ['rbf', 'polynomial', 'sigmoid']}]  ## 커널값을 비교해보자

gs = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    n_jobs=multiprocessing.cpu_count(),
    cv =5,
    verbose=True
)

gs.fit(X, y)

Fitting 5 folds for each of 3 candidates, totalling 15 fits




GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('model', SVC())]),
             n_jobs=12,
             param_grid=[{'model__kernel': ['rbf', 'polynomial', 'sigmoid']}],
             verbose=True)

In [37]:
gs.best_estimator_

Pipeline(steps=[('scaler', StandardScaler()), ('model', SVC())])

In [38]:
# GriGridSearchCV를 활용해 나온 rbf를 활용해 최적의 파라미터 찾기
pipe = Pipeline([('scaler', StandardScaler()),
                 ('model', SVC(kernel='rbf'))])

param_grid = [{'model__gamma' : ['scale', 'auto']},
              {'model__C' : [1.0, 0.1, 0.01]}] 

gs = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    n_jobs=multiprocessing.cpu_count(),
    cv =5,
    verbose=True
)

gs.fit(X, y)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('model', SVC())]),
             n_jobs=12,
             param_grid=[{'model__gamma': ['scale', 'auto']},
                         {'model__C': [1.0, 0.1, 0.01]}],
             verbose=True)

In [39]:
gs.best_estimator_

Pipeline(steps=[('scaler', StandardScaler()), ('model', SVC())])