In [None]:
import numpy as np
import pandas as pd

mandarine_consume = pd.read_excel('fresh_consume.xlsx', index_col='년도')
mandarine_consume.describe()
mandarine_consume.info()

mandarine_consume.columns = ['per_p_man_con','per_p_fruit_con', 'per_p_imp_con','one_house',
                            'import_ratio_f','import_ratio_p','man_whole_p','oran_whole_p',
                            'apple_whole_p','pear_whole_p','strb_whole_p','persim_whole_p','banana_whole_p',
                            'kiwi_whole_p', 'grap_whole_p','prod_index','consum_index',
                            'per_p_income', 'import_ton_fruit', 'population']
drop_cols = ['import_ton_fruit','population','consum_index','prod_index']
mandarine_consume_df = mandarine_consume.drop(columns=drop_cols)
mandarine_consume_df.import_ratio_p = mandarine_consume_df.import_ratio_p.fillna(method='ffill')
mandarine_consume_df.import_ratio_f = mandarine_consume_df.import_ratio_f.fillna(method='ffill')
mandarine_consume_df.import_ratio_p = mandarine_consume_df.import_ratio_p.fillna(0)
mandarine_consume_df.import_ratio_f = mandarine_consume_df.import_ratio_f.fillna(0)

mandarine_consume_df2 = mandarine_consume_df.dropna()
mandarine_consume_df2

In [46]:
# 변수 선택
# 1. 다중공선성
y = mandarine_consume_df2.iloc[:,0]
X = mandarine_consume_df2.iloc[:,1:]

년도
2000    11.9
2001    12.6
2002    13.3
2003    13.0
2004    12.0
2005    13.1
2006    12.7
2007    16.0
2008    13.0
2009    15.2
2010    12.4
2011    13.6
2012    13.7
2013    13.4
2014    14.2
2015    12.5
2016    11.9
2017    11.6
Name: per_p_man_con, dtype: float64

In [31]:
# 2. VIF
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif = pd.DataFrame()
vif['VIF Factor'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['features'] = X.columns
vif

Unnamed: 0,VIF Factor,features
0,959.310004,per_p_fruit_con
1,4014.749646,per_p_imp_con
2,355.56848,one_house
3,1087.960682,import_ratio_f
4,672.546551,import_ratio_p
5,287.143147,man_whole_p
6,431.040092,oran_whole_p
7,355.716682,apple_whole_p
8,104.236253,pear_whole_p
9,887.654598,strb_whole_p


In [33]:
# 3. feature selection (RFE)
from sklearn.feature_selection import RFE
from sklearn.svm import SVR

estimator = SVR(kernel='linear')
selector = RFE(estimator, step=1)
selector = selector.fit(X, y)

X.columns[selector.support_]
X_RFE = X[X.columns[selector.support_]]
# RFE로 변수 선택함!
# 선택된 변수들
X.columns[selector.support_]

Index(['per_p_fruit_con', 'per_p_imp_con', 'one_house', 'import_ratio_f',
       'import_ratio_p', 'man_whole_p', 'oran_whole_p'],
      dtype='object')

In [None]:
# 다중공선성 시각화
# version 1
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style='whitegrid', context='notebook')
sns.pairplot(X, height=2.5)
plt.show()
sns.reset_orig()

# version 2
plt.figure(figsize=(10,10))
sns.heatmap(data=X.corr(), annot=True, square=True, fmt='.2f', linewidths=.5, cmap='Blues')

In [34]:
X_RFE

from sklearn.preprocessing import scale, robust_scale, minmax_scale, maxabs_scale, StandardScaler
scale_X = scale(X_RFE)
robust_X = robust_scale(X_RFE)
minmax_X = minmax_scale(X_RFE)
standard_X = StandardScaler().fit(X_RFE)

In [35]:
from sklearn.model_selection import train_test_split
import statsmodels.api as sm

X_train, X_test, y_train, y_test = train_test_split(minmax_X, y, random_state=3)
result1 = sm.OLS(y_train, X_train).fit()
print(result1.summary())

                                 OLS Regression Results                                
Dep. Variable:          per_p_man_con   R-squared (uncentered):                   0.930
Model:                            OLS   Adj. R-squared (uncentered):              0.849
Method:                 Least Squares   F-statistic:                              11.46
Date:                Wed, 22 Jan 2020   Prob (F-statistic):                     0.00426
Time:                        14:59:39   Log-Likelihood:                         -34.630
No. Observations:                  13   AIC:                                      83.26
Df Residuals:                       6   BIC:                                      87.21
Df Model:                           7                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

  "anyway, n=%i" % int(n))


In [36]:
X_RFE2 = X_RFE.drop(columns=['per_p_fruit_con','per_p_imp_con','one_house'])
minmax_X2 = minmax_scale(X_RFE2)

X_train, X_test, y_train, y_test = train_test_split(minmax_X2, y, random_state=3)
result2 = sm.OLS(y_train, X_train).fit()
print(result2.summary())

                                 OLS Regression Results                                
Dep. Variable:          per_p_man_con   R-squared (uncentered):                   0.914
Model:                            OLS   Adj. R-squared (uncentered):              0.876
Method:                 Least Squares   F-statistic:                              23.99
Date:                Wed, 22 Jan 2020   Prob (F-statistic):                    8.09e-05
Time:                        14:59:41   Log-Likelihood:                         -35.987
No. Observations:                  13   AIC:                                      79.97
Df Residuals:                       9   BIC:                                      82.23
Df Model:                           4                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

  "anyway, n=%i" % int(n))


In [37]:
# 단순선형회귀
from sklearn.pipeline import Pipeline
#from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression

model = LinearRegression(fit_intercept=True)
model = model.fit(X_train, y_train)
y_bar = model.predict(X_train)
y_pred = model.predict(X_test)
y_pred - y_test

from sklearn.metrics import r2_score
print("훈련 데이터 r2 score", r2_score(y_train, y_bar))
print("테스트 데이터 r2 score", r2_score(y_test, y_pred))
print("가중치: ", model.coef_)
print("절편: ", model.intercept_)

훈련 데이터 r2 score 0.6038647148656431
테스트 데이터 r2 score -0.5642636900929876
가중치:  [-1.43013228 -1.40617145 -1.42854015  2.84006351]
절편:  12.903523754001597


In [47]:
# 단순선형회귀 (다른 변수를 넣어보자...)
y = mandarine_consume_df2.iloc[:,0]
X = mandarine_consume_df2.iloc[:,1:]
y = minmax_scale(y)
X = minmax_scale(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=3)
from sklearn.linear_model import LinearRegression

model = LinearRegression(fit_intercept=True)
model = model.fit(X_train, y_train)
y_bar = model.predict(X_train)
y_pred = model.predict(X_test)
y_pred - y_test

from sklearn.metrics import r2_score
print("훈련 데이터 r2 score", r2_score(y_train, y_bar))
print("테스트 데이터 r2 score", r2_score(y_test, y_pred))
print("가중치: ", model.coef_)
print("절편: ", model.intercept_)

훈련 데이터 r2 score 1.0
테스트 데이터 r2 score -21.02204219207821
가중치:  [ 0.14768616  0.12535092  1.05689794  0.21957344 -1.13941744  0.09704595
  1.42574357  0.19958879  1.41842752 -0.57253453 -0.92422387  0.67911597
 -0.59660244  1.46217074 -3.29686806]
절편:  -0.002358337557026302


In [53]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler 
from sklearn import linear_model

y = mandarine_consume_df2.iloc[:,0]
X = mandarine_consume_df2.iloc[:,1:]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=3)
from sklearn.linear_model import LinearRegression

model1 = Pipeline([('scaler', StandardScaler()),
                  ('regressor', LinearRegression())])
model2 = Pipeline([('scaler', StandardScaler()),
                  ('poly', PolynomialFeatures(2))])
y1_hat = model1.fit(X_train, y_train)
y2_hat = model2.fit_transform(X_train, y_test)
y1_pred = model1.predict(X_test)
y2_pred = model2.predict(X_test)

print("model1의 r2 score (훈련): ", r2_score(y_train, y1_hat))
print("model1의 r2 score (테스트): ", r2_score(y_test, y1_pred))
print("model2의 r2 score (훈련): ", r2_score(y_train, y2_hat))
print("model2의 r2 score (테스트): ", r2_score(y_test, y2_pred))

AttributeError: 'PolynomialFeatures' object has no attribute 'predict'

In [None]:
from sklearn import linear_model

poly = PolynomialFeatures(degree=3)

X_train = poly.fit_transform(X_train)
X_test = poly.fit_transform(X_test)

model = linear_model.LinearRegression()
model.fit(X_train, y_train)

print(model.score(X_train, y_train))

model.fit(X_test, y_test)
print(model.score(X_test, y_test))

In [None]:
X