## 클러스터 라벨 예측 모델
- 사용할 알고리즘

- 소프트맥스 함수
    - 로지스틱 회귀분석 

- OVO/OVR Classifier 기법
    - SVM
    - Ada Boost
    - XGboost

In [105]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.multiclass import OneVsOneClassifier , OneVsRestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV

### 자료 전처리 : x- 축제별 키워드 y- 축제별 클러스터 라벨(연령 비율)
- integration 방법
    - keyword의 축제명 <-> query의 축제명_수정
    - query의 축제번호 <-> ratio의 X

In [11]:
# Load the three input tables.
# keyword: per-festival keyword table (has a "축제명" column; see the preview below).
keyword = pd.read_csv("base_tag.csv")
# ratio: cluster results; the columns "X" and "clt$cluster" are used later.
ratio = pd.read_csv("result.csv")
# query: festival lookup sheet providing "축제번호" and "축제명_수정".
query = pd.read_excel("festival_info_1516_for_query.xlsx")

In [18]:
# Preview: translate each festival name in `keyword` into its festival number
# using the (corrected name -> number) pairs from the query sheet.
name_to_number = dict(zip(query["축제명_수정"], query["축제번호"]))
keyword["축제명"].map(name_to_number)

0     24
1      4
2      8
3      2
4      6
      ..
71    61
72    64
73    76
74    40
75    50
Name: 축제명, Length: 76, dtype: int64

In [22]:
# Overwrite the leftover "Unnamed: 0" column with the festival number looked up
# from the query sheet by (corrected) festival name.
festival_number_by_name = dict(zip(query["축제명_수정"], query["축제번호"]))
keyword["Unnamed: 0"] = keyword["축제명"].map(festival_number_by_name)

In [30]:
# Rename only the first column to "축제번호"; all other column labels are kept.
keyword.columns = ["축제번호", *keyword.columns[1:]]

In [32]:
# Show the first rows of the keyword table after the column rename.
keyword.head()

Unnamed: 0,축제번호,축제명,축제,서울,개최,문화,행사,다양,공연,진행,...,도시,이벤트,전국,주민,한강,작품,대회,주관,어린이,기업
0,24,서울국제빵과자페스티벌,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
1,4,아르헨티나페스티발,0,0,1,0,1,0,1,1,...,1,0,0,0,0,0,0,0,0,0
2,8,한강홀릭,1,1,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,2,그랜드케이팝페스티벌,0,1,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,6,서리풀페스티벌,1,1,0,1,1,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [39]:
# NOTE: this binds a second name to the same DataFrame (no copy is made), so
# columns added through `data` afterwards also appear on `keyword`.
data = keyword

In [40]:
# Attach the cluster label: look up each festival number in the ratio table
# (ratio["X"] -> ratio["clt$cluster"]). Numbers missing from ratio become NaN.
cluster_by_number = dict(zip(ratio["X"], ratio["clt$cluster"]))
data["라벨"] = keyword["축제번호"].map(cluster_by_number)

ratio가 존재하는 자료는 73개인데 수집한 자료는 76개 이므로 3개를 드랍

In [43]:
# Drop every row that contains a missing value; this is intended to remove the
# festivals that have no matching entry in the ratio table (see the note above).
data = data.dropna()

완성된 데이터 프레임

In [45]:
# Show the first rows of the finished modelling table (keywords + "라벨").
data.head()

Unnamed: 0,축제번호,축제명,축제,서울,개최,문화,행사,다양,공연,진행,...,이벤트,전국,주민,한강,작품,대회,주관,어린이,기업,라벨
0,24,서울국제빵과자페스티벌,0,1,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,4.0
1,4,아르헨티나페스티발,0,0,1,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,3.0
2,8,한강홀릭,1,1,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,3.0
3,2,그랜드케이팝페스티벌,0,1,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,5.0
4,6,서리풀페스티벌,1,1,0,1,1,0,1,0,...,0,0,0,0,0,0,1,0,0,3.0


## 모델링
### 기본모델구축

In [103]:
# Baseline estimators for the cluster-label prediction task.
# Softmax (multinomial) logistic regression.
log = LogisticRegression(solver="newton-cg", multi_class="multinomial")
ada = AdaBoostClassifier()
svm = SVC()
# NOTE: XGBoost (listed in the plan at the top) is intentionally not
# instantiated here: no xgboost module is imported in this notebook and no
# XGBoost model is fitted or scored in the cells below.
# One-vs-One and One-vs-Rest wrappers around the AdaBoost and SVM estimators.
ovo_ada = OneVsOneClassifier(ada)
ovo_svm = OneVsOneClassifier(svm)
ovr_ada = OneVsRestClassifier(ada)
ovr_svm = OneVsRestClassifier(svm)

In [107]:
# Features: every column from position 2 up to (but excluding) the last one,
# i.e. everything between the two leading id/name columns and the label.
x_dt = data.iloc[:,2:-1]
# Target: the cluster label column.
y_dt = data["라벨"]

In [108]:
# Hold out 20% of the rows as the test split; random_state=42 is passed so the
# split is repeatable.
x_train, x_test, y_train,y_test = train_test_split(x_dt,y_dt ,test_size=0.2, random_state=42)

In [109]:
# Show the first rows of the training features.
x_train.head()

Unnamed: 0,축제,서울,개최,문화,행사,다양,공연,진행,체험,페스티벌,...,도시,이벤트,전국,주민,한강,작품,대회,주관,어린이,기업
22,0,1,1,0,1,0,1,0,1,0,...,0,0,0,0,0,0,1,0,0,0
57,0,1,1,1,0,0,0,0,1,1,...,0,1,0,0,0,0,0,0,0,1
50,1,0,1,0,1,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,1
33,0,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
39,0,1,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0


In [110]:
# Show the first training labels.
y_train.head()

22    4.0
57    5.0
50    5.0
33    4.0
39    3.0
Name: 라벨, dtype: float64

In [102]:
# Fit every baseline model on the training split.
log.fit(x_train,y_train)
ovo_ada.fit(x_train,y_train)
ovo_svm.fit(x_train,y_train)
ovr_ada.fit(x_train,y_train)
# The value of this last call (the fitted estimator) is what the cell echoes.
ovr_svm.fit(x_train,y_train)



OneVsRestClassifier(estimator=SVC(C=1.0, cache_size=200, class_weight=None,
                                  coef0=0.0, decision_function_shape='ovr',
                                  degree=3, gamma='auto_deprecated',
                                  kernel='rbf', max_iter=-1, probability=False,
                                  random_state=None, shrinking=True, tol=0.001,
                                  verbose=False),
                    n_jobs=None)

In [101]:
# Print the weighted F1 score on the test split for each baseline model,
# truncated (not rounded) to two decimal places.
baseline_models = [
    ("로지스틱", log),
    ("OvO Ada", ovo_ada),
    ("OvO SVM", ovo_svm),
    ("OvR Ada", ovr_ada),
    ("OvR SVM", ovr_svm),
]
for model_name, model in baseline_models:
    weighted_f1 = f1_score(y_test, model.predict(x_test), average='weighted')
    print(model_name + " 가중 F1스코어 : " + str(int(weighted_f1 * 100) / 100))

로지스틱 가중 F1스코어 : 0.31


  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


OvO Ada 가중 F1스코어 : 0.53
OvO SVM 가중 F1스코어 : 0.37
OvR Ada 가중 F1스코어 : 0.5
OvR SVM 가중 F1스코어 : 0.31


Ada부스트가 다른 알고리즘에 비해 점수가 더 좋게 나오므로 아다부스트만 대상으로 하이퍼파라미터 튜닝을 해주기로 함

### 하이퍼 파라미터 튜닝
- n_estimators= [10,20,30,40,50]
- algorithm= ["SAMME","SAMME.R"]

In [174]:
# Rebuild the One-vs-One AdaBoost model with one explicit hyperparameter
# combination (algorithm='SAMME.R', n_estimators=50); the search over the grid
# listed above is not shown in this cell.
# NOTE(review): these values look like the AdaBoostClassifier defaults for the
# scikit-learn release used here - confirm. If so, any score change versus the
# baseline comes from run-to-run randomness (no random_state is set), not tuning.
ovo_ada = OneVsOneClassifier(AdaBoostClassifier(algorithm='SAMME.R',n_estimators= 50))

In [175]:
# Refit the One-vs-One AdaBoost model and print its weighted F1 on the test
# split, truncated to two decimal places.
ovo_ada.fit(x_train, y_train)
tuned_f1 = f1_score(y_test, ovo_ada.predict(x_test), average='weighted')
print("OvO Ada 가중 F1스코어 : " + str(int(tuned_f1 * 100) / 100))

OvO Ada 가중 F1스코어 : 0.59


  'recall', 'true', average, warn_for)


최종 테스트 스코어(가중 F1)는 0.59.

이 알고리즘으로 2019년의 카테고리를 예측해서 반환

In [178]:
# Load the keyword table of the festivals to be labelled
# (presumably the 2019 festivals mentioned in the note above - confirm).
target = pd.read_csv("target_tag.csv")

In [181]:
# Show the target feature columns: everything from column position 2 onward.
target.iloc[:,2:]

Unnamed: 0,축제,서울,개최,문화,행사,다양,공연,진행,체험,페스티벌,...,도시,이벤트,전국,주민,한강,작품,대회,주관,어린이,기업
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
2,0,1,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,1,0,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
232,1,1,1,1,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
233,1,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
234,0,1,1,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
235,0,1,1,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [185]:
# Predict a cluster label for every target row (displayed only, not stored).
ovo_ada.predict(target.iloc[:,2:])

array([4., 5., 4., 3., 5., 5., 3., 2., 5., 4., 1., 5., 4., 5., 5., 3., 3.,
       4., 4., 5., 3., 5., 5., 3., 5., 3., 3., 4., 5., 1., 5., 4., 5., 5.,
       4., 3., 3., 3., 4., 4., 5., 3., 4., 1., 4., 3., 4., 3., 5., 3., 3.,
       4., 3., 4., 3., 3., 3., 1., 2., 5., 4., 5., 2., 5., 5., 5., 3., 4.,
       3., 3., 3., 3., 4., 5., 5., 5., 5., 5., 3., 1., 5., 5., 1., 5., 3.,
       5., 1., 5., 2., 3., 5., 3., 5., 4., 5., 2., 5., 3., 1., 3., 1., 2.,
       3., 3., 3., 4., 4., 3., 5., 4., 4., 5., 4., 3., 3., 3., 3., 3., 3.,
       3., 4., 3., 3., 5., 3., 4., 4., 4., 4., 4., 1., 2., 5., 5., 1., 3.,
       3., 5., 5., 3., 3., 1., 2., 4., 3., 3., 3., 4., 3., 5., 5., 1., 1.,
       4., 4., 5., 5., 5., 3., 3., 5., 5., 3., 5., 5., 5., 3., 3., 3., 5.,
       5., 4., 5., 3., 3., 4., 5., 4., 5., 2., 4., 5., 5., 5., 4., 5., 3.,
       4., 5., 5., 3., 4., 5., 1., 5., 1., 1., 5., 5., 1., 5., 4., 4., 4.,
       1., 4., 1., 1., 3., 4., 3., 1., 3., 5., 5., 5., 5., 3., 5., 3., 1.,
       3., 3., 3., 4., 3.

In [186]:
# Start an empty result table; its columns are filled in by the next cells.
labeled2019 = pd.DataFrame()

In [187]:
# Copy the festival names from the target table.
labeled2019["축제명"] =target["축제명"]

In [188]:
# Store the predicted cluster label for each target row, using the same
# feature slice (columns from position 2 onward) as in the preview above.
labeled2019["클러스터 라벨"] = ovo_ada.predict(target.iloc[:,2:])

In [190]:
# Convert each predicted label to a Python int (the predictions are floats
# such as 4.0 because the training labels were floats).
labeled2019["클러스터 라벨"] = labeled2019["클러스터 라벨"].map(int)

In [191]:
# Display the finished festival-name / cluster-label table.
labeled2019

Unnamed: 0,축제명,클러스터 라벨
0,대한민국 산업기술 R&D대전,4
1,서울디자인페스티벌,5
2,서울무용제,4
3,서울지식이음축제,3
4,코리아세일페스타 (Korea Sale FESTA),5
...,...,...
232,서울 아시테지 겨울축제,4
233,서울 살롱 뒤 쇼콜라,5
234,서울영상광고제,1
235,서울영상광고제,1


In [197]:
# Per-cluster mean of the ratio table, keeping the aggregated columns from
# position 2 onward. The "mean" string alias is used instead of passing
# np.mean, which recent pandas deprecates as a groupby aggregation argument;
# the computed result is the same.
cluster_summary = ratio.groupby("clt$cluster").agg("mean").iloc[:, 2:]

In [198]:
# Write the predicted labels to CSV (the DataFrame index is written as well,
# since to_csv is called with its defaults).
labeled2019.to_csv("2019년 축제 클러스터 라벨.csv")

In [200]:
# Write the per-cluster summary table to CSV.
cluster_summary.to_csv("축제 클러스터 라벨 요약 정보.csv")