# 2. 뉴스,주가 데이터 임포트
## 전처리 & 모델 만들기

In [71]:
import pandas as pd
import numpy as np
import re
import pickle
from joblib import dump, load
import datetime
from pandas_datareader import data as pdr
import fix_yahoo_finance as yf
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_selection import SelectKBest
from eunjeon import Mecab
mecab = Mecab()
count_vect = CountVectorizer(decode_error="replace")
TfidfTransformer()



# class 정의 => 기능은 뉴스랑 주가데이터 임포트 ~ 전처리데이터 익스포트
# filename : "news_all.csv"
# filepath : "./dataset/"
class data_import_preprocess():
    def get_allnews(self, filepath):
        df = pd.read_csv(filepath, encoding = 'utf-8 sig', engine = 'python')
        df['title+body'] = df.apply(lambda x : '{}|||{}'.format(x['TITLE'], x['BODY']), axis = 1) # title과 body 합쳐서 새로운 칼럼 추가  구분자 : |||
        df['title+body'] = df['title+body'].astype(str)
        df['NOUNS'] = df['title+body'].apply(lambda x : '{}'.format(mecab.nouns(x))) # ''씌워줘야 list 안 원소들이 join해서 붙는다.
        df.rename(columns={'TIME': 'Date'}, inplace = True)
        df = df.groupby('Date').agg({'NOUNS' : lambda x : ''.join(x)})
        return df     


    # 1param - 종목코드 / 2param - startdate / 3param - enddate
    def stock_get(self, keycode, startdate, enddate):
        yf.pdr_override()
        df_stock = pdr.get_data_yahoo(keycode, start = startdate, end = enddate)
        df_stock['day1T'] = df_stock['Close'].shift(-1)
        df_stock['day3T'] = df_stock['Close'].shift(-3)
        df_stock['day5T'] = df_stock['Close'].shift(-5)
        df_stock['day10T'] = df_stock['Close'].shift(-10)
        return df_stock
    
    
    def news_stock_concat(self, news, stock):
        news_stock = news.join(df_stock, how = 'outer')
        news_stock = news_stock.dropna()
        return news_stock


    def making_new_columns(self, df):
        df['Diff1'] = df['day1T'] - df['Close']
        df['Diff3'] = df['day3T'] - df['Close']
        df['Diff5'] = df['day5T'] - df['Close']
        df['Diff10'] = df['day10T'] - df['Close']
        df['Diff1_per'] = round(df['Diff1'] / df['Close'] * 100, 2)
        df['Diff3_per'] = round(df['Diff3'] / df['Close'] * 100, 2)
        df['Diff5_per'] = round(df['Diff5'] / df['Close'] * 100, 2)
        df['Diff10_per'] = round(df['Diff10'] / df['Close'] * 100, 2)
        bins = [-100, 0, 100]
        labels = [-1, 1]
        df['Diff1_clf'] = pd.cut(df["Diff1_per"], bins, labels = labels)
        df['Diff3_clf'] = pd.cut(df["Diff3_per"], bins, labels = labels)
        df['Diff5_clf'] = pd.cut(df["Diff5_per"], bins, labels = labels)
        df['Diff10_clf'] = pd.cut(df["Diff10_per"], bins, labels = labels)
        df.drop(['High', 'Low', 'Adj Close', 'Volume'], axis=1, inplace = True)
        return df
    
    


class stat_modeling():
    def CounterVec_tfidf(self, df, filename, n, k, ratio):
        # DataFrame에서 필요한 칼럼추출 => split
        X = df['NOUNS']
        y = df.loc[:, ['Diff' + n + '_clf']]


        # 벡터화 (카운트벡터 + tfidf)
        vectorizer = TfidfVectorizer()
        X_vec = vectorizer.fit_transform(X)
        
        
        # chi2 변수선택
        selector = SelectKBest(chi2, k = k)
        X_vec_chi2 = selector.fit_transform(X_vec, y)
        
        
        ##### vec모델 chi2모델 합쳐서 저장 #####
        vec_chi2 = {'vectorizer' : vectorizer, 'selector': selector}
        with open(filename + 'vec_feature.bin', 'wb') as f:
            pickle.dump(vec_chi2, f)


        # train / test 나누기
        X_train, X_test, y_train, y_test = train_test_split(X_vec_chi2, y, test_size = ratio)
        return X_train, X_test, y_train, y_test
    
    
    def make_NBmodel(self, X_train_vec, X_test_vec, y_train, y_test, filename):
        MultinomialNB(alpha = 1.0, class_prior = None, fit_prior = True)
        clf_nb = MultinomialNB()
        clf_nb.fit(X_train_vec, y_train)
        with open(filename+'NBmodel.pkl', 'wb') as fid:
            pickle.dump(clf_nb, fid) 
        #return NBmodel_Ver1


    def make_SVCmodel(self, X_train_vec, X_test_vec, y_train, y_test, filename):        
        clf_svc = SVC(probability=True, gamma = 0.1, class_weight = "balanced", kernel = 'rbf', C = 10)
        clf_svc.fit(X_train_vec, y_train)
        with open(filename+'SVCmodel.pkl', 'wb') as fid:
            pickle.dump(clf_svc, fid)
        #return SVCmodel_Ver1

# 클래스 내 메소드 순차적 실행하는 예제코드

In [2]:
# 인스턴스 생성
cls = data_import_preprocess()


df_news = cls.get_allnews('./Dataset/samsung_pre.csv')
df_stock = cls.stock_get("005930.KS", "2017-12-01", "2019-01-08")
df_news_stock = cls.news_stock_concat(df_news, df_stock)
df_news_stock_new = cls.making_new_columns(df_news_stock)
#df_news_stock_new.head()

[*********************100%***********************]  1 of 1 downloaded


In [72]:
# 이전 모듈 결과값 : df_news_stock_new
cls = data_import_preprocess()
stat = stat_modeling()

X_train, X_test, y_train, y_test = stat.CounterVec_tfidf(df_news_stock_new, '삼성전자', n = '3', k = 15000, ratio = 0.2)

stat.make_NBmodel(X_train, X_test, y_train, y_test, '삼성전자') # 파라미터는 기본세팅, 바꾸려면 모듈 고치센
stat.make_SVCmodel(X_train, X_test, y_train, y_test, '삼성전자') # 파라미터는 기본세팅, 바꾸려면 모듈 고치센

  y = column_or_1d(y, warn=True)


# 최종결과물 확인
- 3만 넘으면 Ok

In [73]:
len(df_news_stock_new)
len(df_news_stock_new.iloc[0,0])

37112

# 3-1. 통계분류로 예측(SVC, NaiveBaysian)
## Vectorization => countervec + tf/idf로 학습

In [74]:
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from gensim.models.doc2vec import Doc2Vec
import pickle
count_vect = CountVectorizer()
TfidfTransformer()

# 파이프라인 쓰는법도 고려해보자!!!! 더 쉽고 간결함
# 시각화 함수도 넣으면 좋을듯?         
        
# test만 쓰는데 이 부분을 크롤링 한 뉴스데티어 전처리한 파일 들어가도록 세팅!!!
class classification():     
    def NB_tfidf(self, X_train_vec, X_test_vec, y_train, y_test, filename):
        #clf_nb = load('NBmodel_Ver1') 
        with open(filename+'NBmodel.pkl', 'rb') as fid:
            clf_nb = pickle.load(fid)
        
        #predicted = clf_tfidf.predict(X_test_vec)
        #print('tfidf로 변환한 예측률 :',np.mean(predicted == y_test))
        
        print('훈련 셋 검증결과 : {:.3f}.'.format(clf_nb.score(X_train, y_train)))
        print('테스트 셋 검증결과 : {:.3f}.'.format(clf_nb.score(X_test, y_test)))
        #print('UP/Down 예측값 : {:.3f}.'.format(clf_nb.predict_proba(X_test)))  만드는 중
        print(clf_nb.predict(X_train))
        return clf_nb
        

    # [과제]gamma는 0.1 / c는 10 디폴트로 설정해보자
    # 1param - Xdata / 2param - ydata / 3param - ratio / 4param - N일 뒤 예측(1,3,5,10)
    def SVC_tfidf(self, X_train_vec, X_test_vec, y_train, y_test, filename):  # ratio - 0.2 / gamma - 0.1 / c - 10 디폴트하거나 파라미터 삭제
        #clf_svc = load('SVCmodel_Ver1') 
        with open(filename+'SVCmodel.pkl', 'rb') as fid:
            clf_svc = pickle.load(fid)
            
        print('훈련 셋 검증결과 : {:.3f}.'.format(clf_svc.score(X_train_vec, y_train)))
        print('테스트 셋 검증결과 : {:.3f}.'.format(clf_svc.score(X_test_vec, y_test)))
        #print(clf_svc.predict_proba(X_test))
        print(clf_svc.predict(X_train))
    
    
        # 퍼센트 안내멘트를 리턴하면 될듯?
        return clf_svc

## 테스트 실행1 - countVec + Naive Baysian 분류기
- NB는 왠지 피팅만 안되도록 벡터수를 조절하면 예측률이 좀 높아지지 않을까?
- 한글설명 : https://datascienceschool.net/view-notebook/c19b48e3c7b048668f2bb0a113bd25f7/

In [75]:
# 이전 모듈 결과값 : df_news_stock_new
clf = classification()

nb = clf.NB_tfidf(X_train, X_test, y_train, y_test, '삼성전자')

훈련 셋 검증결과 : 0.594.
테스트 셋 검증결과 : 0.569.
[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1 -1 -1  1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1  1 -1 -1 -1 -1 -1 -1 -1 -1  1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
  1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]


## 테스트 실행2 - countVec + Kernel SVM 분류기
- 한글설명 : https://tensorflow.blog/파이썬-머신러닝/2-3-7-커널-서포트-벡터-머신/

In [76]:
clf = classification()

svc = clf.SVC_tfidf(X_train, X_test, y_train, y_test, '삼성전자')

훈련 셋 검증결과 : 0.975.
테스트 셋 검증결과 : 0.725.
[ 1 -1 -1 -1 -1  1  1 -1 -1 -1  1 -1  1  1 -1  1  1 -1 -1  1  1  1 -1 -1
 -1  1  1  1 -1  1  1 -1  1 -1 -1  1 -1 -1  1 -1 -1 -1 -1 -1  1  1  1 -1
  1  1 -1 -1 -1  1 -1  1 -1 -1  1  1 -1  1 -1  1  1 -1  1  1 -1 -1 -1 -1
 -1 -1  1  1  1  1 -1  1 -1 -1 -1 -1 -1  1  1 -1 -1 -1 -1 -1 -1  1 -1  1
  1  1  1 -1 -1 -1 -1 -1  1  1 -1 -1 -1 -1 -1 -1  1  1  1  1 -1 -1  1 -1
 -1  1 -1 -1  1 -1 -1  1 -1 -1 -1 -1  1 -1  1 -1  1 -1  1  1 -1 -1 -1  1
  1 -1 -1 -1  1  1 -1 -1  1  1  1  1  1 -1  1 -1  1  1 -1  1 -1 -1 -1 -1
  1 -1 -1  1  1  1  1 -1 -1  1  1 -1  1 -1  1  1  1 -1  1 -1 -1  1 -1  1
  1  1 -1 -1 -1 -1 -1  1  1  1]


In [8]:
X_test

<51x15000 sparse matrix of type '<class 'numpy.float64'>'
	with 32309 stored elements in Compressed Sparse Row format>

# N일치 뉴스로 분류해보기
- tfidf 에러 : https://stackoverflow.com/questions/44855780/idf-vector-is-not-fitted-error-when-using-a-saved-classifier-model?rq=1
- 피클 에러: http://thiagomarzagao.com/2015/12/08/saving-TfidfVectorizer-without-pickles/

In [9]:
# 인스턴스 생성
cls = data_import_preprocess()

X = df_news_stock_new['NOUNS']
X_new = cls.news_get('./small_news/삼성전자.csv')
X_new = X_new.groupby('Date').agg({'NOUNS' : lambda x : ''.join(x)})
X_new2 = X_new['NOUNS']
X_new2

### SVC분류모델 임포트

In [87]:
# 분류모델과 전처리모델 불러와 저장
with open('삼성전자SVCmodel.pkl', 'rb') as f:
    clf_svc = pickle.load(f)
with open('삼성전자vec_feature.bin', 'rb') as f:
    vec_feature = pickle.load(f)

# 전처리 피클파일에서 모델 불러와 저장
vectorizer = vec_feature.get('vectorizer')
selector = vec_feature.get('selector')

### 기존 tf/idf모델, Feature Selection모델 임포트
- import : https://stackoverflow.com/questions/33497314/sklearn-dumping-model-using-joblib-dumps-multiple-files-which-one-is-the-corre
- dict import : 
- https://stackoverflow.com/questions/32764991/how-do-i-store-a-tfidfvectorizer-for-future-use-in-scikit-learn

In [81]:
X_new2_vec = vectorizer.transform(X_new2)
X_new2_chi2 = selector.transform(X_new2_vec)
predicted = clf_svc.predict(X_new2_chi2)
predicted_probability = clf_svc.predict_proba(X_new2_chi2)

<4x30648 sparse matrix of type '<class 'numpy.float64'>'
	with 1572 stored elements in Compressed Sparse Row format>

In [85]:
print(predicted)
print(predicted_probability)

array([ 1, -1,  1, -1], dtype=int64)

# 새로운거 시도하는 중

In [None]:
training_indices, validation_indices = training_indices, testing_indices = train_test_split(df.index, stratify = df_class, train_size=0.8, test_size=0.2)

In [None]:
from tpot import TPOTClassifier
from tpot import TPOTRegressor

tpot = TPOTClassifier(generations=5,verbosity=2)

tpot.fit(df.drop('Diff1_clf', axis=1).loc[training_indices].values,
         df.loc[training_indices, 'Diff1_clf'].values)

In [None]:
df.dtypes

In [None]:
df_class = df_news_stock_new['Diff1_clf'].values

In [None]:
df = df_news_stock_new[['NOUNS', 'Diff1_clf']]

In [None]:
df['NOUNS'] = df['NOUNS'].astype('str')

In [None]:
df_news_stock_new.drop(df_news_stock_new.columns['Open', 'Close', 'day1T', 'day3T', 'day5T', 'day10T', 'Diff1',
       'Diff3', 'Diff5', 'Diff10', 'Diff1_per', 'Diff3_per', 'Diff5_per',
       'Diff10_per'], axis=1, inplace=True)

In [None]:
df_news_stock_new.drop(['Open', 'Close', 'day1T', 'day3T', 'day5T', 'day10T', 'Diff1',
       'Diff3', 'Diff5', 'Diff10', 'Diff1_per', 'Diff3_per', 'Diff5_per',
       'Diff10_per'], axis=1, inplace=True)

In [None]:
del df_news_stock_new['Date']

# 3-2. 통계분류로 예측(svc, NaiveBaysian)
## Vectorization => Doc2Vec으로 학습

## 테스트 실행3 - Doc2Vec + Kernel SVM 분류기

In [None]:
vec = Vector()
clf = classification()

# 모델 임포트하는데 시간 좀 걸립니다. ㅠ / 모델 합치면 6G정도, 웬만한 노트북은 디스크 긁으면서 돌아갑니다.
model = vec.Doc_model_import('./Doc2VecClf/doc2vec_no_pos_tagging.model')

# 함수에 모델 넣는것도 오래걸린다 ㅠ
X_train, X_test, y_train, y_test = vec.Doc_vector(model, df_news_stock_new, n = '5', ratio = 0.2)

In [None]:
X_train, X_test, y_train, y_test = vec.Doc_vector(model, df_news_stock_new, n = '10', ratio = 0.2)

In [None]:
vec = Vector()
clf = classification()

svc = clf.SVC_tfidf(X_train, X_test, y_train, y_test)

In [None]:
!pip show gensim

## 테스트 실행4 - Doc2Vec + Naive Baysian 분류기
- 코드는 이상 없는데, doc2vec에서 벡터를 추출하면 음수값이 섞여있으서 분류기 학습이 안된다.ㅠ

In [None]:
vec = Vector()
clf = classification()

svc = clf.NB_tfidf(X_train, X_test, y_train, y_test)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
#count_vect = CountVectorizer()


# 파이프라인 쓰는법도 고려해보자!!!! 더 쉽고 간결함
# 시각화 함수도 넣으면 좋읗듯? 
class VectorFeature():
    
    def split_vectorize(self, df, n, ratio):
        # DataFrame에서 필요한 칼럼추출 => split
        X = df['NOUNS']
        y = df.loc[:, ['Diff'+n+'_clf']]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = ratio)
        
        
        # 벡터화+tf/idf 해주는 pipeline 설계
        makevector = Pipeline([
            ('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
        ])
        
                
        # 벡터화 진행 후 저장(X_data만  실시)
        X_train_vec = makevector(X_train)
        X_test_vec = makevector(X_test)
        return X_train_vec, X_test_vec, y_train, y_test
    
    
    # k =15000정도 괜춘
    def feature_Select(self, X_train_vec, X_test_vec, y_train, y_test, k):
        # X벡터를 y값과 비교해 카이제곱검정으로 벡터 축소
        X_train_chi2 = SelectKBest(chi2, k = k).fit_transform(X_train_vec, y_train)
        X_test_chi2 = SelectKBest(chi2, k = k).fit_transform(X_test_vec, y_test)
        print(X_train_chi2.shape)
        print(X_test_chi2.shape)
        return X_train_chi2, X_test_chi2
    
    
''' 위 함수에 메소드 오버라이딩해보면 좋을 듯?
    super().feature_Select(X_train_vec, y,  k)   이렇게...?


    # k =15000정도 괜춘
    def feature_Select(self, X_train_vec, X_test_vec, y_train, y_test, k):
        # X벡터를 y값과 비교해 카이제곱검정으로 벡터 축소
        X_train_chi2 = SelectKBest(chi2, k = k).fit_transform(X_train_vec, y_train)
        X_test_chi2 = SelectKBest(chi2, k = k).fit_transform(X_test_vec, y_test)
        print(X_train_chi2.shape)
        print(X_test_chi2.shape)
        return X_train_chi2, X_test_chi2
'''
    
class classification(VectorFeature): 
    
    def NB_tfidf(self, X_train_vec, X_test_vec, y_train, y_test):
        MultinomialNB(alpha = 1.0, class_prior = None, fit_prior = True)
        clf_nb = MultinomialNB()
        clf_nb.fit(X_train_vec, y_train)
        
        #predicted = clf_tfidf.predict(X_test_vec)
        #print('tfidf로 변환한 예측률 :',np.mean(predicted == y_test))
        
        print('훈련 셋 검증결과 : {:.3f}.'.format(clf_nb.score(X_train, y_train)))
        print('테스트 셋 검증결과 : {:.3f}.'.format(clf_nb.score(X_test, y_test)))
        #print('UP/Down 예측값 : {:.3f}.'.format(clf_nb.predict_proba(X_test)))  만드는 중
        print(clf_nb.predict(X_train))
        

    # [과제]gamma는 0.1 / c는 10 디폴트로 설정해보자
    # 1param - Xdata / 2param - ydata / 3param - ratio / 4param - N일 뒤 예측(1,3,5,10)
    def SVC_tfidf(self, X_train_vec, X_test_vec, y_train, y_test):  # ratio - 0.2 / gamma - 0.1 / c - 10 디폴트하거나 파라미터 삭제
        clf_svc = SVC(gamma = 0.1, class_weight = "balanced", kernel = 'rbf', C = 10)
        clf_svc.fit(X_train_vec, y_train)
        
        print('훈련 셋 검증결과 : {:.3f}.'.format(clf_svc.score(X_train_vec, y_train)))
        print('테스트 셋 검증결과 : {:.3f}.'.format(clf_svc.score(X_test_vec, y_test)))
        #print(clf_svc.predict_proba(X_test))
        print(clf_svc.predict(X_train))
    
    
        # 퍼센트 안내멘트를 리턴하면 될듯?

In [None]:
    # param1 - data / param2 - testSet 비율
    # X_train과 X_test만 벡터화시키면 된다. y는 그냥 칼럼 슬라이싱만 하면 ok(아직은 y값 2진분류)
    def split_vectorize(df, ratio):        
        # Train data 벡터 + tf/idf
        X_train_counts = count_vect.fit_transform(X) 
        tfidf_transformer = TfidfTransformer(use_idf=True).fit(X_train_counts) 
        X_train_tfidf = tfidf_transformer.transform(X_train_counts)
        
        # Test data 벡터 + tf/idf
        X_test_counts = count_vect.transform(X)
        X_test_tfidf = tfidf_transformer.transform(X_test_counts)

        return (X_train_tfidf, X_test_tfidf, y_train, y_test)
        
    # 아직 y슬라이싱 안하고 집어넣음 =-> 수정 보완 ㄱㄱㄱㄱㄱㄱㄱㄱㄱㄱ 
    def NB_tfidf(df):
        clf_train = MultinomialNB().fit(X_train_tfidf, y_train)
        predicted_train = clf_tfidf.predict(X_train_tfidf)
        print('======== Train데이터 예측률 ========')
        print('tfidf로 변환한 예측률 :', np.mean(predicted_train == y_train))
        
        clf_test = MultinomialNB().fit(X_test_tfidf, y_test)
        print('======== Test데이터 예측률 ========')
        print('tfidf로 변환한 예측률 :', np.mean(clf_test == y_test))
        return 뭘 뽑아야 할까요`~?