In [1]:
from requests_html import HTMLSession
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import accuracy_score

In [117]:
Corpus = pd.read_csv (r'all-data.csv', encoding='latin-1')

In [118]:
Corpus.columns = ['status', 'text']

In [119]:
Corpus.tail()

Unnamed: 0,status,text
4840,negative,LONDON MarketWatch -- Share prices ended lower...
4841,neutral,Rinkuskiai 's beer sales fell by 6.5 per cent ...
4842,negative,Operating profit fell to EUR 35.4 mn from EUR ...
4843,negative,Net sales of the Paper segment decreased to EU...
4844,negative,Sales in Finland decreased by 10.5 % in Januar...


In [120]:
Corpus = Corpus.dropna()

In [121]:
np.random.seed(500)

In [122]:
Corpus.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4845 entries, 0 to 4844
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   status  4845 non-null   object
 1   text    4845 non-null   object
dtypes: object(2)
memory usage: 113.6+ KB


In [123]:
# dataset 전처리 : inplace=true로 함으로써 Corpus 자체에 저장
Corpus['text'].dropna(inplace=True)
# dataset 전처리 : text를 소문자로 변환
Corpus['text'] = [entry.lower() for entry in Corpus['text']]

In [125]:
# dataset 전처리 : 각 텍스트 행에 대해 단어, 구, 등 의미있는 단위로 분할(토큰화)
Corpus['text']= [word_tokenize(entry) for entry in Corpus['text']]

In [129]:
# lemmenting/steming(문맥에 따른 어근분류)을 위한 전 과정 - wordnet(wn)에 명사,
# 형용사, 동사, 부사 태그 추가
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

In [130]:
# 자연어처리 : text 열의 각 행에 대한 토큰을 문맥에 따라 어근으로 분류하고 
# text_final열에 추가
for index,entry in enumerate(Corpus['text']):
    # 저장될 리스트 생성
    Final_words = []
    # WordNetLemmatizer() 초기화
    word_Lemmatized = WordNetLemmatizer()
    # nltk의 pos_tag함수를 통해 각각의 토큰에 대한 태그를 출력
    #(태그=명사(N), 형용사(J), 동사(V), 부사(R))  
    for word, tag in pos_tag(entry):
        # 각각의 토큰이 stop word(a, the, is, are 등)이고 문자이면 리스트에 추가
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    # Corpus의 새로운 열 text_final에 생성된 리스트 추가(태그에 해당하면 추가)
    Corpus.loc[index,'text_final'] = str(Final_words)

In [132]:
Corpus = Corpus.drop(Corpus[Corpus.status == 'neutral'].index)

In [133]:
Corpus.tail()

Unnamed: 0,status,text,text_final
4839,negative,"[helsinki, thomson, financial, -, shares, in, ...","['helsinki', 'thomson', 'financial', 'share', ..."
4840,negative,"[london, marketwatch, --, share, prices, ended...","['london', 'marketwatch', 'share', 'price', 'e..."
4842,negative,"[operating, profit, fell, to, eur, 35.4, mn, f...","['operating', 'profit', 'fell', 'eur', 'mn', '..."
4843,negative,"[net, sales, of, the, paper, segment, decrease...","['net', 'sale', 'paper', 'segment', 'decrease'..."
4844,negative,"[sales, in, finland, decreased, by, 10.5, %, i...","['sale', 'finland', 'decrease', 'january', 'sa..."


In [134]:
Corpus.reset_index(drop=True)

Unnamed: 0,status,text,text_final
0,negative,"[the, international, electronic, industry, com...","['international', 'electronic', 'industry', 'c..."
1,positive,"[with, the, new, production, plant, the, compa...","['new', 'production', 'plant', 'company', 'wou..."
2,positive,"[according, to, the, company, 's, updated, str...","['accord', 'company', 'updated', 'strategy', '..."
3,positive,"[financing, of, aspocomp, 's, growth, aspocomp...","['financing', 'aspocomp', 'growth', 'aspocomp'..."
4,positive,"[for, the, last, quarter, of, 2010, ,, compone...","['last', 'quarter', 'componenta', 'net', 'sale..."
...,...,...,...
1962,negative,"[helsinki, thomson, financial, -, shares, in, ...","['helsinki', 'thomson', 'financial', 'share', ..."
1963,negative,"[london, marketwatch, --, share, prices, ended...","['london', 'marketwatch', 'share', 'price', 'e..."
1964,negative,"[operating, profit, fell, to, eur, 35.4, mn, f...","['operating', 'profit', 'fell', 'eur', 'mn', '..."
1965,negative,"[net, sales, of, the, paper, segment, decrease...","['net', 'sale', 'paper', 'segment', 'decrease'..."


In [135]:
# 30퍼센트는 테스트 데이터, 나머지 70퍼센트는 트레이닝 데이터로 데이터셋 분류
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(Corpus['text_final'],Corpus['status'],test_size=0.3)

In [136]:
# 각 테스트 데이터에 대한 y값(현재는 label1 또는 label2)
Train_Y.head()

4795    negative
91      positive
1731    positive
1764    positive
4573    negative
Name: status, dtype: object

In [137]:
# 각 테스트 데이터에 대한 x값(현재는 문자)
Train_X.head()

4795    ['sale', 'unit', 'slump', 'last', 'year', 'ind...
91      ['february', 'finnish', 'broadband', 'data', '...
1731    ['developer', 'project', 'predict', 'complete'...
1764    ['new', 'policy', 'also', 'aim', 'make', 'comp...
4573    ['result', 'also', 'burden', 'increased', 'fix...
Name: text_final, dtype: object

In [138]:
# y값을 0또는 1로 인코딩 (binary classification)
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)

In [139]:
Train_Y

array([0, 1, 1, ..., 0, 1, 1])

In [140]:
# TF-IDF(Term Frequency — Inverse Document Frequency)에 
# text_final에 저장된 자연어처리된 단어들을 학습시킨다.
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(Corpus['text_final'])

Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [141]:
# Tfidf_vect가 Corps의 text_final을 통해 학습한 단어
print(Tfidf_vect.vocabulary_)



In [142]:
# Train_X_Tfidf의 (행 인덱스, 고유숫자) TF-IDF에 의해 계산된 값
# (해당 행에서의 해당 단어 빈도(TF) * log(해당단어가 존재하는 행 수/전체 Corpus의 행수)
print(Train_X_Tfidf)

  (0, 3882)	0.15032775421199263
  (0, 3851)	0.35415951551033986
  (0, 3685)	0.2340443080737865
  (0, 3200)	0.3733414874572969
  (0, 3188)	0.3733414874572969
  (0, 2981)	0.1382267577611237
  (0, 2883)	0.35415951551033986
  (0, 2583)	0.32136771787663004
  (0, 2009)	0.28478921469798174
  (0, 1855)	0.2140395577566651
  (0, 1594)	0.23530594454305345
  (0, 1491)	0.297201291249725
  (1, 3496)	0.2999617910200092
  (1, 3428)	0.22825278025440984
  (1, 3015)	0.322252285440226
  (1, 2728)	0.17612235750946675
  (1, 2702)	0.26151177007894966
  (1, 2671)	0.1375692720329122
  (1, 2504)	0.16480288516621236
  (1, 2426)	0.18133473863586042
  (1, 2249)	0.1495527932410069
  (1, 1855)	0.22825278025440984
  (1, 1724)	0.3121284342897002
  (1, 1474)	0.21395712603782366
  (1, 1220)	0.14653730445335814
  :	:
  (1372, 966)	0.3843136826602033
  (1373, 3882)	0.16710815251388816
  (1373, 3732)	0.2865743167167667
  (1373, 2687)	0.32536420405856203
  (1373, 1661)	0.25880356634203794
  (1373, 1371)	0.2743840084487467
 

In [143]:
# c-SVC의 c값은 1, kernel은 linear (c값이 커지면 오차 허용치 감소됨 -> OVER FIT, 작아지면 오차 허용치 증가 -> UNDER FIT)
# degree는 필요없음(poly 커널일 경우 최고차항 설정 파라미터)
# gamma는 필요없음(‘rbf’, ‘poly’ and ‘sigmoid’ 커널의 경우 감마값이 클수록 곡선의 굴절률 상승)
SVM = svm.SVC(C=1.0, kernel='linear')
SVM.fit(Train_X_Tfidf,Train_Y)

SVC(kernel='linear')

In [144]:
# test 데이터셋에 대한 c-svc의 예측
predictions_SVM = SVM.predict(Test_X_Tfidf)

In [145]:
# test 데이터셋에 대한 c-svc의 예측 정확도
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)

SVM Accuracy Score ->  83.92554991539764


In [151]:
headline = pd.read_csv (r'1109news_headline.csv', encoding='latin-1')

In [152]:
headline.head()

Unnamed: 0.1,Unnamed: 0,news_title
0,0,"""Ethereum when:1d"" - Google News"
1,1,Bitcoin and Ethereum prices jump to new record...
2,2,Crypto Price Prediction: Bitcoin Could Lose It...
3,3,"Bitcoin, Ether prices hit new all-time records..."
4,4,Discord CEO Hints at Ethereum Compatibility - ...


In [153]:
# dataset 전처리 : inplace=true로 함으로써 Corpus 자체에 저장
headline['news_title'].dropna(inplace=True)
# dataset 전처리 : text를 소문자로 변환
headline['news_title'] = [entry.lower() for entry in headline['news_title']]
# dataset 전처리 : 각 텍스트 행에 대해 단어, 구, 등 의미있는 단위로 분할(토큰화)
headline['news_title']= [word_tokenize(entry) for entry in headline['news_title']]

In [154]:
headline.head()

Unnamed: 0.1,Unnamed: 0,news_title
0,0,"[``, ethereum, when:1d, '', -, google, news]"
1,1,"[bitcoin, and, ethereum, prices, jump, to, new..."
2,2,"[crypto, price, prediction, :, bitcoin, could,..."
3,3,"[bitcoin, ,, ether, prices, hit, new, all-time..."
4,4,"[discord, ceo, hints, at, ethereum, compatibil..."


In [155]:
# lemmenting/steming(문맥에 따른 어근분류)을 위한 전 과정 - wordnet(wn)에 명사,
# 형용사, 동사, 부사 태그 추가
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
# 자연어처리 : text 열의 각 행에 대한 토큰을 문맥에 따라 어근으로 분류하고 
# text_final열에 추가
for index,entry in enumerate(headline['news_title']):
    # 저장될 리스트 생성
    Final_words = []
    # WordNetLemmatizer() 초기화
    word_Lemmatized = WordNetLemmatizer()
    # nltk의 pos_tag함수를 통해 각각의 토큰에 대한 태그를 출력
    #(태그=명사(N), 형용사(J), 동사(V), 부사(R))  
    for word, tag in pos_tag(entry):
        # 각각의 토큰이 stop word(a, the, is, are 등)이고 문자이면 리스트에 추가
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    # Corpus의 새로운 열 text_final에 생성된 리스트 추가(태그에 해당하면 추가)
    headline.loc[index,'text_final'] = str(Final_words)

In [156]:
headline.head()

Unnamed: 0.1,Unnamed: 0,news_title,text_final
0,0,"[``, ethereum, when:1d, '', -, google, news]","['ethereum', 'google', 'news']"
1,1,"[bitcoin, and, ethereum, prices, jump, to, new...","['bitcoin', 'ethereum', 'price', 'jump', 'new'..."
2,2,"[crypto, price, prediction, :, bitcoin, could,...","['crypto', 'price', 'prediction', 'bitcoin', '..."
3,3,"[bitcoin, ,, ether, prices, hit, new, all-time...","['bitcoin', 'ether', 'price', 'hit', 'new', 'r..."
4,4,"[discord, ceo, hints, at, ethereum, compatibil...","['discord', 'ceo', 'hint', 'ethereum', 'compat..."


In [159]:
target_x = headline['text_final']

In [160]:
target_x

0                        ['ethereum', 'google', 'news']
1     ['bitcoin', 'ethereum', 'price', 'jump', 'new'...
2     ['crypto', 'price', 'prediction', 'bitcoin', '...
3     ['bitcoin', 'ether', 'price', 'hit', 'new', 'r...
4     ['discord', 'ceo', 'hint', 'ethereum', 'compat...
                            ...                        
95    ['discord', 'may', 'soon', 'add', 'ethereum', ...
96              ['ethereum', 'supply', 'could', 'drop']
97    ['barry', 'sternlicht', 'invests', 'bitcoin', ...
98    ['ckb', 'nervos', 'fly', 'compatibility', 'eth...
99      ['first', 'bitcoin', 'etf', 'ethereum', 'next']
Name: text_final, Length: 100, dtype: object

In [161]:
# TF-IDF(Term Frequency — Inverse Document Frequency)에 
# text_final에 저장된 자연어처리된 단어들을 학습시킨다.
Tfidf_vect_target = TfidfVectorizer(max_features=5000)
Tfidf_vect_target.fit(headline['text_final'])

Target_X_Tfidf = Tfidf_vect.transform(target_x)

In [162]:
# Tfidf_vect가 Corps의 text_final을 통해 학습한 단어
print(Tfidf_vect_target.vocabulary_)

{'ethereum': 160, 'google': 211, 'news': 311, 'bitcoin': 46, 'price': 344, 'jump': 244, 'new': 310, 'record': 354, 'cnn': 90, 'crypto': 110, 'prediction': 342, 'could': 106, 'lose': 264, 'crown': 109, 'even': 163, 'sudden': 417, 'surge': 421, 'toward': 442, 'forbes': 187, 'ether': 159, 'hit': 216, 'reason': 352, 'euronews': 161, 'discord': 136, 'ceo': 77, 'hint': 215, 'compatibility': 101, 'coindesk': 94, 'killer': 246, 'metaverse': 285, 'token': 438, 'see': 376, 'next': 315, 'millionaire': 288, 'maker': 274, 'nasdaq': 303, 'name': 302, 'already': 11, 'fully': 192, 'dilute': 135, 'cap': 71, 'cointelegraph': 97, 'defi': 130, 'gain': 198, 'chase': 81, 'high': 214, 'dogecoin': 139, 'monday': 292, 'motley': 296, 'fool': 186, 'classic': 85, 'shoot': 386, 'today': 436, 'benzinga': 39, 'facebook': 173, 'go': 210, 'meta': 284, 'competitor': 102, 'soar': 395, 'seven': 382, 'day': 124, 'reach': 350, 'market': 275, 'daily': 121, 'hodl': 217, 'neon': 307, 'lab': 250, 'developer': 134, 'evm': 167, 

In [163]:
# Train_X_Tfidf의 (행 인덱스, 고유숫자) TF-IDF에 의해 계산된 값
# (해당 행에서의 해당 단어 빈도(TF) * log(해당단어가 존재하는 행 수/전체 Corpus의 행수)
print(Target_X_Tfidf)

  (0, 2265)	0.5598722901943177
  (0, 1377)	0.8285789151749937
  (1, 2796)	0.5346154585879205
  (1, 2637)	0.45494416823042466
  (1, 2259)	0.3962115208245034
  (1, 1724)	0.5918011034105028
  (2, 3584)	0.4379303614248742
  (2, 3395)	0.3544645247805835
  (2, 2637)	0.26393223567231444
  (2, 1967)	0.3544645247805835
  (2, 1093)	0.3769650188049277
  (2, 772)	0.41542986740053
  (2, 750)	0.41542986740053
  (3, 2796)	0.4331108737114239
  (3, 2779)	0.5801241668676873
  (3, 2637)	0.36856634619702405
  (3, 2259)	0.320984953207468
  (3, 1491)	0.4868248456625583
  (4, 517)	1.0
  (5, 3060)	0.48228570256331155
  (5, 2270)	0.508397844296958
  (5, 2222)	0.5964370836967795
  (5, 2012)	0.3914012496256238
  (6, 2259)	0.26748682941054847
  (6, 2216)	0.4648579475946764
  :	:
  (92, 3474)	0.5400440534570586
  (92, 3333)	0.5545386030398953
  (92, 3129)	0.5078096089024533
  (92, 2637)	0.37811209868313955
  (93, 3562)	0.5832033210741981
  (93, 2637)	0.5207359169044515
  (93, 450)	0.6234645067152625
  (94, 1659)	0

In [164]:
target_predictions_SVM = SVM.predict(Target_X_Tfidf)

In [165]:
print(target_predictions_SVM)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 0 0 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1]


In [166]:
export_dataset = pd.read_csv (r'1109news_headline.csv', encoding='latin-1')

In [168]:
export_dataset['predicted_status'] = target_predictions_SVM

In [170]:
export_dataset

Unnamed: 0.1,Unnamed: 0,news_title,predicted_status
0,0,"""Ethereum when:1d"" - Google News",1
1,1,Bitcoin and Ethereum prices jump to new record...,1
2,2,Crypto Price Prediction: Bitcoin Could Lose It...,1
3,3,"Bitcoin, Ether prices hit new all-time records...",1
4,4,Discord CEO Hints at Ethereum Compatibility - ...,1
...,...,...,...
95,95,"Discord May Soon Add Ethereum (ETH) Support, T...",1
96,96,Ethereum 2.0: supply could drop by 1% - D1Soft...,0
97,97,Barry Sternlicht invests in Bitcoin and Ethere...,1
98,98,$ CKB of Nervos flies | Compatibility with the...,1
