In [31]:
import spacy
import pandas as pd
import numpy as np
import math
import string
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from datetime import datetime
import datetime as dt
from sklearn.linear_model import LinearRegression

In [32]:
# Load the news CSV into `articles`; later cells read its symbol, contents and create_dttm columns.
articles = pd.read_csv('spiders/data/news/US_T_ALL.csv')

In [33]:
# Helper function definitions
# Text clean-up: strips punctuation and digits (stop words are removed by the caller)
def clean_text(corpus):
    """Remove ASCII punctuation and digits from ``corpus`` and normalise spacing.

    Parameters
    ----------
    corpus : str
        Text to clean.

    Returns
    -------
    str
        ``corpus`` without any character from ``string.punctuation`` or
        ``string.digits``. Every run of whitespace is collapsed to a single
        space and leading/trailing whitespace is dropped.
    """
    # One translation table deletes punctuation and digits in a single pass.
    unwanted = str.maketrans('', '', string.punctuation + string.digits)
    corpus = corpus.translate(unwanted)

    # Collapse whitespace only AFTER both removals. Deleting a stand-alone
    # punctuation mark or number leaves a gap of several spaces; the gap must
    # shrink to one space, never to nothing, otherwise the neighbouring words
    # are glued together ("a - - b" must become "a b", not "ab").
    return " ".join(corpus.split())

# TF-IDF computation; begin and end are the n-gram range bounds, counted in words.
# e.g. (2, 2) uses only two-word n-grams, (1, 2) uses one- and two-word n-grams,
# and (2, 5) uses n-grams of two to five words.
def make_tfidf_df(sent_lst, begin, end):
    """Return the TF-IDF matrix of ``sent_lst`` as a DataFrame.

    Each row is one document of ``sent_lst`` and each column is one n-gram of
    the fitted vocabulary, ordered by the vectorizer's column index.
    """
    vectorizer = TfidfVectorizer(ngram_range=(begin, end), binary=False)
    vectorizer = vectorizer.fit(sent_lst)
    weights = vectorizer.transform(sent_lst).toarray()

    # vocabulary_ maps term -> column index; ordering the terms by that index
    # lines the labels up with the columns of `weights`.
    vocabulary = vectorizer.vocabulary_
    ordered_terms = sorted(vocabulary, key=vocabulary.get)
    return pd.DataFrame(weights, columns=ordered_terms)

In [34]:
# Load the spaCy English pipeline; later cells use it for part-of-speech tagging.
nlp = spacy.load("en_core_web_md")
# The pipeline's default stop-word collection, used to filter article words.
spacy_stopwords = nlp.Defaults.stop_words

In [74]:
# Keep only the rows whose symbol is 'AAPL' and renumber the index from 0.
articles = articles.loc[articles.symbol=='AAPL'].reset_index(drop=True)

In [72]:
# Build the per-date frame: one row per distinct article creation date (the
# first 8 characters of create_dttm), sorted ascending, with an empty TEXT
# column that a later cell fills in.
# The frame is assembled in one shot because DataFrame.append was deprecated in
# pandas 1.4 and removed in 2.0, and growing a frame row by row copies it on
# every iteration.
unique_dates = sorted({stamp[:8] for stamp in articles['create_dttm']})
df = pd.DataFrame(
    {'DATE': unique_dates, 'TEXT': [""] * len(unique_dates)},
    columns=['DATE', 'TEXT'],
)

In [76]:
# Preview the date and contents columns of the AAPL rows.
articles.loc[articles.symbol=='AAPL',['date','contents']]

Unnamed: 0,date,contents
0,20220103,Two iPhone 13s are shown at different angles G...
1,20220103,Investment company Rempart Asset Management In...
2,20220104,BlackBerry devices running the original operat...
3,20220104,"Stocks in focus in premarket trade on Tuesday,..."
4,20220104,Apple stock (NASDAQ:AAPL) hit the $3 trillion ...
5,20220104,This article first appeared in the Morning Bri...
6,20220104,Apple Inc supplier Foxconn is unlikely to reop...
7,20220104,It took a jury seven full days to conclude tha...
8,20220104,Apple Inc's (NASDAQ:AAPL) App Store revenue gr...
9,20220104,"Delivery times for chips rose in December, sig..."


In [77]:
# Display the per-date frame (DATE / TEXT columns).
df

Unnamed: 0,DATE,TEXT
0,20220103,
1,20220104,
2,20220105,
3,20220106,
4,20220107,
5,20220110,
6,20220111,


In [78]:
# Print the row index of `articles` to check how many rows it holds.
print(articles.index)

RangeIndex(start=0, stop=49, step=1)


In [90]:
# Build one noun-only document per date row of df, compute its TF-IDF matrix
# and save the matrix to CSV.

wlem = WordNetLemmatizer()

# Accumulate into a fresh Series instead of appending to df['TEXT'] in place,
# so re-running this cell does not add the same words a second time.
window_texts = pd.Series([""] * len(df), index=df.index, dtype=object)

for i in articles.index:
    if articles['symbol'][i] != 'AAPL':
        continue

    # Lower-case, drop spaCy stop words, then strip punctuation and digits.
    raw_text = articles['contents'][i].lower()
    kept_words = [word for word in raw_text.split() if word not in spacy_stopwords]
    ctext = clean_text(" ".join(kept_words))

    # Window bounds as 'YYYYMMDD' strings: from 7 days before the article's
    # creation date up to the creation date itself. Strings in this format
    # compare in chronological order.
    created = datetime.strptime(articles['create_dttm'][i][:8], '%Y%m%d')
    e_time = created.strftime("%Y%m%d")
    b_time = (created - dt.timedelta(days=7)).strftime("%Y%m%d")

    # Keep only lemmatized nouns and proper nouns. Every kept word carries a
    # leading space so the result can be appended to an existing document.
    doc = nlp(ctext)
    clean_words = "".join(
        ' ' + wlem.lemmatize(token.text)
        for token in doc
        if token.pos_ in ("NOUN", "PROPN")
    )

    # Add the words to every date row inside [b_time, e_time].
    # NOTE(review): a date row therefore collects articles created on that date
    # and during the 7 days AFTER it - confirm this forward-looking window is
    # intended rather than a trailing one.
    # A boolean mask with .loc replaces the former chained assignment
    # df['TEXT'][d] = ..., which is not guaranteed to write back to df.
    in_window = (df['DATE'] >= b_time) & (df['DATE'] <= e_time)
    window_texts.loc[in_window] = window_texts.loc[in_window] + clean_words

df['TEXT'] = window_texts
df = df.dropna()

abs_tfidf_df = make_tfidf_df(df['TEXT'], 1, 1)

# Transpose: rows become terms, columns become documents. The columns are
# labelled positionally (0..n-1) in df row order, not by date.
abs_tfidf_df = abs_tfidf_df.transpose()

today_tag = datetime.today().strftime("%y%m%d")
abs_tfidf_df.to_csv('spiders/data/US_AAPL_' + today_tag + '_tfidf.csv')
print("파일 저장 완료")

파일 저장 완료


In [94]:
# Trend calculation: for every term, fit a line to each sliding window of
# `dur` consecutive TF-IDF columns and record its slope, then also record the
# running sum of those slopes.

word_num = len(abs_tfidf_df)
week_num = abs_tfidf_df.shape[1]
dur = 2

# Regressor values 0..dur-1 for one window, derived from dur so the window
# length and the regressor cannot drift apart.
x = pd.Series(range(dur))
x_column = x.values.reshape(-1, 1)

results = []
acc_res = []

for i in range(word_num):
    word = abs_tfidf_df.index[i]
    slopes = []
    # NOTE(review): the range stops at week_num - dur (exclusive), so the last
    # full window starting at column week_num - dur is never fitted, and the
    # start offset week_num % dur skips the first window when the column count
    # is odd. Kept as in the original - confirm whether this is intended.
    for j in range((week_num % dur), (week_num - dur)):
        y = abs_tfidf_df.iloc[i, j:(j + dur)]

        line_fitter = LinearRegression()
        line_fitter.fit(x_column, y)
        slopes.append(line_fitter.coef_[0])

    # Build two independent rows. Appending one list to both outputs and then
    # accumulating it in place would make the raw-slope table identical to the
    # cumulative one.
    results.append([word] + slopes)
    acc_res.append([word] + np.cumsum(slopes).tolist())

regression_df = pd.DataFrame(results)
acc_regression_df = pd.DataFrame(acc_res)

today_tag = datetime.today().strftime("%y%m%d")
regression_df.to_csv('spiders/data/US_AAPL_' + today_tag + '_regression.csv')
acc_regression_df.to_csv('spiders/data/US_AAPL_' + today_tag + '_regression_acc.csv')

print("추세 파일 생성 완료")

추세 파일 생성 완료
