In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
import string
from gensim.models import Word2Vec
import gensim

In [2]:
df = pd.read_csv("data/Combined_News_DJIA.csv")

In [3]:
def normalizer(raw_string):
    """
    input:  string raw
    :return: clean string that only contain letter and number, and remove stop word.
    """
    raw_string = raw_string[1:]  # Remove b in every first line
    wordnet_lemmatizer = WordNetLemmatizer()
    remove_tag = re.sub(r'@\w+', "", raw_string)
    remove_punctuation_map = dict.fromkeys(map(ord, string.punctuation))
    raw_string = remove_tag.translate(remove_punctuation_map)
    only_word = re.sub("[^a-zA-Z0-9]", " ", raw_string)
    tokens = nltk.word_tokenize(only_word)
    lower_case = [l.lower() for l in tokens]
    filtered_result = list(filter(lambda l: l not in stop_words, lower_case))
    lemmas = [wordnet_lemmatizer.lemmatize(t) for t in filtered_result]
    clean_text = " ".join(lemmas)
    
    return clean_text

# Normalize data and Init variable

In [4]:
df[df.columns[2:]] = df[df.columns[2:]].applymap(lambda x: normalizer(x) if isinstance(x, str) else str(x))

In [5]:
sentences_terms = df.iloc[:, 2:].values
sentences_terms = sentences_terms.flatten()

In [6]:
sentences = ""
for sentence in sentences_terms:
    sentences += str(sentence)

In [7]:
df['Sentence'] = df[df.columns[2:]].apply(np.sum, axis = 1)

In [8]:
df = df[['Date', "Label", "Sentence"]]

In [9]:
df.head()

Unnamed: 0,Date,Label,Sentence
0,2008-08-08,0,georgia down two russian warplane country move...
1,2008-08-11,1,wont america nato help u wont help u help iraq...
2,2008-08-12,0,remember adorable 9yearold sang opening ceremo...
3,2008-08-13,0,u refuse israel weapon attack iran reportpresi...
4,2008-08-14,1,expert admit legalise drugwar south osetia 89 ...


# Get BoK and Category Tag vocab

In [10]:
def process_line(sentences):
    for sentence in sentences:
        yield gensim.utils.simple_preprocess (sentence)
sentences_terms_bag_of_word = list(process_line(sentences_terms))

In [11]:
model=Word2Vec(sentences_terms_bag_of_word, min_count=2)

In [12]:
bag_of_keywords=set(['rise','drop','fall','gain','surge','shrink','jump','slump'])
stop=False
bok_size=1000
for i in range(10):
    new_words=[]
    if stop: 
        break
    for k in bag_of_keywords:
        if k in model.wv.vocab.keys():
            new_words.extend(model.most_similar(k))
    for n in new_words:
        if n[0].islower() and len(n[0])>3 and n[0].isalpha():
            bag_of_keywords.add(n[0])
            if len(bag_of_keywords)==bok_size:
                stop=True
                break

bag_of_keywords=np.array(list(bag_of_keywords))

  # Remove the CWD from sys.path while we load stuff.


In [13]:
category_tags=set(['published','presented','unveil','investment','bankrupt','government','acquisition','suit'])
stop=False
cate_size=100
for i in range(10):
    new_words=[]
    if stop:break
    for k in category_tags:
        if k in model.wv.vocab.keys():
            new_words.extend(model.most_similar(k))
    for n in new_words:
        if n[0].islower() and len(n[0])>3 and n[0].isalpha():
            category_tags.add(n[0])
            if len(category_tags)==cate_size:
                stop=True
                break
                
category_tags=np.array(list(category_tags))

  if __name__ == '__main__':


In [25]:
category_tags

array(['leftist', 'cameron', 'racist', 'presented', 'dialogue', 'nesco',
       'technique', 'complain', 'position', 'match', 'chicago', 'faced',
       'frank', 'owner', 'condemned', 'practice', 'deficit', 'taboo',
       'newly', 'tone', 'poland', 'cotland', 'sweden', 'maple', 'lobal',
       'historical', 'estonia', 'recall', 'acquisition', 'giant', 'award',
       'enclave', 'play', 'enjoy', 'annexation', 'suit', 'demolish',
       'unveil', 'project', 'cited', 'oregon', 'spyware', 'demolition',
       'david', 'elton', 'avid', 'permanently', 'jimmy', 'criticised',
       'treat', 'academic', 'statehood', 'intimidated', 'haaretz',
       'follow', 'offshore', 'awarded', 'shock', 'kicked', 'abbott',
       'inlands', 'published', 'escalate', 'song', 'surgeon', 'eviction',
       'bankrupt', 'opposed', 'populist', 'renewables', 'parliament',
       'invite', 'government', 'investment', 'express', 'anchor',
       'indicted', 'association', 'currency', 'error', 'strongly',
       'con

# Create Tf-idf by BoK vocab

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

bok_tfidf = TfidfVectorizer(lowercase=False,min_df=1,use_idf=True,vocabulary=bag_of_keywords)

bok_count = CountVectorizer(min_df=1,vocabulary=bag_of_keywords)
category_count = CountVectorizer(min_df=1,vocabulary=category_tags)

In [16]:
X_bok_tfidf=bok_tfidf.fit_transform(df['Sentence'])
X_bok_tfidf = pd.DataFrame(X_bok_tfidf.toarray(), columns=bok_tfidf.get_feature_names())
X_bok_tfidf.head()

Unnamed: 0,rchaeologists,normally,bunker,structural,stuck,promising,book,temperature,jump,musician,...,fracking,overfishing,attract,staggering,expensive,enewable,youngest,visitor,drink,extensive
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
res = pd.concat([df, X_bok_tfidf], axis=1)
res.head()

Unnamed: 0,Date,Label,Sentence,rchaeologists,normally,bunker,structural,stuck,promising,book,...,fracking,overfishing,attract,staggering,expensive,enewable,youngest,visitor,drink,extensive
0,2008-08-08,0,georgia down two russian warplane country move...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2008-08-11,1,wont america nato help u wont help u help iraq...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2008-08-12,0,remember adorable 9yearold sang opening ceremo...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2008-08-13,0,u refuse israel weapon attack iran reportpresi...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2008-08-14,1,expert admit legalise drugwar south osetia 89 ...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Caculate Popularity Score

In [18]:
def pmi_score(freg_w_label, n, freg_w, freq_label):
    return np.log((freg_w_label * n) / (freg_w * freq_label))

In [19]:
sum_df = df.groupby(['Label']).Sentence.sum()
count_df = df.groupby(['Label']).Sentence.count()

neg_sentences = [sum_df[0]]
pos_sentences = [sum_df[1]]

freg_neg_sentences = count_df[0]
freg_pos_sentences = count_df[1]

freg_neg_word = bok_count.fit_transform(neg_sentences).toarray()
freg_pos_word = bok_count.fit_transform(pos_sentences).toarray()
freq_word = bok_count.fit_transform([sentences]).toarray()

pmi_neg = pmi_score(freg_neg_word, df.Sentence.count(), freq_word, freg_neg_sentences)
pmi_pos = pmi_score(freg_pos_word, df.Sentence.count(), freq_word, freg_pos_sentences)
ps = pmi_pos - pmi_neg
ps.shape

  
  


(1, 1000)

In [20]:
tfidf = res.iloc[:, 3:].values
result = tfidf * ps
temp = pd.DataFrame(result, columns=bok_tfidf.get_feature_names())
res.iloc[:, 3:] = temp
res.head()

  


Unnamed: 0,Date,Label,Sentence,rchaeologists,normally,bunker,structural,stuck,promising,book,...,fracking,overfishing,attract,staggering,expensive,enewable,youngest,visitor,drink,extensive
0,2008-08-08,0,georgia down two russian warplane country move...,,0.0,-0.0,0.0,0.0,0.0,0.0,...,-0.0,-0.0,-0.0,0.0,0.0,,-0.0,-0.0,0.0,-0.0
1,2008-08-11,1,wont america nato help u wont help u help iraq...,,0.0,-0.0,0.0,0.0,0.0,0.0,...,-0.0,-0.0,-0.0,0.0,0.0,,-0.0,-0.0,0.0,-0.0
2,2008-08-12,0,remember adorable 9yearold sang opening ceremo...,,0.0,-0.0,0.0,0.0,0.0,0.0,...,-0.0,-0.0,-0.0,0.0,0.0,,-0.0,-0.0,0.0,-0.0
3,2008-08-13,0,u refuse israel weapon attack iran reportpresi...,,0.0,-0.0,0.0,0.0,0.0,0.0,...,-0.0,-0.0,-0.0,0.0,0.0,,-0.0,-0.0,0.0,-0.0
4,2008-08-14,1,expert admit legalise drugwar south osetia 89 ...,,0.0,-0.0,0.0,0.0,0.0,0.0,...,-0.0,-0.0,-0.0,0.0,0.0,,-0.0,-0.0,0.0,-0.0


In [21]:
def process_category_tag(sentence):
    freq_category = category_count.fit_transform(sentence).toarray()
    freq_category[freq_category == 0] = -99
    freq_category = np.array(list(map(np.log, freq_category)))
    return freq_category
    
sentences_row = res.Sentence.values
category_array = process_category_tag(sentences_row)
category_pd = pd.DataFrame(category_array, columns=category_count.get_feature_names())
category_pd = category_pd.fillna(0)
category_pd.head()

  after removing the cwd from sys.path.


Unnamed: 0,leftist,cameron,racist,presented,dialogue,nesco,technique,complain,position,match,...,surplus,discrimination,sector,carlos,swap,golden,accepted,stimulus,party,proper
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Merge all dimension and clean NaN value

In [22]:
res = pd.concat([res, category_pd], axis=1)
res.drop(["Sentence", "Date"], axis=1, inplace=True)
res.replace([np.inf, -np.inf, -0.0], np.nan, inplace=True)
res.fillna(0, inplace=True)
res.head()

Unnamed: 0,Label,rchaeologists,normally,bunker,structural,stuck,promising,book,temperature,jump,...,surplus,discrimination,sector,carlos,swap,golden,accepted,stimulus,party,proper
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
res.to_csv("data/data_clean_test.csv", index=False)