In [1]:
from tqdm import tqdm
from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas()
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK resources this notebook relies on: the stop-word corpus and
# the perceptron POS tagger used by nltk.pos_tag.
# NOTE(review): word_tokenize normally also needs NLTK's sentence-tokenizer
# models ('punkt'), which are not downloaded here -- confirm they are already
# installed on the machine running this notebook.
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
from nltk.tokenize import word_tokenize
# English stop words as a set; used as the default `stopkey` of the
# preprocessing and keyword-extraction functions defined below.
stopWords = set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/zhangjun/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/zhangjun/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
# data preprocessing
def dataPrepos(text,stopkey = stopWords):
    """Tokenize `text` and keep only the content-bearing tokens.

    A token is kept when it is not in `stopkey` (exact, case-sensitive
    match) and its POS tag is one of the whitelisted tags below.

    Args:
        text: Raw text to tokenize.
        stopkey: Collection of tokens to discard; defaults to `stopWords`.

    Returns:
        List of the surviving tokens, in their original order and casing.
    """
    allowed_tags = {
        'JJ', 'NN', 'VB', 'PRP', 'WDT', 'WP', 'WRB', 'NNP', 'NNPS',
        'VBG', 'VBD', 'VBN', 'VBP', 'VBZ', 'RB',
    }
    tokens = word_tokenize(text)
    # pos_tag yields one (token, tag) pair per input token, so the two
    # sequences can be walked in lockstep.
    tagged = nltk.pos_tag(tokens)
    return [
        token
        for token, (_, tag) in zip(tokens, tagged)
        if token not in stopkey and tag in allowed_tags
    ]
# dataPrepos(text)

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Loda datasets
train_bodies = pd.read_csv('../fnc-1/train_bodies.csv') # 1683 bodies 
train_stances = pd.read_csv('../fnc-1/train_stances.csv') # 49972 headlines and stances 
competition_test_stances = pd.read_csv('../fnc-1/competition_test_stances.csv') # 25413 headlines and stances 
competition_test_bodies = pd.read_csv('../fnc-1/competition_test_bodies.csv') # 904 bodies
# Join datasets
training_set = train_stances.join(train_bodies.set_index('Body ID'), on='Body ID')
competition_set = competition_test_stances.join(competition_test_bodies.set_index('Body ID'), on='Body ID')

labels_int = ['agree', 'disagree', 'discuss', 'unrelated']

training_set = pd.DataFrame(training_set.loc[:,['Headline', 'articleBody','Stance','Body ID']])
training_set.columns = ['text_a', 'text_b', 'labels', 'Body ID']
training_set["labels"] = training_set["labels"].apply(lambda x: labels_int.index(x))
train_df, val_df = train_test_split(training_set, random_state = 0)
competition_set = pd.DataFrame(competition_set.loc[:,['Headline', 'articleBody','Stance', 'Body ID']])
competition_set.columns = ['text_a', 'text_b', 'labels', 'Body ID']
competition_set["labels"] = competition_set["labels"].apply(lambda x: labels_int.index(x))
labels_test = list(competition_set["labels"])

In [4]:
def getKeywords_tfidf_train(data,stopkey = stopWords,topK=20):
    """Fit a TF-IDF model on the article bodies and extract keywords per body.

    Each body in ``data['articleBody']`` is preprocessed with `dataPrepos`,
    turned into unigram/bigram counts, weighted with TF-IDF, and the terms
    with the highest weights are returned as that body's keywords.

    Args:
        data: Frame (or mapping) whose 'articleBody' entry holds the texts.
        stopkey: Stop words forwarded to `dataPrepos`.
        topK: Maximum number of keywords to return per document.

    Returns:
        A tuple ``(key_words_list, vectorizer, tfidf_transformer)`` where
        ``key_words_list[i]`` is the list of at most `topK` keywords of the
        i-th document, highest weight first, and the other two items are the
        fitted CountVectorizer and TfidfTransformer (to be reused on unseen
        data). Only terms with a non-zero weight are returned, so a document
        with fewer than `topK` distinct terms yields a shorter list instead of
        being padded with vocabulary words it does not contain.
    """
    abstractList = data['articleBody']
    # Text preprocessing: filter tokens, then re-join them for the vectorizer.
    corpus = [" ".join(dataPrepos(text, stopkey)) for text in tqdm(list(abstractList))]

    # 1. Construct the token occurrence matrix (unigrams and bigrams).
    stop_words = ['in', 'of', 'at', 'a', 'the']
    vectorizer = CountVectorizer(ngram_range=(1, 2), stop_words=stop_words, max_features = 500000)
    X = vectorizer.fit_transform(corpus)

    # 2. Calculate the TF-IDF weights.
    tfidf_transformer = TfidfTransformer()
    tfidf = tfidf_transformer.fit_transform(X)

    # 3. Vocabulary terms, indexed by column. get_feature_names() was removed
    #    in scikit-learn 1.2; fall back to it only on old versions.
    if hasattr(vectorizer, 'get_feature_names_out'):
        words = vectorizer.get_feature_names_out()
    else:
        words = vectorizer.get_feature_names()

    # 4. Rank terms one document at a time. Only a single row is densified at
    #    once; converting the whole matrix would need docs x vocabulary floats.
    key_words_list = []
    for row_idx in range(tfidf.shape[0]):
        w = tfidf[row_idx].toarray().ravel()
        # Slicing also protects against topK exceeding the vocabulary size.
        loc = np.argsort(-w)[:topK]
        # Zero weight means the term does not occur in this document.
        key_words_list.append([str(words[j]) for j in loc if w[j] > 0])
    return key_words_list, vectorizer, tfidf_transformer

In [5]:
def getKeywords_tfidf_test(data,vectorizer, tfidf_transformer, stopkey = stopWords,topK=20):
    """Extract keywords per body using an already fitted TF-IDF model.

    Each body in ``data['articleBody']`` is preprocessed with `dataPrepos`,
    vectorized with the given fitted `vectorizer`, weighted with the given
    fitted `tfidf_transformer`, and the highest-weighted terms are returned.

    Args:
        data: Frame (or mapping) whose 'articleBody' entry holds the texts.
        vectorizer: Fitted count vectorizer (only ``transform`` is called).
        tfidf_transformer: Fitted TF-IDF transformer (only ``transform`` is
            called).
        stopkey: Stop words forwarded to `dataPrepos`.
        topK: Maximum number of keywords to return per document.

    Returns:
        List with one entry per document: the list of at most `topK` keywords,
        highest weight first. Only terms with a non-zero weight are returned,
        so a document sharing few (or no) terms with the fitted vocabulary
        yields a short (or empty) list rather than unrelated vocabulary words.
    """
    abstractList = data['articleBody']
    # Text preprocessing: filter tokens, then re-join them for the vectorizer.
    corpus = [" ".join(dataPrepos(text, stopkey)) for text in tqdm(list(abstractList))]

    # 1. Token occurrence matrix over the vocabulary learned at fit time.
    X = vectorizer.transform(corpus)

    # 2. TF-IDF weights using the fitted transformer.
    tfidf = tfidf_transformer.transform(X)

    # 3. Vocabulary terms, indexed by column. get_feature_names() was removed
    #    in scikit-learn 1.2; fall back to it only on old versions.
    if hasattr(vectorizer, 'get_feature_names_out'):
        words = vectorizer.get_feature_names_out()
    else:
        words = vectorizer.get_feature_names()

    # 4. Rank terms one document at a time. Only a single row is densified at
    #    once; converting the whole matrix would need docs x vocabulary floats.
    key_words_list = []
    for row_idx in range(tfidf.shape[0]):
        w = tfidf[row_idx].toarray().ravel()
        # Slicing also protects against topK exceeding the vocabulary size.
        loc = np.argsort(-w)[:topK]
        # Zero weight means the term does not occur in this document.
        key_words_list.append([str(words[j]) for j in loc if w[j] > 0])
    return key_words_list

In [6]:
# Fit the count vectorizer and TF-IDF transformer on the training bodies, store
# each body's keywords in a new 'keywords' column, and keep the fitted objects
# for reuse on the competition bodies.
train_bodies['keywords'], vectorizer, tfidf_transformer = getKeywords_tfidf_train(train_bodies)

100%|██████████| 1683/1683 [00:32<00:00, 40.63it/s]


In [7]:
# Preview the first training bodies together with their extracted keywords.
train_bodies.head()

Unnamed: 0,Body ID,articleBody,keywords
0,0,A small meteorite crashed into a wooded area i...,"[meteorite, nicaragua, asteroid, crater, said,..."
1,4,Last week we hinted at what was to come as Ebo...,"[daily caller, caller, passenger, suit, airpor..."
2,5,(NEWSER) – Wonder how long a Quarter Pounder w...,"[burger, australians, dean, mcdonald, charity,..."
3,6,"Posting photos of a gun-toting child online, I...","[isis, child, cooper, mr cooper, boy, got mart..."
4,7,At least 25 suspected Boko Haram insurgents we...,"[damboa, boko haram, boko, haram, tada, tada s..."


In [8]:
# Extract keywords for the competition bodies with the vectorizer and TF-IDF
# transformer that were fitted on the training bodies (no refitting here).
competition_test_bodies['keywords'] = getKeywords_tfidf_test(competition_test_bodies, vectorizer, tfidf_transformer)

100%|██████████| 904/904 [00:16<00:00, 53.28it/s]


In [9]:
# Preview the first competition bodies together with their extracted keywords.
competition_test_bodies.head()

Unnamed: 0,Body ID,articleBody,keywords
0,1,Al-Sisi has denied Israeli reports stating tha...,"[sisi, al sisi, gaza strip, gaza, extend, stri..."
1,2,A bereaved Afghan mother took revenge on the T...,"[taliban, afghan, daughter, suicide bomber, ba..."
2,3,CNBC is reporting Tesla has chosen Nevada as t...,"[tesla, reno, nevada, musk, cnbc, chosen, cost..."
3,12,A 4-inch version of the iPhone 6 is said to be...,"[iphone, inch, entry level, entry, apple, new,..."
4,19,GR editor’s Note\n\nThere are no reports in th...,"[isil, coalition, us, iraqi, al, anbar, salahu..."


In [10]:
# Export data with extracted keywords.
# Both frames are written with to_csv's default settings; the target
# directory '../features/' is not created here.
train_bodies.to_csv('../features/train_bodies_topics.csv')
competition_test_bodies.to_csv('../features/competition_test_bodies_topics.csv')

In [11]:
# Show the training rows whose label is 0, i.e. 'agree' (index 0 in labels_int).
training_set[training_set['labels'] ==0 ]

Unnamed: 0,text_a,text_b,labels,Body ID
1,Hundreds of Palestinians flee floods in Gaza a...,Hundreds of Palestinians were evacuated from t...,0,158
5,'Nasa Confirms Earth Will Experience 6 Days of...,Thousands of people have been duped by a fake ...,0,154
8,Banksy 'Arrested & Real Identity Revealed' Is ...,If you’ve seen a story floating around on your...,0,1739
11,Woman detained in Lebanon is not al-Baghdadi's...,An Iraqi official denied that a woman detained...,0,1468
17,"No, Robert Plant Didn’t Rip Up an $800 Million...",Led Zeppelin fans will be disappointed to lear...,0,295
24,NET Extra: Back-from-the-dead Catholic priest ...,A 71 years old cleric Father John Micheal O’ne...,0,1014
25,Rumor debunked: RoboCop-style robots are not p...,Knightscope co-founder Stacy Stephens said rum...,0,633
36,Fisherman lands 19 STONE catfish which could b...,"Dino Ferrari hooked the whopper wels catfish, ...",0,2161
46,Student accidentally sets college on fire duri...,He popped the question — and burned down his c...,0,1592
55,Macaulay Culkin Hasn’t Died Despite What Every...,Claim: Actor Macaulay Culkin has died.\n\nFALS...,0,759


In [12]:
def ngrams_1_2(input, n=2):
    """Return the n-grams of `input` followed by its individual tokens.

    The text is lower-cased and split on single space characters (so
    consecutive spaces produce empty tokens).

    Args:
        input: Text to split.
        n: Size of the n-grams listed before the single tokens.

    Returns:
        List holding every run of `n` consecutive tokens joined by a space,
        in order, followed by all single tokens.
    """
    tokens = input.lower().split(' ')
    grams = [' '.join(tokens[start:start + n]) for start in range(len(tokens) - n + 1)]
    return grams + tokens

In [13]:
# Extract keyword features from the headlines
# Feature 1 (keywords_hit): 1 if any unigram or bigram of the headline is among the keywords of the body, 0 otherwise
# Feature 2 (keywords_hit_ratio): the fraction of the headline's unigrams and bigrams that are among the keywords of the body
def getKeywordsFeature(training_set, bodies=None):
    """Add keyword-overlap features between each headline and its body.

    For every row, the headline ('text_a') is expanded into bigrams plus
    unigrams with `ngrams_1_2` and compared against the keyword list of the
    body referenced by the row's 'Body ID'.

    Two columns are written into `training_set` (the frame is modified in
    place and also returned):
      * 'keywords_hit': 1 if at least one headline n-gram is a body keyword,
        otherwise 0.
      * 'keywords_hit_ratio': number of matching headline n-grams divided by
        the total number of headline n-grams.

    Args:
        training_set: Frame with 'text_a' and 'Body ID' columns. Rows are
            processed positionally, so the frame's index may be arbitrary
            (for example the output of a shuffled train/validation split).
        bodies: Frame with 'Body ID' and 'keywords' columns. Defaults to the
            module-level `train_bodies`; pass another frame to compute the
            features against a different set of bodies.

    Returns:
        The same `training_set` object with the two feature columns added.

    Raises:
        IndexError: If a row references a 'Body ID' that is absent from
            `bodies`.
    """
    if bodies is None:
        bodies = train_bodies

    # Build the Body ID -> keyword-set table once instead of filtering the
    # bodies frame for every headline. setdefault keeps the first occurrence
    # if a Body ID appears more than once.
    keywords_by_body = {}
    for body_id, keywords in zip(bodies['Body ID'], bodies['keywords']):
        keywords_by_body.setdefault(body_id, set(keywords))

    keywords_hit = []
    keywords_hit_ratio = []
    # Iterate over values rather than .loc[0..n-1] so that frames whose index
    # is not a contiguous range are handled correctly.
    for headline, body_id in zip(training_set['text_a'], training_set['Body ID']):
        if body_id not in keywords_by_body:
            raise IndexError('no keywords found for Body ID {!r}'.format(body_id))
        body_keywords = keywords_by_body[body_id]
        h_ngrams = ngrams_1_2(headline, 2)
        count = sum(1 for gram in h_ngrams if gram in body_keywords)
        # Guard against an empty n-gram list to avoid dividing by zero.
        ratio = count / len(h_ngrams) if h_ngrams else 0.0
        keywords_hit.append(1 if ratio > 0 else 0)
        keywords_hit_ratio.append(ratio)
    training_set['keywords_hit'] = keywords_hit
    training_set['keywords_hit_ratio'] = keywords_hit_ratio
    return training_set
# Add the keywords_hit / keywords_hit_ratio columns to the training set. The
# function writes them into the frame it is given and returns that same frame.
training_set = getKeywordsFeature(training_set)

In [14]:
# Inspect the first 20 rows, including the two new keyword feature columns.
training_set.head(20)

Unnamed: 0,text_a,text_b,labels,Body ID,keywords_hit,keywords_hit_ratio
0,Police find mass graves with at least '15 bodi...,Danny Boyle is directing the untitled film\n\n...,3,712,0,0.0
1,Hundreds of Palestinians flee floods in Gaza a...,Hundreds of Palestinians were evacuated from t...,0,158,1,0.095238
2,"Christian Bale passes on role of Steve Jobs, a...",30-year-old Moscow resident was hospitalized w...,3,137,0,0.0
3,HBO and Apple in Talks for $15/Month Apple TV ...,(Reuters) - A Canadian soldier was shot at the...,3,1034,0,0.0
4,Spider burrowed through tourist's stomach and ...,"Fear not arachnophobes, the story of Bunbury's...",1,1923,1,0.052632
5,'Nasa Confirms Earth Will Experience 6 Days of...,Thousands of people have been duped by a fake ...,0,154,1,0.333333
6,Accused Boston Marathon Bomber Severely Injure...,A British fighter who travelled to Iraq to sto...,3,962,0,0.0
7,Identity of ISIS terrorist known as 'Jihadi Jo...,"Adding to Apple's iOS 8 launch troubles, a rep...",3,2033,0,0.0
8,Banksy 'Arrested & Real Identity Revealed' Is ...,If you’ve seen a story floating around on your...,0,1739,1,0.08
9,British Aid Worker Confirmed Murdered By ISIS,The British Islamic State militant who has fea...,3,882,1,0.307692
