In [1]:
import csv

In [5]:
# create a class to read the dataset
class Dataset():
  """Load the stance and article-body CSV files for a train and a test split.

  On construction all four files are read eagerly and the row counts are
  printed. The parsed rows are exposed as lists of lists of strings:

    train_stances / test_stances: [body_id, headline, stance]
    train_bodies  / test_bodies:  [body_id, article_body]
  """

  def __init__(self, train_stance, test_stance, train_body, test_body):
    """Remember the four file paths, read them, and print the row counts.

    Args:
      train_stance: path to the training stances CSV.
      test_stance: path to the test stances CSV.
      train_body: path to the training bodies CSV.
      test_body: path to the test bodies CSV.
    """
    self.train_stance = train_stance
    self.test_stance = test_stance
    self.train_body = train_body
    self.test_body = test_body

    print("Dataset length:")

    self.train_stances = self.read_stance(self.train_stance)
    self.test_stances = self.read_stance(self.test_stance)
    self.train_bodies = self.read_body(self.train_body)
    self.test_bodies = self.read_body(self.test_body)

    print("Total train stances: " + str(len(self.train_stances)))
    print("Total test stances: " + str(len(self.test_stances)))
    print("Total train bodies: " + str(len(self.train_bodies)))
    print("Total test bodies: " + str(len(self.test_bodies)))

  def _read_columns(self, path, columns):
    """Return one list per CSV record holding the values of `columns`, in order.

    Raises:
      KeyError: if a requested column is missing from the file header.
    """
    # newline='' lets the csv module do its own line-ending handling, which is
    # required for quoted fields that contain embedded line breaks.
    # errors='ignore' is kept on purpose: undecodable bytes are dropped rather
    # than aborting the whole load.
    with open(path, encoding='utf-8', errors='ignore', newline='') as csvfile:
      return [[record[name] for name in columns]
              for record in csv.DictReader(csvfile)]

  def read_stance(self, path):
    """Read a stances CSV into rows of [Body ID, Headline, Stance]."""
    return self._read_columns(path, ('Body ID', 'Headline', 'Stance'))

  def read_body(self, path):
    """Read a bodies CSV into rows of [Body ID, articleBody]."""
    return self._read_columns(path, ('Body ID', 'articleBody'))

In [6]:
# Read the four CSV files (train/test stances and train/test bodies).
data = Dataset(
  train_stance='./fnc1/train_stances.csv',
  test_stance='./fnc1/competition_test_stances.csv',
  train_body='./fnc1/train_bodies.csv',
  test_body='./fnc1/competition_test_bodies.csv',
)

Dataset length:
Total train stances: 49972
Total test stances: 25413
Total train bodies: 1683
Total test bodies: 904


In [7]:
# Show the first stance row of the train split, then of the test split.
for split_rows in (data.train_stances, data.test_stances):
  print("(body#, headline, result):", split_rows[0])

(body#, headline, result): ['0', 'Soldier shot, Parliament locked down after gunfire erupts at war memorial', 'unrelated']
(body#, headline, result): ['2008', 'Ferguson riots: Pregnant woman loses eye after cops fire BEAN BAG round through car window', 'unrelated']


In [8]:
# Show the first body row of the train split, then of the test split.
for split_rows in (data.train_bodies, data.test_bodies):
  print("(body#, article):", split_rows[0])

(body#, article): ['0', 'A small meteorite crashed into a wooded area in Nicaragua\'s capital of Managua overnight, the government said Sunday. Residents reported hearing a mysterious boom that left a 16-foot deep crater near the city\'s airport, the Associated Press reports. \n\nGovernment spokeswoman Rosario Murillo said a committee formed by the government to study the event determined it was a "relatively small" meteorite that "appears to have come off an asteroid that was passing close to Earth." House-sized asteroid 2014 RC, which measured 60 feet in diameter, skimmed the Earth this weekend, ABC News reports. \nMurillo said Nicaragua will ask international experts to help local scientists in understanding what happened.\n\nThe crater left by the meteorite had a radius of 39 feet and a depth of 16 feet,  said Humberto Saballos, a volcanologist with the Nicaraguan Institute of Territorial Studies who was on the committee. He said it is still not clear if the meteorite disintegrated

In [9]:
# pre-process function (lowercase, drop stop words and non-alphabetic tokens, Porter stemming)
from nltk import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Stemmer instance used by preprocess().
porter = PorterStemmer()
# NOTE: this rebinds the name `stopwords` from the imported nltk.corpus object to
# the result of .words("english"); preprocess() reads this rebound global.
stopwords = stopwords.words("english")

def preprocess(processed_dataset):
  """Tokenize, filter and stem the text column of every row.

  Element 1 of each row (the raw text) is split with `word_tokenize`. Each
  token is lowercased, dropped if it is a stop word or not purely alphabetic,
  and the remaining tokens are stemmed with the module-level `porter` stemmer.

  Args:
    processed_dataset: iterable of rows shaped like [id, text, ...].

  Returns:
    A new list of new rows in which element 1 is the list of stemmed tokens
    and every other element is carried over unchanged. The input rows are not
    modified, so the function can be called again on the same raw data.
  """
  # Set membership is constant-time; the module-level `stopwords` sequence
  # would otherwise be scanned once per token.
  stop_set = set(stopwords)

  cleaned = []
  for row in processed_dataset:
    tokens = []
    for token in word_tokenize(row[1]):
      lower = token.lower()
      if lower.isalpha() and lower not in stop_set:
        tokens.append(porter.stem(lower))
    # Build a fresh row instead of overwriting row[1] so the caller's data stays raw.
    cleaned.append([row[0], tokens] + list(row[2:]))
  return cleaned

In [10]:
# call pre-process functions for train and test
# Headlines come from the stance rows, article text from the body rows.
train_headline, test_headline = preprocess(data.train_stances), preprocess(data.test_stances)
train_content, test_content = preprocess(data.train_bodies), preprocess(data.test_bodies)

In [11]:
# Print the first processed row of each of the four collections.
for processed_rows in (train_headline, test_headline, train_content, test_content):
  print(processed_rows[0])

['0', ['soldier', 'shot', 'parliament', 'lock', 'gunfir', 'erupt', 'war', 'memori'], 'unrelated']
['2008', ['ferguson', 'riot', 'pregnant', 'woman', 'lose', 'eye', 'cop', 'fire', 'bean', 'bag', 'round', 'car', 'window'], 'unrelated']
['0', ['small', 'meteorit', 'crash', 'wood', 'area', 'nicaragua', 'capit', 'managua', 'overnight', 'govern', 'said', 'sunday', 'resid', 'report', 'hear', 'mysteri', 'boom', 'left', 'deep', 'crater', 'near', 'citi', 'airport', 'associ', 'press', 'report', 'govern', 'spokeswoman', 'rosario', 'murillo', 'said', 'committe', 'form', 'govern', 'studi', 'event', 'determin', 'rel', 'small', 'meteorit', 'appear', 'come', 'asteroid', 'pass', 'close', 'earth', 'asteroid', 'rc', 'measur', 'feet', 'diamet', 'skim', 'earth', 'weekend', 'abc', 'news', 'report', 'murillo', 'said', 'nicaragua', 'ask', 'intern', 'expert', 'help', 'local', 'scientist', 'understand', 'happen', 'crater', 'left', 'meteorit', 'radiu', 'feet', 'depth', 'feet', 'said', 'humberto', 'saballo', 'volc

In [12]:
# Put all sentences into one collection for word2vec training.
# Each entry is one word-tokenized sentence, e.g. ["hello", "world", ...].

sent_collection = []

def sent_list(s_list, t_list):
  """Append element 1 of every row in `s_list` to `t_list`, in place.

  Nothing is returned; the caller observes the result through `t_list`.
  """
  t_list.extend(row[1] for row in s_list)

# Headlines first, then article bodies; train before test within each group.
for token_rows in (train_headline, test_headline, train_content, test_content):
  sent_list(token_rows, sent_collection)

In [13]:
# First collected sentence: the token list of the first training headline.
print(sent_collection[0])

['soldier', 'shot', 'parliament', 'lock', 'gunfir', 'erupt', 'war', 'memori']


In [14]:
# Count the distinct tokens across every collected sentence.
bag = [token for sentence in sent_collection for token in sentence]

p = set(bag)
print(len(p))


18084


In [15]:
# train the word2vec with customized words because the stemming makes some words not recognizable, such as "polic" and "strang"
# details  https://www.kaggle.com/pierremegret/gensim-word2vec-tutorial

#import multiprocessing
from gensim.models import Word2Vec

#cores = multiprocessing.cpu_count()
# Create the Word2Vec model without a corpus; the vocabulary is built and the
# model is trained by separate calls in the following cells.
# NOTE(review): `size` is the gensim 3.x keyword for the embedding dimension;
# gensim 4 renamed it to `vector_size` — confirm against the installed version.
w2v_model = Word2Vec(min_count=1,
                    window=2,
                    size=100,
                    sample=6e-5, 
                    alpha=0.03, 
                    min_alpha=0.0007, 
                    negative=20)


ModuleNotFoundError: No module named 'gensim'

In [None]:
# Build the model vocabulary from the collected tokenized sentences.
w2v_model.build_vocab(sent_collection)

In [None]:
# Train for 30 epochs on the same sentences the vocabulary was built from.
w2v_model.train(sent_collection, total_examples=w2v_model.corpus_count, epochs=30)

In [None]:
# this makes the memory more efficient since we do not plan to train any further
# NOTE(review): init_sims() appears to be deprecated in gensim 4 — confirm the
# installed version still supports it.
w2v_model.init_sims(replace=True)

In [None]:
# Number of entries in the model vocabulary.
# NOTE(review): `wv.vocab` is the gensim 3.x attribute; gensim 4 replaced it
# with `wv.key_to_index` — confirm against the installed version.
len(w2v_model.wv.vocab.keys())

In [None]:
# Dimensionality of the learned word vectors. The model in this notebook is
# bound to `w2v_model`; the name `model` is not defined in any cell shown here.
print(w2v_model.wv.vector_size)

In [None]:
# Look up the learned vector for the stemmed token 'polic'.
w2v_model.wv.get_vector('polic')

In [None]:
# Similarity score between the vectors of two stemmed tokens.
w2v_model.wv.similarity('polic', 'strang')

In [None]:
# combine headline and body
import nltk
def comb_list (stance, body, target):
  """Join stance rows with their article body and append the result to `target`.

  For every body row, in `body` order, each stance row with the same ID
  (element 0), in `stance` order, produces one appended entry:

    [body_id, headline_tokens, body_token_frequencies, stance_label]

  Args:
    stance: rows shaped like [body_id, headline_tokens, stance_label].
    body: rows shaped like [body_id, body_tokens].
    target: list that receives the combined rows; modified in place.

  Returns:
    None. `stance` and `body` are left unmodified. All entries produced for
    one body row share the same nltk.FreqDist object.
  """
  # Index the stance rows by body ID once, so each body is matched by lookup
  # instead of rescanning the whole stance list.
  stances_by_body = {}
  for stance_row in stance:
    stances_by_body.setdefault(stance_row[0], []).append(stance_row)

  for body_row in body:
    matches = stances_by_body.get(body_row[0])
    if not matches:
      continue
    # Count the body tokens once per body rather than once per matching stance.
    freq = nltk.FreqDist(body_row[1])
    for stance_row in matches:
      target.append([stance_row[0], stance_row[1], freq, stance_row[2]])
  return


In [None]:
# Combined (headline, body) rows for the train and test splits.
train_set, test_set = [], []

comb_list(stance=train_headline, body=train_content, target=train_set)
comb_list(stance=test_headline, body=test_content, target=test_set)

In [None]:
# Number of combined headline/body rows in the training set.
print(len(train_set))

In [None]:
# Number of combined headline/body rows in the test set.
print(len(test_set))