# 1. Fake News Challenge

http://www.fakenewschallenge.org/

https://github.com/FakeNewsChallenge/fnc-1-baseline

# 2. Technical Links

## NLTK

http://textminingonline.com/dive-into-nltk-part-iv-stemming-and-lemmatization

https://www.dataquest.io/blog/natural-language-processing-with-python/

http://www.nltk.org/book/ch03.html


## TensorFlow
https://www.tensorflow.org/tutorials/recurrent

https://www.tensorflow.org/programmers_guide/reading_data

## scikit-learn

http://scikit-learn.org/stable/modules/classes.html#module-sklearn.preprocessing

## Markdown

https://github.com/adam-p/markdown-here/wiki/Markdown-Cheatsheet

## Keras

https://keras.io/getting-started/sequential-model-guide/

# 3. Papers

https://www.ijcai.org/Proceedings/16/Papers/408.pdf

https://www.overleaf.com/5276203cwvkhf#/16617343/

In [14]:
## test code for keras

In [1]:
import sys
# Root directory that the notebook's relative paths are resolved against.
HOME_DIR = '.'

# Checkout of the FNC-1 baseline repository, expected directly under HOME_DIR.
FNC_PATH = '{}/fnc-1-baseline'.format(HOME_DIR)

# The FNC utils are not an installed package, so their directory has to be
# put on sys.path before they can be imported and reused.
FNC_UTILS_PATH = FNC_PATH + '/utils/'
sys.path.append(FNC_UTILS_PATH)

In [2]:
import gensim

#Google news vectors link https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/view
#We can change it with fastText, but it's 6GB and it's super slow to download

# Location of the pre-trained Google News vectors, relative to HOME_DIR.
W2V_MODEL = '{}/model/GoogleNews-vectors-negative300.bin'.format(HOME_DIR)

# Load Google's pre-trained Word2Vec model (stored in the binary word2vec format).
load_word2vec = gensim.models.KeyedVectors.load_word2vec_format
w2vmodel = load_word2vec(W2V_MODEL, binary=True)




In [3]:
import pandas as pd

def read_data(path=FNC_PATH + '/fnc-1'):
    """Load the FNC-1 training set as one frame of bodies joined with stances.

    Reads ``train_bodies.csv`` and ``train_stances.csv`` from ``path``, indexes
    both by their 'Body ID' column and inner-joins them on that index, so each
    stance row is paired with the body it refers to.

    Args:
        path: directory containing the two training CSV files.

    Returns:
        pandas.DataFrame indexed by 'Body ID' holding the body columns followed
        by the stance columns.
    """
    bodies_by_id = pd.read_csv(path + '/train_bodies.csv').set_index('Body ID')
    stances_by_id = pd.read_csv(path + '/train_stances.csv').set_index('Body ID')

    return pd.merge(
        bodies_by_id,
        stances_by_id,
        how='inner',
        left_index=True,
        right_index=True,
    )

In [4]:
from sklearn.model_selection import train_test_split

def get_data_split(ds, test_size = 0.2, random_state=None):
    """Randomly split a dataset into training and validation parts.

    Args:
        ds: the dataset to split (anything ``train_test_split`` accepts,
            here a pandas DataFrame).
        test_size: share of ``ds`` that goes into the validation part.
        random_state: seed forwarded to ``train_test_split``. The default
            ``None`` keeps the previous behaviour (a different split on every
            run); pass an int to make the split reproducible.

    Returns:
        A ``(train, validation)`` tuple.
    """
    train, validation = train_test_split(
        ds, test_size=test_size, random_state=random_state
    )
    return train, validation

In [5]:
# Load the merged bodies/stances frame and hold out part of it for validation.
ds = read_data()
train, validation = get_data_split(ds)

# Report the size of each part.
print("Train examples: %d" % len(train))
print("Test examples: %d" % len(validation))
print()

# Class balance of the training part.
stance_counts = train['Stance'].value_counts()
print(stance_counts)

Train examples: 39977
Test examples: 9995

unrelated    29242
discuss       7163
agree         2908
disagree       664
Name: Stance, dtype: int64


In [6]:
import numpy as np
import nltk
import re

from sklearn import feature_extraction
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the NLTK data packages this notebook relies on.
for nltk_resource in ('punkt', 'wordnet', 'averaged_perceptron_tagger'):
    nltk.download(nltk_resource)

def dense_to_one_hot(labels_dense, num_classes):
    """Convert integer class labels into one-hot row vectors.

    Args:
        labels_dense: array-like of integer class ids, each in
            ``[0, num_classes)``. The input is flattened before encoding.
        num_classes: number of classes, i.e. the width of each one-hot row.

    Returns:
        numpy.ndarray of floats with shape ``(num_labels, num_classes)`` where
        row ``i`` is all zeros except for a 1 in column ``labels_dense[i]``.

    Raises:
        ValueError: if any label lies outside ``[0, num_classes)``.
    """
    labels = np.asarray(labels_dense).ravel()
    num_labels = labels.shape[0]
    labels_one_hot = np.zeros((num_labels, num_classes))
    if num_labels == 0:
        return labels_one_hot

    # Flat-offset indexing would let an out-of-range or negative label set a
    # bit in a neighbouring row without any error, so reject such labels.
    if labels.min() < 0 or labels.max() >= num_classes:
        raise ValueError(
            "labels must be in [0, %d); got values in [%s, %s]"
            % (num_classes, labels.min(), labels.max())
        )

    labels_one_hot[np.arange(num_labels), labels] = 1
    return labels_one_hot

def normalize_word(w):
    """Return the lower-cased lemma of token ``w``.

    The token is lower-cased, lemmatized with the module-level ``wnl``
    lemmatizer, and the result is lower-cased again.
    """
    lowered = w.lower()
    lemma = wnl.lemmatize(lowered)
    return lemma.lower()

def tokenize_sentenses(sentences):
    """Tokenize every string in a pandas Series with ``nltk.word_tokenize``.

    Returns a Series (same index) whose values are the token lists.
    """
    return sentences.apply(nltk.word_tokenize)

def lemmatize_tokens(series):
    """Normalize every token of every token list in ``series``.

    Each element of ``series`` is expected to be an iterable of tokens; each
    token is passed through ``normalize_word`` and the results are returned as
    a list per element.
    """
    def _normalize_all(tokens):
        return list(map(normalize_word, tokens))

    return series.apply(_normalize_all)

def remove_stopwords(words):
    """Drop English stop words from every token list in ``words``.

    ``words`` is a pandas Series of token lists; tokens found in scikit-learn's
    ``ENGLISH_STOP_WORDS`` are filtered out, everything else keeps its order.
    """
    stop_words = feature_extraction.text.ENGLISH_STOP_WORDS

    def _keep_content_words(token_list):
        return [token for token in token_list if token not in stop_words]

    return words.apply(_keep_content_words)

def trainTFIDF(corpus, max_ngram, min_df=10, max_df=100):
    """Fit a TF-IDF vectorizer on ``corpus``.

    Text is lower-cased, English stop words are removed, and n-grams from
    length 1 up to ``max_ngram`` are extracted.

    Args:
        corpus: iterable of raw text documents to fit on.
        max_ngram: largest n-gram length to extract.
        min_df: document-frequency lower bound passed to ``TfidfVectorizer``.
        max_df: document-frequency upper bound passed to ``TfidfVectorizer``.
            Note that integer values are absolute document counts, not
            proportions, so the default of 100 drops every term that occurs in
            more than 100 documents.

    Returns:
        The fitted ``TfidfVectorizer``.
    """
    vectorizer = TfidfVectorizer(
        ngram_range=(1, max_ngram),
        lowercase=True,
        stop_words="english",
        min_df=min_df,
        max_df=max_df,
    )
    vectorizer.fit(corpus)

    return vectorizer

def encode_pos(word):
    """POS-tag a token list and return each pair joined as ``token_TAG``.

    Despite the singular name, ``word`` is the list of tokens handed to
    ``nltk.pos_tag``.
    """
    tagged_tokens = nltk.pos_tag(word)
    return ['_'.join((token, tag)) for token, tag in tagged_tokens]

def doc2vec(terms, model=None):
    """Embed a token list as the mean of its word vectors.

    Args:
        terms: list of tokens.
        model: word-vector lookup supporting ``token in model``,
            ``model[token]`` and ``model.vector_size``. Defaults to the
            module-level ``w2vmodel``.

    Returns:
        A plain Python list of ``model.vector_size`` floats. Tokens missing
        from the model contribute a zero vector; an empty ``terms`` yields an
        all-zero vector (averaging an empty list would otherwise produce a
        scalar NaN and break the stacked embedding matrix).
    """
    if model is None:
        model = w2vmodel

    dim = model.vector_size
    # Membership is tested with `in` rather than the `.vocab` attribute so the
    # lookup does not depend on a gensim-version-specific attribute.
    vectors = [model[w] if w in model else np.zeros(dim) for w in terms]
    if not vectors:
        return np.zeros(dim).tolist()
    return np.mean(vectors, axis=0).tolist()

def prepare_features(dataset):
    """Build the headline feature matrix for ``dataset``.

    Two feature groups are computed from the 'Headline' column and stacked
    side by side:

    * TF-IDF weights from the module-level, already fitted ``vectorizer``
      (applied to the raw headline text);
    * the mean word embedding of each headline after tokenizing, lemmatizing
      and removing stop words (see ``doc2vec``).

    POS tags are not part of the feature matrix, so they are not computed here
    (the previous version tagged every headline and discarded the result).

    Args:
        dataset: DataFrame with a 'Headline' column.

    Returns:
        scipy sparse matrix with one row per headline: the TF-IDF columns
        followed by the embedding columns.
    """
    from scipy.sparse import hstack

    headlines = dataset['Headline']

    # Useful link: https://www.dataquest.io/blog/natural-language-processing-with-python/
    # Embedding branch: tokenize -> lemmatize -> drop stop words -> mean vector.
    tokens = tokenize_sentenses(headlines)
    lemmas = lemmatize_tokens(tokens)
    no_stop_words = remove_stopwords(lemmas)
    embeddings = np.asmatrix(no_stop_words.apply(doc2vec).tolist())

    # TF-IDF branch: the vectorizer does its own preprocessing on raw text.
    tf_idf = vectorizer.transform(headlines)

    return hstack((tf_idf, embeddings))

# Module-level helpers that the feature functions above read as globals.
wnl = nltk.WordNetLemmatizer()
le = preprocessing.LabelEncoder()

# Fit TF-IDF on the training headlines (n-grams up to length 2), then build
# the full feature matrix for the training part.
vectorizer = trainTFIDF(train['Headline'], 2)
matrix = prepare_features(train)

# Encode the stance strings as integers and expand them to one-hot rows
# (4 columns, one per stance class).
encoded_stances = le.fit_transform(train['Stance'])
train_labels = dense_to_one_hot(encoded_stances, 4)

print(matrix.shape)

[nltk_data] Downloading package punkt to C:\Users\Henry
[nltk_data]     Lin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Henry
[nltk_data]     Lin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Henry Lin\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
(39977, 8950)


In [7]:
import gc

# Drop the references to the large preprocessing objects so their memory can
# be reclaimed before training. After this cell, normalize_word (uses wnl),
# doc2vec (uses w2vmodel) and prepare_features (uses vectorizer) cannot be
# called again until the cells that create those objects are re-run.
w2vmodel = vectorizer = le = wnl = None

gc.collect()

0

In [11]:
def train_model(x_train, y_train, epochs=15, batch_size=128):
    """Train a feed-forward softmax classifier on the given features.

    Architecture: Dense(1024, relu) -> Dropout(0.5) -> Dense(512, relu) ->
    Dropout(0.5) -> Dense(n_classes, softmax), compiled with categorical
    cross-entropy loss, the "rmsprop" optimizer and the accuracy metric.

    Args:
        x_train: 2-D feature matrix, one row per example. Sparse matrices
            (anything with a ``toarray`` method) are converted to a dense
            array first; dense arrays are used as they are.
        y_train: 2-D one-hot label array; its second dimension determines the
            size of the output layer.
        epochs: number of training epochs.
        batch_size: mini-batch size used by ``model.fit``.

    Returns:
        The fitted Keras ``Sequential`` model.
    """
    from keras.models import Sequential
    from keras.layers import Dense, Dropout

    # Densify sparse input before handing it to Keras.
    if hasattr(x_train, 'toarray'):
        x_train = x_train.toarray()

    input_size = x_train.shape[1]
    # One output unit per one-hot column, so the network always matches the
    # labels it is trained on.
    output_size = y_train.shape[1]

    model = Sequential()
    # The first layer must be told the input dimensionality explicitly.
    model.add(Dense(1024, activation='relu', input_dim=input_size))
    model.add(Dropout(0.5))
    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(output_size, activation='softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer="rmsprop",
                  metrics=['accuracy'])

    model.fit(x_train, y_train,
              epochs=epochs,
              batch_size=batch_size)
    return model
# Fit the classifier on the training feature matrix and its one-hot labels.
model = train_model(x_train=matrix, y_train=train_labels)

kwargs passed to function are ignored with Tensorflow backend


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [None]:
import keras

# Record which Keras version the notebook was run with.
keras_version = keras.__version__
print(keras_version)

In [13]:
# Loss and accuracy on the same data the model was fitted on, i.e. these are
# training-set metrics, not a held-out validation score.
train_features_dense = matrix.toarray()
model.evaluate(train_features_dense, train_labels)



[0.67397939837091925, 0.73041999150107362]