In [3]:
# Import libraries
import os
import pandas as pd
import pickle
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, load_model
from keras.layers import Embedding, LSTM, Bidirectional, GlobalMaxPool1D, Dense, Dropout

## Read data

In [4]:
path = os.path.abspath('.')
dataframe = pd.read_csv(os.path.join(path, 'data', 'data.txt'), delimiter='\t', names=['article', 'id', 'label'])

In [5]:
dataframe

Unnamed: 0,article,id,label
0,"Et tu, Rhody? A recent editorial in the Provi...",727600136,non-propaganda
1,A recent post in The Farmington Mirror — our t...,731714618,non-propaganda
2,"President Donald Trump, as he often does while...",731714635,non-propaganda
3,"February is Black History Month, and nothing l...",728627182,non-propaganda
4,"The snow was so heavy, whipped up by gusting w...",728627443,non-propaganda
5,Four months after the Sandy Hook School shooti...,732126660,non-propaganda
6,The first major newspaper article about Donald...,728144791,non-propaganda
7,"For three years, starting in 2008, New York ar...",728605281,non-propaganda
8,President Donald Trump's tumultuous administra...,731383701,non-propaganda
9,With Hartford on edge about the future of Aetn...,734075146,non-propaganda


In [6]:
dataframe['article'][0]

'Et tu, Rhody?  A recent editorial in the Providence Journal cataloged everything it could find wrong with Connecticut and ended with this suggestion: “Gov. Gina Raimondo should see if at least some of those jobs could come to Rhode Island. It is certainly less risky than the Nutmeg State.”  We beg your pardon.  The state with world-famous pension problems and persistent economic issues of its own is “less risky”?  The Journal itself reported just a few weeks ago on Rhode Island’s own significant economic problems, which in many ways reflect Connecticut’s.  Rhode Island enjoys a legacy of corruption that not even Connecticut can match. The ProJo won a Pulitzer Prize in 1994 for uncovering widespread corruption within its own court system.  What, exactly, is to be gained from moving to Rhode Island?  Like Connecticut, Rhode Island has an income tax and an estate tax with comparable rates. (Forbes magazine listed it as one of the states “Where Not To Die.” Connecticut made the list, too.

In [7]:
dataframe['label'].value_counts()

non-propaganda    31965
propaganda         4021
Name: label, dtype: int64

## Data Processing

In [8]:
value_mapping = {'propaganda': 1, 'non-propaganda': 0}
dataframe['target'] = dataframe['label'].map(value_mapping)

In [9]:
class TextPreprocessor():
    def __init__(self):
        # clean_text helpers
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = stopwords.words("english")
        self.vectorizer = CountVectorizer()
        self.cvecTtokenizer = self.vectorizer.build_tokenizer()
        # prepare_for_model helpers
        self.max_features = 6000 # I pulled this out of my ass
        self.max_sequence_length = 348 # I also pulled this out of my ass
        self.sequence_tokenizer = Tokenizer(num_words = self.max_features)
        
    def clean_text(self, text):
        # Remove special chars and punctuation
        text = " ".join(self.cvecTtokenizer(text))
        # lowcase
        text = text.lower()
        # Lematize
        text = [self.lemmatizer.lemmatize(token) for token in text.split(" ")]
        text = [self.lemmatizer.lemmatize(token, "v") for token in text]
        # Remove stopwords
        text = [word for word in text if not word in self.stop_words]

        text = " ".join(text)
        return text
    
    def prepare_for_model(self, text):
        return pad_sequences(self.sequence_tokenizer.texts_to_sequences(text), maxlen = self.max_sequence_length)
    
    def fit_sequence_tokenizer(self, texts_to_fit):
        self.sequence_tokenizer.fit_on_texts(texts_to_fit)
        
    def save_sequence_tokenizer(self, path):
        with open(path, 'wb') as handle:
            pickle.dump(self.sequence_tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
            
    def load_sequence_tokenizer(self, path):
        with open(path, 'rb') as handle:
            self.sequence_tokenizer = pickle.load(handle)
        
        
textPreprocessor = TextPreprocessor()

In [10]:
dataframe['article_prepared'] = dataframe['article'].apply(textPreprocessor.clean_text)

In [11]:
#dataframe

In [12]:
# Remember when I said I pulled numbers out of my ass? I actually guestimated them based on this...
dataframe['article_prepared'].apply(lambda x: len(x.split(" "))).describe()

count    35986.000000
mean       346.711916
std        289.532668
min          4.000000
25%        171.000000
50%        280.000000
75%        449.000000
max      12122.000000
Name: article_prepared, dtype: float64

In [13]:
textPreprocessor.fit_sequence_tokenizer(dataframe['article_prepared'])

In [14]:
textPreprocessor.prepare_for_model(dataframe['article_prepared'][0])

array([[   0,    0,    0, ...,    0,    0,    0],
       [   0,    0,    0, ...,    0,    0,    0],
       [   0,    0,    0, ...,    0,    0,    0],
       ...,
       [   0,    0,    0, ...,    0,    0,    0],
       [   0,    0,    0, ...,    0,    0,    0],
       [   0,    0,    0, ...,    0,    0, 1493]])

In [15]:
save_tokenizer_path = './model/sequence_tokenizer.pickle'
textPreprocessor.save_sequence_tokenizer(save_tokenizer_path)

In [16]:
textPreprocessor.load_sequence_tokenizer(save_tokenizer_path)

In [17]:
textPreprocessor.prepare_for_model(dataframe['article_prepared'][0])

array([[   0,    0,    0, ...,    0,    0,    0],
       [   0,    0,    0, ...,    0,    0,    0],
       [   0,    0,    0, ...,    0,    0,    0],
       ...,
       [   0,    0,    0, ...,    0,    0,    0],
       [   0,    0,    0, ...,    0,    0,    0],
       [   0,    0,    0, ...,    0,    0, 1493]])

#### TODO: Clean up that mess above...

## Modelling

In [18]:
class PropagandaDetector():
    def __init__(self, textPreprocessor, model_path = None):
        self.textPreprocessor = textPreprocessor
        
        if model_path:
            self.model = load_model(model_path)
        else:
            self.embed_size = 200 # Remember where I get my magic numbers from?
            self.model = Sequential()
            self.model.add(Embedding(textPreprocessor.max_features, self.embed_size))
            self.model.add(Bidirectional(LSTM(32, return_sequences=True)))
            self.model.add(GlobalMaxPool1D())
            self.model.add(Dense(20, activation="relu"))
            self.model.add(Dropout(0.1))
            self.model.add(Dense(1, activation="sigmoid"))
            self.model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
            self.model.summary()
        
    def train(self, features, labels):        
        self.model.fit(features, labels, batch_size=128, epochs=2)
        
    def detect_propaganda(self, text):
        cleaned = self.textPreprocessor.clean_text(text)
        prepared = self.textPreprocessor.prepare_for_model([cleaned])
        confidence = self.model.predict(prepared, verbose=1)[0][0]
        return (confidence >= 0.5, confidence)
    
    def save_model(self, path):
        self.model.save(path)
    

In [19]:
detector = PropagandaDetector(textPreprocessor)

W1003 13:06:45.119067 20160 deprecation.py:323] From C:\Users\Gencho\Anaconda3\lib\site-packages\tensorflow\python\ops\nn_impl.py:180: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 200)         1200000   
_________________________________________________________________
bidirectional_1 (Bidirection (None, None, 64)          59648     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 20)                1300      
_________________________________________________________________
dropout_1 (Dropout)          (None, 20)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 21        
Total params: 1,260,969
Trainable params: 1,260,969
Non-trainable params: 0
____________________________________________

In [20]:
dataframe['label'].value_counts()

non-propaganda    31965
propaganda         4021
Name: label, dtype: int64

In [21]:
dataframe_train, dataframe_val = train_test_split(dataframe, test_size = 0.25, stratify=dataframe["target"])

In [22]:
features_train = textPreprocessor.prepare_for_model(dataframe_train['article_prepared'])

In [23]:
labels_train = dataframe_train['target']

In [24]:
detector.train(features_train, labels_train)

W1003 13:06:56.703081 20160 deprecation_wrapper.py:119] From C:\Users\Gencho\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:422: The name tf.global_variables is deprecated. Please use tf.compat.v1.global_variables instead.



Epoch 1/2
Epoch 2/2


In [25]:
detector.save_model("./model/detector")

## Evaluation

In [26]:
features_val = textPreprocessor.prepare_for_model(dataframe_val['article_prepared'])

In [27]:
labels_val = dataframe_val['target'].ravel()

In [28]:
predictions = detector.model.predict(features_val)

In [29]:
predictions = predictions.round()

In [30]:
print('F1-score: {0}'.format(f1_score(predictions, labels_val)))
confusion_matrix(predictions, labels_val)

F1-score: 0.7800687285223368


array([[7932,  324],
       [  60,  681]], dtype=int64)

In [31]:
detector = PropagandaDetector(textPreprocessor, "./model/detector")

In [32]:
detector.detect_propaganda(dataframe['article'][0])



(False, 0.0022902624)

#### TODO: Clean up all that above aswell...