In [1]:
import pickle
import nltk
import string
import pandas as pd
import regex as re
import tensorflow as tf
from nltk.corpus import stopwords, words
from nltk.tokenize import word_tokenize,wordpunct_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

pd.set_option('display.max_colwidth', 140)

Using TensorFlow backend.


In [2]:
f = open("Datasets/demo_corpus.txt", errors = 'ignore')
input_file = f.read()

In [3]:
input_file[0:500]

'CNA\tDaniel Wu, Daniel Dae Kim offer US$25,000 reward to catch suspect who assaulted elderly man https://cna.asia/2LvyOuK\nCNA\tCan you buy a Tesla with bitcoin? How the payments might work https://cna.asia/36UHimQ\nCNA\tMyanmar police fire rubber bullets at anti-coup protesters https://cna.asia/3aMc5TN\nCNA\tFacebook, Twitter outpaced by smaller platforms in fight against harmful content - agency https://cna.asia/3jwiHJX\nCNA\tBREAKING: Ho Ching to retire as CEO of Temasek Holdings; Dilhan Pillay Sandra'

In [4]:
parsedData = input_file.replace('\t', '\n').split('\n')
parsedData[0:8]

['CNA',
 'Daniel Wu, Daniel Dae Kim offer US$25,000 reward to catch suspect who assaulted elderly man https://cna.asia/2LvyOuK',
 'CNA',
 'Can you buy a Tesla with bitcoin? How the payments might work https://cna.asia/36UHimQ',
 'CNA',
 'Myanmar police fire rubber bullets at anti-coup protesters https://cna.asia/3aMc5TN',
 'CNA',
 'Facebook, Twitter outpaced by smaller platforms in fight against harmful content - agency https://cna.asia/3jwiHJX']

In [5]:
textList = parsedData[1::2]
textList[0:5]

['Daniel Wu, Daniel Dae Kim offer US$25,000 reward to catch suspect who assaulted elderly man https://cna.asia/2LvyOuK',
 'Can you buy a Tesla with bitcoin? How the payments might work https://cna.asia/36UHimQ',
 'Myanmar police fire rubber bullets at anti-coup protesters https://cna.asia/3aMc5TN',
 'Facebook, Twitter outpaced by smaller platforms in fight against harmful content - agency https://cna.asia/3jwiHJX',
 'BREAKING: Ho Ching to retire as CEO of Temasek Holdings; Dilhan Pillay Sandrasegara set to take her place from Oct 1 https://cna.asia/3rBeac5']

In [6]:
tweet_df = pd.DataFrame({'Text': textList})
tweet_df.head()

Unnamed: 0,Text
0,"Daniel Wu, Daniel Dae Kim offer US$25,000 reward to catch suspect who assaulted elderly man https://cna.asia/2LvyOuK"
1,Can you buy a Tesla with bitcoin? How the payments might work https://cna.asia/36UHimQ
2,Myanmar police fire rubber bullets at anti-coup protesters https://cna.asia/3aMc5TN
3,"Facebook, Twitter outpaced by smaller platforms in fight against harmful content - agency https://cna.asia/3jwiHJX"
4,BREAKING: Ho Ching to retire as CEO of Temasek Holdings; Dilhan Pillay Sandrasegara set to take her place from Oct 1 https://cna.asia/3r...


In [None]:
original_df = tweets_df

In [7]:
words = set(words.words())
wn = WordNetLemmatizer()

In [8]:
def clean_data(trial_text):
    for i in trial_text.index:
        trial_text.at[i,"Text"] = re.sub(r"http\S+", "locator", trial_text.at[i,"Text"])
        trial_text.at[i,"Text"] = re.sub(r"@\S+", "mention", trial_text.at[i,"Text"])
        trial_text.at[i,"Text"] = re.sub(r"#", "", trial_text.at[i,"Text"])
        trial_text.at[i,"Text"] = re.sub(r"RT", "", trial_text.at[i,"Text"])
        trial_text.at[i,"Text"] = ' '.join(word.strip(string.punctuation) for word in trial_text.at[i,"Text"].split())
        trial_text.at[i,"Text"] = wordpunct_tokenize(trial_text.at[i,"Text"].lower())
        trial_text.at[i,"Text"] = ' '.join([wn.lemmatize(word) for word in trial_text.at[i,"Text"]])
        trial_text.at[i,"Text"] = ' '.join(word for word in word_tokenize(trial_text.at[i,"Text"]) if word in words)
        trial_text.at[i,"Text"] = " ".join(word for word in word_tokenize(trial_text.at[i,"Text"]) if not (word.isalpha() and len(word)<2))
    trial_text = trial_text[trial_text['Text'] != ''].reset_index(drop=True)
    trial_text['num_words']=trial_text["Text"].str.split().str.len()
    trial_text = trial_text.drop(trial_text[trial_text.num_words < 2].index)
    trial_text = trial_text.drop(['num_words'], axis=1)
    return trial_text

In [20]:
clean_tweet_df = clean_data(tweet_df)
clean_tweet_df

Unnamed: 0,Text
0,dae kim offer reward to catch suspect who elderly man locator
1,can you buy with how the payment might work locator
2,police fire rubber bullet at anti coup protester locator
3,twitter by smaller platform in fight against harmful content agency locator
4,breaking ho ching to retire of holding set to take her place from locator
5,national dance drama follow the mother river locator
6,red plum blossom chamber concert locator
7,fusion of modern shirt with the traditional this evergreen piece is for the modern man with cultural root shop now at locator
8,happy year to all friend student and alumnus of mention mention mention mention mention
9,up to for sending with business locator via mention


In [10]:
#Load the tokenizer
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

print("Number of vocabulary: {}".format(len(tokenizer.word_index)))

Number of vocabulary: 45164


In [11]:
print(tokenizer.word_index)



In [18]:
def text_processing(df):
    df_list = list(df.values)
    seq = tokenizer.texts_to_sequences(df_list)
    seq_pad = pad_sequences(seq, maxlen=46, padding='pre')
    return seq_pad

In [21]:
test_tweet = text_processing(clean_tweet_df['Text'])
test_tweet[0::4]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0, 9297, 1737,  422, 2611,    5,  871, 2758,   80, 5526,
         126,    2],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0, 1158,
        1481, 8449,    5, 3421,    8, 1792,  305,    5,  111,  133,  348,
          30,    2],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,  154,
         109,    5,   38,  151,  696,    7, 6411,    8,    3,    3,    3,
           3,    3]])

In [22]:
tf.random.set_seed(1234)
model = load_model('rnn_040221.hdf5')
opt = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 46, 50)            2258250   
_________________________________________________________________
lstm_1 (LSTM)                (None, 32)                10624     
_________________________________________________________________
dense_1 (Dense)              (None, 32)                1056      
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33        
Total params: 2,269,963
Trainable params: 2,269,963
Non-trainable params: 0
_________________________________________________________________




In [23]:
y_pred = (model.predict(test_tweet)>0.5).astype("int32")
y_pred

array([[1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [0]])

In [25]:
original_df

Unnamed: 0,Text
0,dae kim offer reward to catch suspect who elderly man locator
1,can you buy with how the payment might work locator
2,police fire rubber bullet at anti coup protester locator
3,twitter by smaller platform in fight against harmful content agency locator
4,breaking ho ching to retire of holding set to take her place from locator
5,national dance drama follow the mother river locator
6,red plum blossom chamber concert locator
7,fusion of modern shirt with the traditional this evergreen piece is for the modern man with cultural root shop now at locator
8,happy year to all friend student and alumnus of mention mention mention mention mention
9,up to for sending with business locator via mention
