In [1]:
import numpy as np
import re
import string
import nltk
import pandas as pd
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import classification_report
# File stems of the three dataset splits.
train_file, dev_file, test_file = "train.data.txt", "dev.data.txt", "test.data.txt"

# Make sure the NLTK 'stopwords' and 'wordnet' corpora are available locally,
# then load the English stopword list used when cleaning tweets.
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)
stopwords = nltk.corpus.stopwords.words('english')
def tokenize_tweet(string_data:str):
    """Normalise a raw tweet into a lower-cased, space-separated string.

    Steps, in order: lower-case the text, strip URLs (``http(s)://...`` and
    ``www....``), delete every character in ``string.punctuation``, split on
    whitespace and drop the words found in the module-level ``stopwords`` list.

    Args:
        string_data: Raw tweet text.

    Returns:
        The cleaned tweet, with the remaining words joined by single spaces.
    """
    data = string_data.lower()
    # Raw string: ``\S`` must reach the regex engine, not be parsed as a string escape.
    # Newlines are still present here, so a URL match stops at the end of its line
    # instead of swallowing the first word of the next one.
    data = re.sub(r'https?://\S+|www\.\S+', '', data)
    data = re.sub('[%s]' % re.escape(string.punctuation), '', data)
    # Lemmatisation is currently disabled (the WordNetLemmatizer step was commented out).
    # split() with no argument breaks on any whitespace run (spaces, tabs, \r, \n), so
    # words on adjacent lines stay separate and no empty tokens are produced.
    return ' '.join(word for word in data.split() if word not in stopwords)

# Load the train / dev / test splits in that order. keep_default_na=False stops
# pandas from applying its default NaN markers, so empty cells stay as ''.
train_data, dev_data, test_data = (
    pd.read_csv('./%s.csv' % split_file, keep_default_na=False)
    for split_file in (train_file, dev_file, test_file)
)
def preprocess_token(df, dropNa=True):
    """Return a copy of ``df`` whose ``main_tweet`` column has been cleaned.

    Missing tweets are first replaced by the empty string. When ``dropNa`` is
    true, rows whose tweet is empty are removed; only ``main_tweet`` is
    inspected, so empty strings in other columns are left exactly as they were.
    Every remaining tweet is then passed through ``tokenize_tweet``.

    Args:
        df: DataFrame with a ``main_tweet`` text column. It is not modified.
        dropNa: Drop rows with an empty/missing tweet (default ``True``). Pass
            ``False`` when the row count and order must match the input.

    Returns:
        A new DataFrame with the cleaned ``main_tweet`` column. The original
        index labels of the kept rows are preserved.
    """
    data = df.copy()
    data['main_tweet'] = data['main_tweet'].fillna('')
    if dropNa:
        # Filter on the tweet column only, rather than rewriting '' to NaN frame-wide.
        data = data.loc[data['main_tweet'] != ''].copy()
    data['main_tweet'] = data['main_tweet'].apply(tokenize_tweet)
    return data


# Clean the training tweets (rows with an empty tweet are dropped, since dropNa
# defaults to True) and display the resulting frame.
train_data=preprocess_token(train_data)
train_data

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\guoyi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\guoyi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,main_tweet,main_tweet_id,verified,followers,replies,label
0,5 regularly rinsing nose saline help prevent i...,1250219300389974016,False,410,"[{""tweet_id"": 1250219116993974272, ""tweet"": ""4...",nonrumour
1,french police chief killed charliehebdo attack,554886875303780352,True,3229894,"[{""tweet_id"": 554959644125167617, ""tweet"": ""De...",rumour
2,coronavirus disease covid19 advice public\r\r✳...,1237901309011021825,False,613,"[{""tweet_id"": 1237901311439450112, ""tweet"": ""I...",nonrumour
3,ottawa police confirm multiple suspects shooti...,524958128392376320,True,19783124,"[{""tweet_id"": 524961934064754688, ""tweet"": ""@W...",nonrumour
4,primary focus government isnt alleviate suffer...,1239295488677085185,False,4889,[],nonrumour
...,...,...,...,...,...,...
1889,4 cannot transmitted goods manufactured china ...,1237545128828342277,False,631,"[{""tweet_id"": 1237545126278258703, ""tweet"": ""#...",nonrumour
1890,desperate ted cruz claims planned parenthood s...,671181758692507648,True,143090,"[{""tweet_id"": 671200376843067392, ""tweet"": ""@B...",rumour
1891,thoughts prayers enough pres obama speaks mass...,672513234419638273,True,17449031,"[{""tweet_id"": 672513853645717504, ""tweet"": ""@A...",rumour
1892,police surrounded building suspected charliehe...,553508098825261056,True,9077962,"[{""tweet_id"": 553509546602553344, ""tweet"": ""@N...",nonrumour


In [2]:
!pip install transformers
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModel
from sklearn.preprocessing import LabelEncoder

# Encode the string labels as integers; the encoder is fitted on the training labels only.
le = LabelEncoder()
train_label = le.fit_transform(train_data['label'])

model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name,do_lower_case=True)

# padding='max_length' pads every tweet to exactly max_length tokens. padding=True
# pads only to the longest sequence in the batch, which yields tensors narrower than
# the fixed (100,) inputs declared in create_model() whenever no tweet reaches 100 tokens.
train_text = train_data['main_tweet'].tolist()
encoding = tokenizer(train_text,truncation=True,max_length=100, padding='max_length', return_tensors="tf")


# Same cleaning and encoding for the dev split; labels reuse the fitted encoder.
dev_data=preprocess_token(dev_data)
dev_text = dev_data['main_tweet'].tolist()
dev_encode = tokenizer(dev_text,truncation=True,max_length=100, padding='max_length', return_tensors="tf")
dev_label = le.transform(dev_data['label'])




In [3]:
# Load the pretrained BERT encoder that create_model() wraps.
# NOTE(review): the load log reports a bare TFBertModel, which has no classification
# head, so num_labels=2 presumably has no effect here — confirm and drop if so.
model = TFAutoModel.from_pretrained(model_name, num_labels=2)

# Sanity check: shape of the encoded training ids, (number of tweets, sequence length).
print(encoding["input_ids"].shape)

def create_model():
    """Build the binary classifier on top of the module-level BERT ``model``.

    The network takes two int32 inputs of length 100, ``input_ids`` and
    ``attention_mask``, runs them through the encoder, max-pools the per-token
    outputs over the sequence axis and applies batch normalisation, a
    128-unit ReLU layer, dropout (0.1) and a single sigmoid output unit.

    Returns:
        An uncompiled ``tf.keras.Model`` with inputs ``[input_ids, attention_mask]``
        and one sigmoid output named ``outputs``.
    """
    layers = tf.keras.layers

    token_ids = layers.Input(shape=(100,), name='input_ids', dtype='int32')
    attention = layers.Input(shape=(100,), name='attention_mask', dtype='int32')

    # Element 0 of the encoder output is the per-token hidden-state sequence.
    sequence_output = model(token_ids, attention_mask=attention)[0]

    # Collapse the sequence axis so each tweet becomes one feature vector.
    pooled = layers.GlobalMaxPool1D()(sequence_output)
    normalised = layers.BatchNormalization()(pooled)
    hidden = layers.Dense(128, activation='relu')(normalised)
    regularised = layers.Dropout(0.1)(hidden)
    probability = layers.Dense(1, activation='sigmoid', name='outputs')(regularised)

    return tf.keras.Model(inputs=[token_ids, attention], outputs=probability)

# Build the classifier and compile it for binary classification with Adam at 3e-5.
tfmodel = create_model()
optimizer = tf.keras.optimizers.Adam(3e-5)
# optimizer = tf.keras.optimizers.Adam(2e-5)
tfmodel.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Keras matches these dict keys to the Input layers of the same name.
train_inputs = {"input_ids": encoding["input_ids"], 'attention_mask': encoding["attention_mask"]}
dev_inputs = {"input_ids": dev_encode["input_ids"], 'attention_mask': dev_encode["attention_mask"]}

# Train for 10 epochs, validating on the dev split after each epoch.
history = tfmodel.fit(
    train_inputs,
    train_label,
    validation_data=(dev_inputs, dev_label),
    epochs=10,
)

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


(1567, 100)


In [4]:
# tfmodel.save("model3.h5")

# dev_data=preprocess_token(dev_data)

# dev_text = dev_data['main_tweet'].tolist()
# dev_encode = tokenizer(dev_text,truncation=True,max_length=100, padding=True, return_tensors="tf")
# dev_label = le.transform(dev_data['label'])


# optimizer = tf.keras.optimizers.Adam(2e-5)
# tfmodel.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])



# tfmodel.evaluate({"input_ids": dev_encode["input_ids"], 'attention_mask': dev_encode["attention_mask"]}, dev_label)

In [5]:
# Clean and encode the dev split. padding='max_length' pads every tweet to exactly
# 100 tokens, matching the fixed (100,) inputs of the Keras model; padding=True would
# pad only to the longest tweet in the split.
dev_data=preprocess_token(dev_data)

dev_text = dev_data['main_tweet'].tolist()
dev_encode = tokenizer(dev_text,truncation=True,max_length=100, padding='max_length', return_tensors="tf")
dev_label = le.transform(dev_data['label'])

# Keep every test row (dropNa=False). The submission file is indexed by row position,
# so dropping empty tweets would shift every later Id and shorten the file.
test_data=preprocess_token(test_data, dropNa=False)

test_text = test_data['main_tweet'].tolist()
test_encode = tokenizer(test_text,truncation=True,max_length=100, padding='max_length', return_tensors="tf")

In [6]:
# #### To run for Bert+2e-5

# tfmodel.load_weights('bert-weight.h5')
# optimizer = tf.keras.optimizers.Adam(2e-5)
# tfmodel.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
# tfmodel.evaluate({"input_ids": dev_encode["input_ids"], 'attention_mask': dev_encode["attention_mask"]}, dev_label)

# ##create Bert for test 2e5
# prediction = tfmodel.predict({"input_ids": test_encode["input_ids"], 'attention_mask': test_encode["attention_mask"]})
# prediction = (prediction > 0.5).astype("int32")
# prediction = np.ndarray.flatten(prediction)
# pd.DataFrame({"Predicted":  prediction}).to_csv('Bert+2e5-submit.csv', index_label="Id")



In [7]:
# Print the layer-by-layer architecture and parameter counts of the classifier.
tfmodel.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 100)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 100)]        0           []                               
                                                                                                  
 tf_bert_model (TFBertModel)    TFBaseModelOutputWi  109482240   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 100,                                           

In [8]:
# #create Bert for 2e5 for dev
# predictionxx = tfmodel.predict({"input_ids": dev_encode["input_ids"], 'attention_mask': dev_encode["attention_mask"]})
# predictionxx = (predictionxx > 0.5).astype("int32")
# predictionxx = np.ndarray.flatten(predictionxx)
# print("Bert+2e5 on dev set")
# print(classification_report(dev_label, predictionxx, digits=4))

Bert+2e5 on dev set
              precision    recall  f1-score   support

           0     0.9663    0.9571    0.9617       420
           1     0.8487    0.8783    0.8632       115

    accuracy                         0.9402       535
   macro avg     0.9075    0.9177    0.9125       535
weighted avg     0.9411    0.9402    0.9406       535



In [9]:
#### To run for Bert+3e-5

# tfmodel.load_weights('model3.h5')
# Recompile with a fresh Adam(3e-5) optimizer, then score the dev split.
optimizer = tf.keras.optimizers.Adam(3e-5)
tfmodel.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

dev_eval_inputs = {"input_ids": dev_encode["input_ids"], 'attention_mask': dev_encode["attention_mask"]}
tfmodel.evaluate(dev_eval_inputs, dev_label)

##create Bert for test 3e5
test_inputs = {"input_ids": test_encode["input_ids"], 'attention_mask': test_encode["attention_mask"]}
test_scores = tfmodel.predict(test_inputs)

# Threshold the sigmoid scores at 0.5 and flatten (n, 1) -> (n,) for the submission column.
predictionz = (test_scores > 0.5).astype("int32").flatten()

submission = pd.DataFrame({"Predicted":  predictionz})
submission.to_csv('Bert+3e5-submit.csv', index_label="Id")



In [10]:
# #create Bert for 3e5 for dev
# predictionzz = tfmodel.predict({"input_ids": dev_encode["input_ids"], 'attention_mask': dev_encode["attention_mask"]})
# predictionzz = (predictionzz > 0.5).astype("int32")
# predictionzz = np.ndarray.flatten(predictionzz)
# print("Bert+3e5 on dev set")
# print(classification_report(dev_label, predictionzz, digits=4))

Bert+3e5 on dev set
              precision    recall  f1-score   support

           0     0.9669    0.9738    0.9703       420
           1     0.9018    0.8783    0.8899       115

    accuracy                         0.9533       535
   macro avg     0.9343    0.9260    0.9301       535
weighted avg     0.9529    0.9533    0.9530       535

