In [11]:
from os import listdir as ls
import pandas as pd
import json
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow import keras 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow.keras.layers
import politifact_func as functions

import numpy as np

In [12]:
f = "/home/DAVIDSON/brwiedenbeck/public/NLP/politifact.json"

In [28]:
statements, values = functions.json_loader(f)

In [14]:
X_train, X_test, y_train, y_test = functions.process_data(statements, values, 6)

In [15]:
model = "https://tfhub.dev/google/nnlm-en-dim50/2"

In [16]:
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

In [17]:
hub_layer = hub.KerasLayer(embed, input_shape=[], dtype=tf.string, trainable=True)

In [18]:
model = tf.keras.Sequential()
model.add(hub_layer)
model.add(tf.keras.layers.Reshape((1, 512)))
model.add(tf.keras.layers.LSTM(128))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(1000, activation='relu'))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(6))

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
keras_layer_1 (KerasLayer)   (None, 512)               256797824 
_________________________________________________________________
reshape_1 (Reshape)          (None, 1, 512)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               328192    
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1000)              129000    
_________________________________________________________________
dropout_3 (Dropout)          (None, 1000)              0         
_________________________________________________________________
dense_3 (Dense)              (None, 6)                

In [20]:
model.compile(optimizer=tf.keras.optimizers.Adam(), 
              loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [21]:
batch_size=1024
epochs=10

In [22]:
model.fit(x=X_train, y=y_train,
          validation_split=0.1,
          batch_size=batch_size,
          epochs=epochs
         learning_rate=0.00001)
          

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1551db144ca0>

In [23]:
train_score = model.evaluate(X_train, y_train, verbose=1)
print('Train loss:', train_score[0])
print('Train accuracy:', train_score[1])
test_score = model.evaluate(X_test, y_test, verbose=1)
print('Test loss:', test_score[0])
print('Test accuracy:', test_score[1])

Train loss: 0.652886688709259
Train accuracy: 0.902251660823822
Test loss: 5.5085768699646
Test accuracy: 0.28409358859062195


In [None]:
dimension_size=100
vocabulary_size=100_000
max_length=40

In [None]:
tokenizer = Tokenizer(num_words=vocabulary_size)

In [None]:
 X_train, X_test, y_train, y_test = train_test_split(statements, values, test_size=0.2)

In [None]:
tokenizer.fit_on_texts(statements)
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [None]:

X_train = pad_sequences(X_train, dtype="float")
X_test = pad_sequences(X_test, dtype="float")

In [None]:
glove_embedded = "glove.6B.300d.txt" 


def getting_glove_embedding(filename): 
    file = open(filename,'r')
    lines = file.readlines()
    embedding = dict()
    for line in lines:
        parts = line.split()
        embedding[parts[0]] = np.asarray(parts[1:],dtype='float32')
        
    return(embedding)
    

def embedding_weights(vocab,raw_embedding):
    
    vocab_size = len(vocab)+1
    weight_matrix = np.zeros((vocab_size,300)) #each word with 300 dimensions
    
    for word , i in vocab.items():
        vector = raw_embedding.get(word)
        if vector is not None:
            weight_matrix[i] = vector
            
    return(weight_matrix)
            
        
raw_embedding = getting_glove_embedding(glove_embedded)
print("<<<<<<< Raw Embedding Loaded >>>>>>>")
embedding_vectors = embedding_weights(tokenizer.word_index,raw_embedding)

In [None]:
embedding_layer = tf.keras.layers.Embedding(len(tokenizer.word_index)+1,300, weights=[embedding_vectors])

In [None]:
model = Sequential()
model.add(embedding_layer)
model.add(tf.keras.layers.LSTM(128))
model.add(tf.keras.layers.Dense(1000, activation='relu'))
model.add(tf.keras.layers.Dense(6))

model.summary()

In [None]:
model.compile(optimizer=tf.keras.optimizers.Adam(), 
              loss=tf.keras.losses.CategoricalCrossentropy(),
              metrics=['accuracy'])

In [None]:
batch_size=64
epochs=10

In [None]:
classes=6
y_train = keras.utils.to_categorical(y_train, classes)
y_test = keras.utils.to_categorical(y_test, classes)

In [None]:
model.fit(x=X_train, y=y_train,
          validation_split=0.1,
          batch_size=batch_size,
          epochs=epochs)
          

In [None]:
train_score = model.evaluate(X_train, y_train, verbose=1)
print('Train loss:', train_score[0])
print('Train accuracy:', train_score[1])
test_score = model.evaluate(X_test, y_test, verbose=1)
print('Test loss:', test_score[0])
print('Test accuracy:', test_score[1])

In [26]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [33]:
def preprocess_function(examples):
    return tokenizer(examples, truncation=True)

In [34]:
preprocess_function(statements[0])

{'input_ids': [101, 2198, 19186, 4941, 10528, 28548, 2005, 2945, 1000, 2040, 2020, 2069, 1999, 10528, 2138, 1997, 2966, 11727, 2027, 2481, 1005, 1056, 3477, 1012, 1000, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [35]:
tokenized_statements = preprocess_function(statements)

In [36]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

In [37]:
import evaluate

accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [38]:
import numpy as np


def compute_metrics(eval_pred):
     predictions, labels = eval_pred
     predictions = np.argmax(predictions, axis=1)
     return accuracy.compute(predictions=predictions, references=labels)

In [41]:
id2label = {0:"pants-fire",1: "false", 2: "mostly-false",3 : "half-true", 4: "mostly-true", 5: "true" }
label2id = {"pants-fire":0 ,"false": 1, "mostly-false":2, "half-true":3, "mostly-true":4, "true":5 }

In [59]:
X_train, X_test, y_train, y_test = train_test_split(tokenized_statements, values, test_size=0.2)

ValueError: Found input variables with inconsistent numbers of samples: [2, 21152]

In [63]:
# X_train = preprocess_function(X_train)
X_test = preprocess_function(X_test)

In [44]:
from transformers import create_optimizer
>>> import tensorflow as tf

>>> batch_size = 16
>>> num_epochs = 5
>>> batches_per_epoch = len(X_train) // batch_size
>>> total_train_steps = int(batches_per_epoch * num_epochs)
>>> optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)

In [47]:
from transformers import TFAutoModelForSequenceClassification

model = TFAutoModelForSequenceClassification.from_pretrained(
     "distilbert-base-uncased", num_labels=6, id2label=id2label, label2id=label2id
)

Downloading:   0%|          | 0.00/363M [00:00<?, ?B/s]

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_transform', 'vocab_layer_norm', 'activation_13', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['dropout_23', 'classifier', 'pre_classifier']
You should probably TRAIN this model on a down-stream task to be able to use i

In [52]:
import tensorflow as tf

model.compile(optimizer=optimizer)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [67]:
type(X_train)

transformers.tokenization_utils_base.BatchEncoding

In [94]:
tf_train_set = model.prepare_tf_dataset(
     ds,
     shuffle=True,
     batch_size=16,
     collate_fn=data_collator,
 )

# tf_validation_set = model.prepare_tf_dataset(
#     ds,
#      shuffle=False,
#      batch_size=16,
#      collate_fn=data_collator,
#  )

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided []

In [68]:
model.fit(x=X_train, validation_data=X_test, batch_size=16, epochs=3)

ValueError: Failed to find data adapter that can handle input: <class 'transformers.tokenization_utils_base.BatchEncoding'>, <class 'NoneType'>

In [71]:
import pandas as pd

In [74]:
len(statements)

21152

In [79]:
data= pd.DataFrame(statements)

In [81]:
data['y'] = values

In [89]:
ds = datasets.Dataset.from_pandas(data)

  return cls(pa.Table.from_pandas(*args, **kwargs))


In [96]:
ds['0']

['John McCain opposed bankruptcy protections for families "who were only in bankruptcy because of medical expenses they couldn\'t pay."',
 '"Bennie Thompson actively cheer-led riots in the ’90s."',
 'Says\xa0Maggie Hassan was "out of state on 30 days over the last three months."',
 '"BUSTED: CDC Inflated COVID Numbers, Accused of Violating Federal Law"',
 '"I\'m the only (Republican)\xa0candidate that has actually reduced the size of government."',
 '"There are actually only 30 countries that practice birthright citizenship."',
 '"My husband and I have never gotten a penny of money from the farm."',
 '"If you go strictly by the numbers, crime is down across the board. Last year we had a 10 percent decrease in the most serious crimes."',
 '"The American people say, don\'t touch Social Security, don\'t touch Medicare, don\'t cut defense. That\'s 84 percent of the federal budget."',
 '"Since 1978, CEO compensation rose over 1,000% and only 11.9% for average workers."',
 'Says her accompli

In [87]:
import datasets

In [93]:
type(ds['0'])

list