In [1]:
import numpy as np
import pandas as pd

In [2]:
#pip install datasets

In [3]:
# Balance the clusters

def balance_df(df1, df2, random_state = 42):
    df = (df1, df2)
    lenght = (len(df[0]), len(df[1]))
    idx = np.argmin([lenght[0], lenght[1]])
    return pd.concat([df[idx], df[1-idx].sample(lenght[idx], random_state=random_state)], axis = 0)

In [4]:
# Remove regulars expressions

import re
import string
def custom_preprocessor(text):
    '''
    Make text lowercase, remove text in square brackets,remove links,remove special characters
    and remove words containing numbers.
    '''
    text = text.lower()
    text = re.sub('u.s.', 'usa', text)
    text = re.sub('can\'t', 'cant', text)
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W"," ",text) # remove special chars
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)

    return text

In [5]:
fake_news = pd.read_csv('Fake.csv')
true_news = pd.read_csv('True.csv')

fake_news['target'] = 0
true_news['target'] = 1

news = balance_df(fake_news, true_news)
news['text'] = news['text'].apply(custom_preprocessor)
news['title'] = news['title'].apply(custom_preprocessor)

In [6]:
from sklearn.model_selection import train_test_split

feature = 'title'
target = 'target'

X = news[feature]
y = news[target]
X_train,X_test,y_train,y_test = train_test_split(X, y,random_state=42,test_size=0.2)
X_train,X_val,y_train,y_val = train_test_split(X_train, y_train,random_state=42,test_size=0.2)

In [7]:
from sklearn.model_selection import train_test_split

feature = 'title'
target = 'target'

X = news[[feature, target]]

X_train, X_val = train_test_split(X,random_state=42,test_size=0.2)
X_train,X_test = train_test_split(X_train,random_state=42,test_size=0.2)

In [8]:
# Save preprocessed set for future use.

X_train.to_csv('train.csv', index=False)
X_val.to_csv('validation.csv', index=False)
X_test.to_csv('test.csv', index=False)

In [9]:
import datasets

In [10]:
raw_datasets = datasets.load_dataset('csv', data_files={'train': 'train.csv',
                                                        'validation': 'validation.csv',
                                                        'test': 'test.csv'})

Using custom data configuration default-b735a9fe911e6231


Downloading and preparing dataset csv/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /home/jovyan/.cache/huggingface/datasets/csv/default-b735a9fe911e6231/0.0.0/e138af468cb14e747fb46a19c787ffcfa5170c821476d20d5304287ce12bbc23...


0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /home/jovyan/.cache/huggingface/datasets/csv/default-b735a9fe911e6231/0.0.0/e138af468cb14e747fb46a19c787ffcfa5170c821476d20d5304287ce12bbc23. Subsequent calls will reuse this data.


In [11]:
raw_datasets['train']['title'][0:10]

['comey to testify to senate panel in public session',
 'u n  condemns anti gay crackdowns in egypt  azerbaijan  indonesia',
 ' watch  the daily show epically destroys fox news for blatant racism',
 'trump scrambles to convince americans he can handle puerto rico crisis',
 'turkey should follow west s lead on rights  author orhan pamuk',
 'malaysia s ruling party unites behind najib as election looms',
 'abc news reports  las vegas massacre suspect s hard drive is missing from his laptop',
 'trumpdom  the curious world of trump s foreign policy explained',
 'wow  what john kasich just asked cruz and trump to do proves he s got an ego the size of texas',
 'breaking  trump hits back at rep john lewis who declared trump s presidency  illegitimate ']

In [12]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [13]:
#model_name = "roberta-fake-news"
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)#, local_files_only=True)

In [14]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, local_files_only=True)

In [15]:
def tokenize_function(examples):
    return tokenizer(examples[feature], padding='max_length', truncation=True, max_length = 50)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

  0%|          | 0/28 [00:00<?, ?ba/s]

  0%|          | 0/9 [00:00<?, ?ba/s]

  0%|          | 0/7 [00:00<?, ?ba/s]

In [16]:
tokenized_datasets['train']['input_ids'][0]

[101,
 2369,
 1996,
 11002,
 2129,
 8112,
 3488,
 2000,
 4652,
 3056,
 2591,
 3036,
 3841,
 12879,
 24108,
 5134,
 2013,
 19273,
 4409,
 102,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [17]:
tokenized_datasets['train']['title'][0:10]

['comey to testify to senate panel in public session',
 'u n  condemns anti gay crackdowns in egypt  azerbaijan  indonesia',
 ' watch  the daily show epically destroys fox news for blatant racism',
 'trump scrambles to convince americans he can handle puerto rico crisis',
 'turkey should follow west s lead on rights  author orhan pamuk',
 'malaysia s ruling party unites behind najib as election looms',
 'abc news reports  las vegas massacre suspect s hard drive is missing from his laptop',
 'trumpdom  the curious world of trump s foreign policy explained',
 'wow  what john kasich just asked cruz and trump to do proves he s got an ego the size of texas',
 'breaking  trump hits back at rep john lewis who declared trump s presidency  illegitimate ']

In [18]:
import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification, AutoConfig
#config = AutoConfig.from_pretrained(model_name)
model = TFAutoModelForSequenceClassification.from_pretrained(model_name, num_labels = 2)

2021-07-19 20:05:56.764444: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-07-19 20:05:56.764495: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2021-07-19 20:05:58.026441: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2021-07-19 20:05:58.026483: W tensorflow/stream_executor/cuda/cuda_driver.cc:326] failed call to cuInit: UNKNOWN ERROR (303)
2021-07-19 20:05:58.026506: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (3383a395dced): /proc/driver/nvidia/version does not exist
2021-07-19 20:05:58.026679: I tensorflow/core/platform/cpu_featu

In [19]:
model

<transformers.models.distilbert.modeling_tf_distilbert.TFDistilBertForSequenceClassification at 0x7fe0d232e1f0>

In [20]:
full_train_dataset = tokenized_datasets['train']
full_eval_dataset = tokenized_datasets['validation']
full_test_dataset = tokenized_datasets['test']
full_test_dataset

Dataset({
    features: ['attention_mask', 'input_ids', 'target', 'title'],
    num_rows: 6854
})

In [21]:
total_len = len(full_train_dataset) + len(full_eval_dataset) + len(full_test_dataset)
total_len

42834

In [22]:
len(full_test_dataset)/total_len

0.16001307372647897

In [23]:
TOTAL_RECORDS = len(full_test_dataset)# 4000
SEED = 3456

In [24]:
train_dataset = tokenized_datasets['train'].shuffle(seed=SEED).select(range(round(len(full_train_dataset)/total_len*TOTAL_RECORDS)))
eval_dataset = tokenized_datasets['validation'].shuffle(seed=SEED).select(range(round(len(full_eval_dataset)/total_len*TOTAL_RECORDS)))
test_dataset = tokenized_datasets['test']#.shuffle(seed=SEED).select(range(round(len(full_test_dataset)/total_len*TOTAL_RECORDS)))

In [25]:
train_dataset.remove_columns([feature]).with_format("tensorflow")[0]

{'attention_mask': <tf.Tensor: shape=(50,), dtype=int64, numpy=
 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0])>,
 'input_ids': <tf.Tensor: shape=(50,), dtype=int64, numpy=
 array([  101, 27885,  3630, 25171, 24247,  6398,  2003,  3718,  2013,
         8398,  2811,  3034,  8398,  5176,  3036,  2000,  3288,  2032,
         2067,   102,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0])>,
 'target': <tf.Tensor: shape=(), dtype=int64, numpy=0>}

In [26]:
test_dataset.remove_columns([feature]).with_format("tensorflow")[0]

{'attention_mask': <tf.Tensor: shape=(50,), dtype=int64, numpy=
 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0])>,
 'input_ids': <tf.Tensor: shape=(50,), dtype=int64, numpy=
 array([  101,  5095,  2305,  2444,  1055,  2695, 17331, 16181,  2001,
         3599,  1996,  4756,  2057,  2734,  2678,   102,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0])>,
 'target': <tf.Tensor: shape=(), dtype=int64, numpy=0>}

In [27]:
tf_train_dataset = train_dataset.remove_columns([feature]).with_format("tensorflow")
tf_eval_dataset = eval_dataset.remove_columns([feature]).with_format("tensorflow")
#tf_test_dataset = test_dataset.remove_columns([feature,"target"]).with_format("tensorflow")
tf_test_dataset = test_dataset.remove_columns([feature]).with_format("tensorflow")

In [28]:
tf_train_dataset

Dataset({
    features: ['attention_mask', 'input_ids', 'target'],
    num_rows: 4386
})

In [29]:
tf_test_dataset

Dataset({
    features: ['attention_mask', 'input_ids', 'target'],
    num_rows: 6854
})

In [30]:
BATCH_SIZE = 8

Convert everything in big tensor and use from_tensor_slices method so that data can be fed into the network.

In [31]:
train_features = {x: tf_train_dataset[x].to_tensor() for x in tokenizer.model_input_names}
train_tf_dataset = tf.data.Dataset.from_tensor_slices((train_features, tf_train_dataset[target]))
train_tf_dataset = train_tf_dataset.shuffle(len(tf_train_dataset)).batch(BATCH_SIZE)

In [32]:
eval_features = {x: tf_eval_dataset[x].to_tensor() for x in tokenizer.model_input_names}
eval_tf_dataset = tf.data.Dataset.from_tensor_slices((eval_features, tf_eval_dataset[target]))
eval_tf_dataset = eval_tf_dataset.batch(BATCH_SIZE)

In [39]:
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
    #loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    #metrics=tf.metrics.SparseCategoricalAccuracy(),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=tf.keras.metrics.BinaryAccuracy()
)

In [38]:
model.summary()

Model: "tf_distil_bert_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
distilbert (TFDistilBertMain multiple                  66362880  
_________________________________________________________________
pre_classifier (Dense)       multiple                  590592    
_________________________________________________________________
classifier (Dense)           multiple                  1538      
_________________________________________________________________
dropout_19 (Dropout)         multiple                  0         
Total params: 66,955,010
Trainable params: 66,955,010
Non-trainable params: 0
_________________________________________________________________


In [40]:
RUN_MODEL = False

In [41]:
if RUN_MODEL:
    model.fit(train_tf_dataset, validation_data=eval_tf_dataset, epochs=5, verbose=True)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [42]:
if RUN_MODEL:
    model.save_pretrained("my_model_title_BinaryCrossentropy")

In [43]:
test_features = {x: tf_test_dataset[x].to_tensor() for x in tokenizer.model_input_names}
test_tf_dataset = tf.data.Dataset.from_tensor_slices((test_features, tf_test_dataset[target]))
test_tf_dataset = test_tf_dataset.batch(BATCH_SIZE)

In [44]:
#test_features = {x: tf_test_dataset[x].to_tensor() for x in tokenizer.model_input_names}
#test_tf_dataset = tf.data.Dataset.from_tensor_slices((test_features))#, tf_test_dataset[target]))
##test_tf_dataset = test_tf_dataset.shuffle(seed=SEED).select(range(round(len(full_test_dataset)/total_len*TOTAL_RECORDS)))

In [52]:
prediction_model_name = "my_model_title_BinaryCrossentropy"
model = TFAutoModelForSequenceClassification.from_pretrained(prediction_model_name, num_labels = 2)
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    #loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    #metrics=tf.metrics.SparseCategoricalAccuracy(),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=tf.keras.metrics.BinaryAccuracy()
)

Some layers from the model checkpoint at my_model_title_BinaryCrossentropy were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at my_model_title_BinaryCrossentropy and are newly initialized: ['dropout_79']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [46]:
#full_test_dataset = tokenized_datasets['test']
#full_test_dataset

In [47]:
#test_set = full_test_dataset.remove_columns([target]).with_format("tensorflow")
#test_set

In [48]:
#test_features = {x: test_set[x].to_tensor() for x in tokenizer.model_input_names}
#test_tf_dataset = tf.data.Dataset.from_tensor_slices(test_features)
#test_tf_dataset

In [49]:
#test_features

In [53]:
prediction = model.evaluate(test_tf_dataset)
prediction



[0.2927630841732025, 0.9002771973609924]

In [54]:
prediction = model.predict(test_tf_dataset)



In [68]:
prediction

TFSequenceClassifierOutput(loss=None, logits=array([[-0.00221144, -0.32367894],
       [ 0.18790331,  0.353192  ],
       [-0.01083675, -0.08599666],
       ...,
       [ 0.4887334 ,  0.60828316],
       [ 1.0843945 ,  0.99636275],
       [ 0.7886805 ,  0.6986307 ]], dtype=float32), hidden_states=None, attentions=None)

In [75]:
def inv_logit(p):
    if p > 0:
        return 1. / (1. + np.exp(-p))
    elif p <= 0:
        np.exp(p) / (1 + np.exp(p))
    else:
        raise ValueError
        
def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()

TypeError: op needs to be an Operation: 1.0

In [83]:
result = pd.DataFrame(prediction.logits, columns = ['Fake', 'True'])
result.to_csv('result_title_BinaryCrossentropy.csv', index=False)
result

In [57]:
#result = pd.read_csv('')

In [58]:
solution = pd.DataFrame(np.argmax(prediction.logits, axis=1), columns = ['predict'])
solution

Unnamed: 0,predict
0,0
1,1
2,0
3,0
4,0
...,...
6849,1
6850,1
6851,1
6852,0


In [85]:
result.describe()

Unnamed: 0,Fake,True,False
count,6854.0,6854.0,6854.0
mean,0.570201,1.0,1.0
std,0.536992,0.0,0.0
min,-0.614777,1.0,1.0
25%,0.075886,1.0,1.0
50%,0.478896,1.0,1.0
75%,1.04854,1.0,1.0
max,1.837305,1.0,1.0


In [59]:
solution.describe()

Unnamed: 0,predict
count,6854.0
mean,0.526116
std,0.499354
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [60]:
test_join_solution = X_test.reset_index().join(solution)
test_join_solution['delta'] = test_join_solution['predict'] - test_join_solution['target']

In [61]:
test_join_solution.groupby(['delta']).count()

Unnamed: 0_level_0,index,title,target,predict
delta,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-1,2087,2087,2087,2087
0,2479,2479,2479,2479
1,2288,2288,2288,2288


In [62]:
df_confusion = pd.crosstab(test_join_solution['target'], test_join_solution['predict'])

In [63]:
df_confusion

predict,0,1
target,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1161,2288
1,2087,1318


In [64]:
test_accuracy = 1 - (df_confusion[0][1] + df_confusion[1][0]) / df_confusion.sum().sum()
test_accuracy

0.36168660636124894