<a href="https://colab.research.google.com/github/Mogreine29/HandsOnAI_2/blob/main/Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers
!pip install datasets
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from transformers import AutoTokenizer
import numpy as np
import pandas as pd
import re
import string
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
from wordcloud import WordCloud

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
from google.colab import drive
drive.mount('/content/drive')
%cd drive/My\ Drive/Challenge2

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/.shortcut-targets-by-id/1aIAK0Qi-pofkSLi3xijkAex-JrSW_tn8/Challenge2


In [4]:
# Read the train and test CSV files of the challenge from the mounted Drive.
df_train, df_test = map(
    pd.read_csv,
    (
        '/content/drive/MyDrive/Challenge2/fake_train.csv',
        '/content/drive/MyDrive/Challenge2/fake_test.csv',
    ),
)

In [5]:
# Discard the 'Unnamed: 0' and 'target_name' columns from both frames.
df_train = df_train.drop(columns=['Unnamed: 0', 'target_name'])
df_test = df_test.drop(columns=['Unnamed: 0', 'target_name'])

In [6]:
# Remove six training rows by index label (in place).
# NOTE(review): the reason these rows are excluded is not visible here —
# presumably they are empty or malformed; confirm and document.
df_train.drop(
    index=[1136, 1180, 1294, 1317, 1362, 1429],
    inplace=True,
)

In [7]:
# Load the additional real/fake news corpora (paths are relative to the
# current working directory), keep only the article text renamed to 'data',
# and attach the class label: 0 for real news, 1 for fake news.
df_real = (
    pd.read_csv("real_news.csv")
    .drop(columns=['site', 'url', 'title'])
    .rename(columns={'text': 'data'})
    .assign(label=0)
)
df_fake = (
    pd.read_csv("fake_news.csv")
    .drop(columns=['site', 'url', 'title'])
    .rename(columns={'text': 'data'})
    .assign(label=1)
)

# DataFrame.append was deprecated and then removed in pandas 2.0; pd.concat
# is the supported equivalent and stacks all three frames in a single pass.
df_train = pd.concat([df_train, df_real, df_fake], ignore_index=True)

In [8]:
# Set of NLTK French stop words, used by the text cleaner for fast membership tests.
STOPWORDS = {word for word in stopwords.words('french')}

def cleaner(text, stop_words=None):
    """Normalise a raw document into a space-separated string of kept words.

    Steps, in order: lowercase; drop ``[...]`` spans; drop URLs; drop HTML
    tags; turn every non-word character into a space; strip remaining ASCII
    punctuation (only ``_`` survives the previous step); drop any token that
    contains a digit; finally remove stop words.

    URLs and HTML tags are removed *before* non-word characters are replaced.
    Doing it the other way round turns ``://``, ``.``, ``<`` and ``>`` into
    spaces first, so the URL and tag patterns can never match.

    Args:
        text: The document to clean (a ``str``).
        stop_words: Optional collection of words to drop. Defaults to the
            module-level ``STOPWORDS`` set.

    Returns:
        The cleaned text, words joined by single spaces.
    """
    if stop_words is None:
        stop_words = STOPWORDS

    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    # Replace with a space so words on either side of a URL/tag do not merge.
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)
    text = re.sub(r'<.*?>+', ' ', text)
    text = re.sub(r'\W', ' ', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)

    return ' '.join(word for word in text.split() if word not in stop_words)

In [9]:
# Run the text cleaner over every document of both frames.
df_train['data'] = df_train['data'].map(cleaner)
df_test['data'] = df_test['data'].map(cleaner)

In [49]:
# Pool the test rows with the training rows before re-splitting below.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# NOTE: this rebinds df_train, so re-running the cell without re-running the
# cells above adds the test rows a second time.
df_train = pd.concat([df_train, df_test], ignore_index=True)

# Shuffle all rows with a fixed seed so the train/validation/test split is
# reproducible across runs, and discard the old index in the same step.
shuffled = df_train.sample(frac=1, random_state=42).reset_index(drop=True)

In [12]:
class TransformerBlock(layers.Layer):
    """Transformer encoder block: self-attention followed by a feed-forward net.

    Each of the two sub-layers is followed by dropout, a residual addition
    and layer normalization.

    Args:
        embed_dim: Width of the feed-forward output; also passed as the
            `key_dim` of the multi-head attention layer.
        num_heads: Number of attention heads.
        ff_dim: Units of the hidden (ReLU) layer of the feed-forward network.
        rate: Dropout rate used after attention and after the feed-forward net.
        **kwargs: Forwarded to `layers.Layer` (e.g. `name`, `dtype`).
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can report them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to `inputs`; `training` toggles the dropout layers."""
        # Self-attention: the same tensor is used as query and value.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized."""
        config = super().get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
                "ff_dim": self.ff_dim,
                "rate": self.rate,
            }
        )
        return config

In [13]:
class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        maxlen: Number of distinct positions the position embedding can index.
        vocab_size: Number of distinct token ids the token embedding can index.
        embed_dim: Output size of both embeddings.
        **kwargs: Forwarded to `layers.Layer` (e.g. `name`, `dtype`).
    """

    def __init__(self, maxlen, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can report them.
        self.maxlen = maxlen
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed token ids `x` and add the embedding of each position index."""
        # Positions 0..L-1, where L is the size of the last axis of `x`.
        seq_len = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized."""
        config = super().get_config()
        config.update(
            {
                "maxlen": self.maxlen,
                "vocab_size": self.vocab_size,
                "embed_dim": self.embed_dim,
            }
        )
        return config

In [14]:
# Tokenizer loaded from the "Jean-Baptiste/camembert-ner" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("Jean-Baptiste/camembert-ner")


def preprocess_function(text):
    """Tokenize `text` with truncation and padding enabled; return the encoding."""
    encoding = tokenizer(text, padding=True, truncation=True)
    return encoding

In [50]:
# Pull the target and the text columns out of the shuffled frame
# (pop removes each column from `shuffled` as it is returned).
label, text = shuffled.pop('label'), shuffled.pop('data')

In [51]:
# NumPy views of the label and text columns for the tokenisation and split steps.
label_arr, text_arr = label.to_numpy(), text.to_numpy()

In [52]:
# Tokenize every document into a list of token ids.
# The ids go into a *new* object array rather than overwriting the strings in
# place, so this cell can be re-run: `text` is read and never modified.
token_ids = np.empty(len(text), dtype=object)
for i, document in enumerate(text):
    token_ids[i] = preprocess_function(document).input_ids
text_arr = token_ids

# Show only a few entries; displaying the whole array floods the notebook.
text_arr[:3]

array([list([5, 79, 21, 21104, 10, 1952, 6120, 190, 12885, 7637, 794, 22631, 2309, 700, 2877, 6705, 405, 21, 265, 5874, 262, 724, 7145, 6700, 290, 3089, 112, 7833, 1484, 7453, 104, 179, 6254, 1952, 1819, 313, 78, 1263, 1883, 5696, 12984, 17384, 1890, 2322, 3881, 1653, 208, 1194, 12984, 151, 209, 1241, 250, 5665, 3708, 25887, 1713, 6551, 20394, 10, 3173, 1729, 6551, 22944, 1033, 254, 838, 33, 1263, 1708, 5744, 90, 651, 1205, 3720, 35, 1691, 3650, 10, 55, 1352, 6165, 21, 5738, 375, 284, 132, 3885, 3650, 10, 55, 724, 12038, 15928, 7756, 3832, 343, 321, 7898, 11041, 35, 28278, 3754, 1187, 1358, 199, 7082, 5814, 2329, 1018, 589, 5814, 174, 209, 947, 771, 9307, 1248, 472, 3376, 11667, 15928, 1189, 18014, 12266, 13726, 17966, 3832, 343, 321, 7898, 11041, 35, 3754, 1187, 1358, 488, 281, 1352, 4520, 199, 5814, 589, 1018, 183, 3832, 599, 236, 7873, 5096, 2449, 1380, 5305, 14141, 771, 2254, 3896, 3650, 10, 55, 771, 7883, 1203, 2142, 5430, 2254, 2353, 1883, 771, 9157, 570, 10812, 472, 1535, 1352, 

In [57]:
def split_list(a_list, p):
    """Split a sequence into a head and a tail, the tail holding a fraction `p`.

    Args:
        a_list: Any sliceable sequence (list, NumPy array, ...).
        p: Fraction of the items, between 0 and 1, that goes to the tail.

    Returns:
        A ``(head, tail)`` tuple where ``head`` has ``floor(len * (1 - p))``
        items and ``tail`` has the rest.

    Raises:
        ValueError: If `p` is outside the range [0, 1].
    """
    if not 0 <= p <= 1:
        raise ValueError(f"p must be between 0 and 1, got {p!r}")
    # The previous `len // (1 / (1 - p))` suffered from float error (e.g.
    # 10 items with p=0.1 gave 8 instead of 9) and divided by zero for p=1.
    # Rounding to 6 decimals before truncating absorbs representation noise
    # such as 28.999999999999996.
    split = int(round(len(a_list) * (1 - p), 6))
    return a_list[:split], a_list[split:]
# Hold out the last 20% of the samples as the test set ...
(x_train, x_test), (y_train, y_test) = (
    split_list(arr, 0.2) for arr in (text_arr, label_arr)
)
# ... then the last 25% of what remains as the validation set.
(x_train, x_val), (y_train, y_val) = (
    split_list(arr, 0.25) for arr in (x_train, y_train)
)

In [58]:
# Fixed number of token ids per document fed to the model.
# NOTE(review): pad_sequences is called without padding=/truncating= arguments;
# per the Keras docs both default to "pre", which would keep the *last* 200
# tokens of longer documents rather than the first — confirm this is intended.
# NOTE(review): x_test is not padded in this cell.
maxlen = 200

print(len(x_train), "Training sequences")
print(len(x_val), "Validation sequences")

x_train, x_val = (
    keras.preprocessing.sequence.pad_sequences(sequences, maxlen=maxlen)
    for sequences in (x_train, x_val)
)

13247 Training sequences
4416 Validation sequences


In [62]:
embed_dim = 32  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer
# Size the token-embedding table from the tokenizer that produced the ids
# instead of a hard-coded constant: every id the tokenizer can emit is in
# range, and no rows are allocated for ids it can never produce.
vocab_size = len(tokenizer)

# Token ids -> token+position embeddings -> one transformer block ->
# average over the sequence axis -> small dense head -> 2-way softmax.
inputs = layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(20, activation="relu")(x)
x = layers.Dropout(0.1)(x)
outputs = layers.Dense(2, activation="softmax")(x)

model = keras.Model(inputs=inputs, outputs=outputs)

In [63]:
# Integer class labels + softmax output -> sparse categorical cross-entropy.
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

# Train for two epochs, evaluating on the validation split after each one.
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=2,
    batch_size=32,
)

Epoch 1/2
Epoch 2/2
