<a href="https://colab.research.google.com/github/Hesamalian/MultilingualBert/blob/master/MBERTclassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook uses Keras (https://github.com/vzhou842/cnn-from-scratch) and Simple Transformers (https://github.com/ThilinaRajapakse/simpletransformers) to demonstrate the difference between a CNN text classifier and a Multilingual BERT text classifier.

The latter is built on the Transformers library by HuggingFace 🤗 (https://github.com/huggingface/transformers).

---


> This is a binary sentiment-analysis task on a sample French dataset ("example.csv"), which can be found at https://drive.google.com/open?id=1IyxGimLEytKoAIkbr0wdl_bTdGCuz7uH .

---





Import the required packages: TensorFlow, Keras, PyTorch, scikit-learn and pandas

In [102]:
import sys
import csv
from nltk.corpus import stopwords
from collections import defaultdict
from datetime import datetime
from datetime import timedelta
import matplotlib.pyplot as plt
import re
import json
from collections import Counter
import nltk
import nltk
nltk.download('wordnet')
import warnings
import itertools
import numpy as np 
import pandas as pd 

import torch
import tensorflow
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from keras.utils.np_utils import to_categorical
from keras.layers import Input, Dense, Embedding, Conv2D, MaxPool2D
from keras.layers import Reshape, Flatten, Dropout, Concatenate
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
from keras.models import Model

#import os
#os.environ["CUDA_VISIBLE_DEVICES"]="1,2,3"

from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from scipy.stats import randint as sp_randint

import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import recall_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
#warnings.filterwarnings("ignore", category=DeprecationWarning)
#logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
#config = tensorflow.ConfigProto( device_count = {'GPU': 3 , 'CPU': 3} ) 
#sess = tensorflow.Session(config=config) 
#keras.backend.set_session(sess)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Install and import the transformer packages (pytorch-pretrained-bert and simpletransformers)


In [38]:
!pip install pytorch-pretrained-bert
!pip install simpletransformers
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
from simpletransformers.classification import ClassificationModel



Check that a GPU is available and compare its speed with the CPU

In [41]:
import tensorflow as tf
import timeit

# Ask TensorFlow for the name of the GPU device it can see. Any value other
# than '/device:GPU:0' is treated as "no GPU" and stops the notebook here.
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  # Print the likely cause before raising so the remedy is visible in the cell output.
  print(
      '\n\nThis error most likely means that this notebook is not '
      'configured to use a GPU.  Change this in Notebook Settings via the '
      'command palette (cmd/ctrl-shift-P) or the Edit menu.\n\n')
  raise SystemError('GPU device not found')

def cpu():
  """Convolve a random (100, 100, 100, 3) batch on the CPU and return the summed output.

  A fresh `Conv2D(32, 7)` layer is created on every call, so layer
  construction is part of whatever a caller times.
  """
  with tf.device('/cpu:0'):
    images = tf.random.normal((100, 100, 100, 3))
    conv_layer = tf.keras.layers.Conv2D(32, 7)
    return tf.math.reduce_sum(conv_layer(images))

def gpu():
  """Convolve a random (100, 100, 100, 3) batch on the first GPU and return the summed output.

  A fresh `Conv2D(32, 7)` layer is created on every call, so layer
  construction is part of whatever a caller times.
  """
  with tf.device('/device:GPU:0'):
    images = tf.random.normal((100, 100, 100, 3))
    conv_layer = tf.keras.layers.Conv2D(32, 7)
    return tf.math.reduce_sum(conv_layer(images))
  
# We run each op once to warm up; see: https://stackoverflow.com/a/45067900
cpu()
gpu()

# Run the op several times.
# Each timeit call executes the function 10 times and returns the total
# elapsed seconds; the setup string re-imports the function from the
# notebook's __main__ namespace so the timed statement can see it.
print('Time (s) to convolve 32x7x7x3 filter over random 100x100x100x3 images '
      '(batch x height x width x channel). Sum of ten runs.')
print('CPU (s):')
cpu_time = timeit.timeit('cpu()', number=10, setup="from __main__ import cpu")
print(cpu_time)
print('GPU (s):')
gpu_time = timeit.timeit('gpu()', number=10, setup="from __main__ import gpu")
print(gpu_time)
# int() truncates the ratio, so any speed-up below 2 is reported as "1x"
# (and a slow-down as "0x").
print('GPU speedup over CPU: {}x'.format(int(cpu_time/gpu_time)))

Time (s) to convolve 32x7x7x3 filter over random 100x100x100x3 images (batch x height x width x channel). Sum of ten runs.
CPU (s):
0.180292369999961
GPU (s):
0.1798919330001354
GPU speedup over CPU: 1x


Preprocessing for French Dataset

In [0]:
# Everything except ASCII letters, the back-tick and these lower-case accented
# letters is turned into a space by clean_str.
_DISALLOWED_CHARS = re.compile(r"[^A-Za-z`èéêëôòóœàáâç]")

# ASCII replacement for every accented letter that _DISALLOWED_CHARS lets through.
_ACCENT_FOLD = str.maketrans({
    'è': 'e', 'é': 'e', 'ê': 'e', 'ë': 'e',
    'ô': 'o', 'ò': 'o', 'ó': 'o', 'œ': 'oe',
    'à': 'a', 'á': 'a', 'â': 'a',
    'ç': 'c',
})


def clean_str(string):
    """Reduce a sentence to space-separated, accent-free, lemmatized words.

    Steps, in order:
      1. every character that is not an ASCII letter, a back-tick or one of
         the accented letters in ``_DISALLOWED_CHARS`` becomes a space
         (this covers punctuation, digits and upper-case accented letters,
         so callers are expected to lower-case the text first);
      2. the remaining accented letters are folded to ASCII;
      3. tokens of one or two characters are dropped;
      4. each surviving token goes through ``WordNetLemmatizer.lemmatize``.

    Compared with the previous version, ``à``, ``ë`` and ``œ`` are now folded
    too (``à`` was skipped because ``á`` was listed twice), the contraction
    regexes that could never match were removed, and the lemmatizer is
    created once per call instead of once per token.

    Args:
        string: the text to clean (a ``str``).

    Returns:
        The cleaned tokens joined by single spaces; ``""`` when nothing survives.
    """
    folded = _DISALLOWED_CHARS.sub(" ", string).translate(_ACCENT_FOLD)

    # NOTE(review): WordNet is an English lexical resource; confirm that
    # lemmatizing French tokens with it is intended.
    lemmatizer = nltk.stem.WordNetLemmatizer()

    # str.split() with no argument already collapses runs of whitespace.
    tokens = [lemmatizer.lemmatize(token) for token in folded.split() if len(token) > 2]
    return ' '.join(tokens)

Data-loading helpers for the CNN

In [0]:
def load_data_and_labels_cnn(filename):
    """Read the sentiment CSV and return tokenised sentences with one-hot labels.

    The CSV must provide the columns ``polarity`` and ``statutnull`` (the
    text). A row is labelled ``pos`` when its polarity equals 4 and ``neg``
    otherwise. The comparison is numeric, so it works whether pandas parsed
    the column as integers, floats or strings; comparing against the string
    ``'4'`` silently labelled every row ``neg`` for a numeric column.

    Text is lower-cased and passed through ``clean_str``, which already
    replaces every character outside its whitelist with a space, so no
    separate regex pass is needed here. Only sentences with more than 4 and
    fewer than 100 tokens are kept.

    Args:
        filename: path (or buffer) accepted by ``pandas.read_csv``.

    Returns:
        ``[sentences, labels]`` where ``sentences`` is a list of token lists
        and ``labels`` is a list of length-2 ``uint8`` arrays ordered
        ``[neg, pos]``.
    """
    try:
        # 'warn' mirrors the old error_bad_lines=False behaviour: skip
        # malformed rows but still report them.
        df = pd.read_csv(filename, on_bad_lines='warn', na_values=" ")
    except TypeError:
        # Older pandas releases do not know ``on_bad_lines``.
        df = pd.read_csv(filename, error_bad_lines=False, na_values=" ")

    # Missing text becomes the literal string 'nan', as fillna('nan') did before.
    texts = df['statutnull'].fillna('nan').astype(str).str.lower()

    # Unparseable or missing polarities become NaN and therefore 'neg'.
    is_positive = pd.to_numeric(df['polarity'], errors='coerce').eq(4)
    sentiment = pd.Series(
        pd.Categorical(is_positive.map({True: 'pos', False: 'neg'}),
                       categories=['neg', 'pos']))
    # Fixed categories guarantee two columns even if one class is absent;
    # an explicit dtype keeps integer labels on pandas versions that default to bool.
    one_hot = pd.get_dummies(sentiment, dtype='uint8').values

    xnew = []
    ynew = []
    for text, label in zip(texts, one_hot):
        tokens = clean_str(text).split(" ")
        if 4 < len(tokens) < 100:
            xnew.append(tokens)
            ynew.append(label)
    return [xnew, ynew]
  
def pad_sentences_cnn(sentences, padding_word="<PAD/>"):
    """Right-pad every token list with `padding_word` to the longest sentence's length.

    Returns a new list of new lists; the inputs are not modified.
    """
    target_length = max(map(len, sentences))
    return [
        tokens + [padding_word] * (target_length - len(tokens))
        for tokens in sentences
    ]

def build_vocab_cnn(sentences):
    """Build the word <-> index mappings from tokenised sentences.

    Returns `[vocabulary, vocabulary_inv]`: `vocabulary_inv` is the sorted
    list of distinct words (index -> word) and `vocabulary` maps each word
    to its position in that list (word -> index).
    """
    distinct_words = set(itertools.chain.from_iterable(sentences))
    vocabulary_inv = sorted(distinct_words)
    vocabulary = dict(zip(vocabulary_inv, range(len(vocabulary_inv))))
    return [vocabulary, vocabulary_inv]

def build_input_data_cnn(sentences,labels, vocabulary):
    """Convert token lists to index arrays via `vocabulary`.

    Returns `[x, y]` as NumPy arrays: `x` holds one row of word indices per
    sentence, `y` is `labels` converted with `np.array`.
    """
    index_rows = []
    for sentence in sentences:
        index_rows.append([vocabulary[word] for word in sentence])
    return [np.array(index_rows), np.array(labels)]

def load_data_cnn(filename):
    """Run the full CNN input pipeline: load, pad, build the vocabulary, index.

    Returns `[x, y, vocabulary, vocabulary_inv]`.
    """
    tokenised, one_hot = load_data_and_labels_cnn(filename)
    padded = pad_sentences_cnn(tokenised)
    word_to_index, index_to_word = build_vocab_cnn(padded)
    features, targets = build_input_data_cnn(padded, one_hot, word_to_index)
    return [features, targets, word_to_index, index_to_word]

Load the data for BERT

In [0]:
def load_data_and_labels_BERT(filename):
    """Read the sentiment CSV and return cleaned sentences with integer labels.

    The CSV must provide the columns ``polarity`` and ``statutnull`` (the
    text). A row is labelled ``pos`` when its polarity equals 4 and ``neg``
    otherwise. The comparison is numeric, so it works whether pandas parsed
    the column as integers, floats or strings; comparing against the string
    ``'4'`` silently labelled every row ``neg`` for a numeric column.

    Text is lower-cased and passed through ``clean_str``, which already
    replaces every character outside its whitelist with a space, so no
    separate regex pass is needed here. Only sentences with more than 4 and
    fewer than 100 space-separated tokens are kept.

    Args:
        filename: path (or buffer) accepted by ``pandas.read_csv``.

    Returns:
        ``[sentences, labels, le]`` where ``sentences`` is a list of cleaned
        strings, ``labels`` the matching encoded classes and ``le`` the
        fitted ``LabelEncoder``.
    """
    try:
        # 'warn' mirrors the old error_bad_lines=False behaviour: skip
        # malformed rows but still report them.
        df = pd.read_csv(filename, on_bad_lines='warn', na_values=" ")
    except TypeError:
        # Older pandas releases do not know ``on_bad_lines``.
        df = pd.read_csv(filename, error_bad_lines=False, na_values=" ")

    # Missing text becomes the literal string 'nan', as fillna('nan') did before.
    texts = df['statutnull'].fillna('nan').astype(str).str.lower()

    # Unparseable or missing polarities become NaN and therefore 'neg'.
    is_positive = pd.to_numeric(df['polarity'], errors='coerce').eq(4)
    sentiment = is_positive.map({True: 'pos', False: 'neg'})

    le = LabelEncoder()
    # Fit on both class names so the mapping does not shift when a file
    # happens to contain a single class.
    le.fit(['neg', 'pos'])
    encoded = le.transform(sentiment.tolist())

    xnew = []
    ynew = []
    for text, label in zip(texts, encoded):
        cleaned = clean_str(text)
        if 4 < len(cleaned.split(" ")) < 100:
            xnew.append(cleaned)
            ynew.append(label)
    return [xnew, ynew, le]

def load_data_BERT(filename):
    """Load the BERT inputs for `filename`.

    Thin wrapper around `load_data_and_labels_BERT`; no padding or
    vocabulary step is applied here.

    Returns `[sentences, labels, le]`.
    """
    loaded = load_data_and_labels_BERT(filename)
    texts, targets, encoder = loaded
    return [texts, targets, encoder]

Install PyDrive and authenticate with Google Drive to fetch the sample input

In [52]:
!pip install PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate the Colab user, then hand the resulting application-default
# credentials to PyDrive so `drive` can be used to access Drive files.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)



In [53]:
print('Loading data')
# Reference the Drive file by its ID (the same ID as in the dataset link in
# the introduction) and save its content locally as example.csv.
downloaded = drive.CreateFile({'id':"1IyxGimLEytKoAIkbr0wdl_bTdGCuz7uH"}) 
downloaded.GetContentFile('example.csv')
# Local path handed to the data loaders in the following cells.
path2input='example.csv'

Loading data


Get the input for CNN and run the example

In [57]:
print (path2input)
# X: padded word-index matrix, Y: one-hot labels, plus both vocabulary mappings.
X,Y, vocabulary, vocabulary_inv = load_data_cnn(path2input)
# Hold out 20% of the rows; the fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.2, random_state = 42)

example.csv


In [58]:
# Input geometry, derived from the padded index matrix and the vocabulary.
sequence_length = X.shape[1]
vocabulary_size = len(vocabulary_inv)
# Width of each word embedding vector.
embedding_dim = 300
# Heights (in words) of the convolution kernels, one branch per entry.
filter_sizes = [1,2,3,4,5,6]
# Number of filters in every convolution branch.
num_filters = 512
# Dropout rate applied before the output layer.
drop = 0.5

# Training schedule.
epochs = 20
batch_size = 30

print("Creating Model...")
inputs = Input(shape=(sequence_length,), dtype='int32')
embedding = Embedding(input_dim=vocabulary_size, output_dim=embedding_dim, input_length=sequence_length)(inputs)
# Add a trailing channel axis so each sentence becomes a single-channel
# (sequence_length x embedding_dim) "image" that Conv2D can consume.
reshape = Reshape((sequence_length,embedding_dim,1))(embedding)

# One convolution + max-pooling branch per kernel height in `filter_sizes`.
# Driving this from the list keeps the model in sync when `filter_sizes`
# is edited, instead of requiring a hand-written pair of layers per entry.
pooled_outputs = []
for filter_size in filter_sizes:
    # The kernel spans the full embedding width, so it slides over words only.
    conv = Conv2D(num_filters, kernel_size=(filter_size, embedding_dim), padding='valid', kernel_initializer='normal', activation='relu')(reshape)
    # With 'valid' padding the convolution yields sequence_length - filter_size + 1
    # positions; pooling over all of them keeps one value per filter.
    pooled = MaxPool2D(pool_size=(sequence_length - filter_size + 1, 1), strides=(1,1), padding='valid')(conv)
    pooled_outputs.append(pooled)

# Concatenate needs at least two inputs; with a single branch use it directly.
if len(pooled_outputs) > 1:
    concatenated_tensor = Concatenate(axis=1)(pooled_outputs)
else:
    concatenated_tensor = pooled_outputs[0]
flatten = Flatten()(concatenated_tensor)
dropout = Dropout(drop)(flatten)
# Two softmax units, matching the two-column one-hot labels.
output = Dense(2, activation='softmax')(dropout)

model = Model(inputs=inputs, outputs=output)

# Save a weights file whenever validation accuracy improves. The file name
# template and `monitor` both read the log key 'val_acc', so the metric is
# requested below under the short name 'acc'. Newer Keras releases log
# metrics=['accuracy'] as 'val_accuracy', which would leave 'val_acc' undefined.
checkpoint = ModelCheckpoint('weights.{epoch:03d}-{val_acc:.4f}.hdf5', monitor='val_acc', verbose=1, save_best_only=True, mode='auto')
# NOTE(review): newer Keras releases rename `lr` to `learning_rate`; the old
# spelling is kept to match the Keras version this notebook was run with.
adam = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)

model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['acc'])

print("Traning Model...")
# NOTE(review): the held-out test split doubles as validation data here, so
# the checkpointed "best" model is selected on the same data it is judged on.
model.fit(X_train, Y_train, batch_size=batch_size, epochs=epochs, verbose=1, callbacks=[checkpoint], validation_data=(X_test, Y_test))  # starts training


Creating Model...
Traning Model...
Train on 3247 samples, validate on 812 samples
Epoch 1/20

Epoch 00001: val_acc improved from -inf to 0.62685, saving model to weights.001-0.6268.hdf5
Epoch 2/20

Epoch 00002: val_acc improved from 0.62685 to 0.66749, saving model to weights.002-0.6675.hdf5
Epoch 3/20

Epoch 00003: val_acc improved from 0.66749 to 0.67365, saving model to weights.003-0.6736.hdf5
Epoch 4/20

Epoch 00004: val_acc improved from 0.67365 to 0.67488, saving model to weights.004-0.6749.hdf5
Epoch 5/20

Epoch 00005: val_acc improved from 0.67488 to 0.68719, saving model to weights.005-0.6872.hdf5
Epoch 6/20

Epoch 00006: val_acc improved from 0.68719 to 0.69951, saving model to weights.006-0.6995.hdf5
Epoch 7/20

Epoch 00007: val_acc improved from 0.69951 to 0.71921, saving model to weights.007-0.7192.hdf5
Epoch 8/20

Epoch 00008: val_acc did not improve from 0.71921
Epoch 9/20

Epoch 00009: val_acc did not improve from 0.71921
Epoch 10/20

Epoch 00010: val_acc did not improv

<keras.callbacks.History at 0x7f7c61c2b828>

Get the data for BERT and run the example

In [93]:
print (path2input)
# Rebinds X and Y (previously the CNN inputs): X is now a list of cleaned
# sentences, Y the encoded labels, and `le` the fitted label encoder.
X,Y,le= load_data_BERT(path2input)
# Same 80/20 split parameters as the CNN section.
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.2, random_state = 42)

example.csv


Data and model preparation

In [0]:
# Wrap the splits in DataFrames with 'text' and 'labels' columns for the
# ClassificationModel calls below.
train_df = pd.DataFrame({'text':X_train,'labels':Y_train})
eval_df = pd.DataFrame({'text':X_test,'labels':Y_test})

# Create a ClassificationModel
# The two commented-out lines are alternative checkpoints (DistilBERT
# multilingual, CamemBERT) kept for reference.
# NOTE: this rebinds `model`, which held the Keras CNN in the previous section.
#model = ClassificationModel('distilbert', 'distilbert-base-multilingual-cased', num_labels=2, args={'reprocess_input_data': True, 'overwrite_output_dir': True,'fp16': False,'fp16': False,"train_batch_size": 8,"num_train_epochs": 5})
#model = ClassificationModel('camembert', 'camembert-base', num_labels=2, args={'reprocess_input_data': True, 'overwrite_output_dir': True,'num_train_epochs': 3,'fp16': False,"train_batch_size": 8})
model = ClassificationModel('bert', 'bert-base-multilingual-uncased', num_labels=2, args={'reprocess_input_data': True, 'overwrite_output_dir': True,'num_train_epochs': 3,'fp16': False,"train_batch_size": 8})



In [109]:
# Train the model
# Pass the evaluation frame by keyword: in the documented signature the second
# positional parameter of train_model is `multi_label`, not `eval_df`.
# NOTE(review): confirm against the installed Simple Transformers version.
model.train_model(train_df, eval_df=eval_df)

Converting to features started. Cache is not used.


HBox(children=(IntProgress(value=0, max=3247), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch', max=3, style=ProgressStyle(description_width='initial…

HBox(children=(IntProgress(value=0, description='Current iteration', max=406, style=ProgressStyle(description_…

Running loss: 0.674116



Running loss: 0.784267

HBox(children=(IntProgress(value=0, description='Current iteration', max=406, style=ProgressStyle(description_…

Running loss: 0.650750

HBox(children=(IntProgress(value=0, description='Current iteration', max=406, style=ProgressStyle(description_…

Running loss: 0.425696Training of bert model complete. Saved to outputs/.


The result for MBERT

In [110]:
# Evaluate on the held-out frame. The extra keyword passes scikit-learn's
# accuracy_score as an additional metric, which appears under 'acc' in the
# result shown below.
result, model_outputs, wrong_predictions = model.eval_model(eval_df, acc=sklearn.metrics.accuracy_score)

Converting to features started. Cache is not used.


HBox(children=(IntProgress(value=0, max=812), HTML(value='')))

HBox(children=(IntProgress(value=0, max=102), HTML(value='')))

{'mcc': 0.4670797631240588, 'tp': 272, 'tn': 324, 'fp': 96, 'fn': 120, 'acc': 0.7339901477832512, 'eval_loss': 0.5925741793186057}
