# Check the availbility of GPU

In [1]:
import tensorflow as tf
print(len(tf.config.list_physical_devices('GPU')))
import torch
print(torch.cuda.is_available())

1
True


# Download datasets

In [2]:
!gdown --id 1SRB7w6x_6oVUOzJihlYA5T2VR8u0UJyd
!gdown --id 1zs91kg3MO6FNkmtHFo1bqOF2Iy1F1b4y
!gdown --id 165kzfZDsRTZAAfZKedeZiUlKzMcHNgPd

Downloading...
From: https://drive.google.com/uc?id=1SRB7w6x_6oVUOzJihlYA5T2VR8u0UJyd
To: /content/Twitter_train.csv
100% 261k/261k [00:00<00:00, 122MB/s]
Downloading...
From: https://drive.google.com/uc?id=1zs91kg3MO6FNkmtHFo1bqOF2Iy1F1b4y
To: /content/Twitter_test.csv
100% 84.4k/84.4k [00:00<00:00, 84.1MB/s]
Downloading...
From: https://drive.google.com/uc?id=165kzfZDsRTZAAfZKedeZiUlKzMcHNgPd
To: /content/Arabic_stop_words.txt
100% 6.48k/6.48k [00:00<00:00, 8.89MB/s]


In [3]:
!pip install pyarabic

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyarabic
  Downloading PyArabic-0.6.15-py3-none-any.whl (126 kB)
[K     |████████████████████████████████| 126 kB 11.0 MB/s 
Installing collected packages: pyarabic
Successfully installed pyarabic-0.6.15


In [4]:
import pyarabic.araby as ar

# import Stemmer
import functools, operator

import logging

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)

## Load Dataset

In [5]:
import pandas as pd
df = pd.read_csv("Twitter_train.csv")
seed=42

## Arabic stop words

In [6]:
arabic_stop_words=[]
with open ('Arabic_stop_words.txt',encoding='utf-8') as f :
  for i in f.readlines() :
    arabic_stop_words.append(i)
    arabic_stop_words[-1]=arabic_stop_words[-1][:-1]


In [7]:
!pip install farasapy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting farasapy
  Downloading farasapy-0.0.14-py3-none-any.whl (11 kB)
Installing collected packages: farasapy
Successfully installed farasapy-0.0.14


In [8]:
import numpy as np
import pandas as pd
import re

#============= Read CSV and apply data preperation =============#


def data_preprocessing (data_frame):
  # clean-up: remove #tags, http links and special symbols
  data_frame['tweet']= data_frame['tweet'].apply(lambda x: x[2:-2])
  data_frame['tweet']= data_frame['tweet'].apply(lambda x: re.sub(r'http\S+', '', x))
  data_frame['tweet'] = data_frame['tweet'].apply(lambda x: re.sub(r'[@|#]\S*', '', x))
  data_frame['tweet'] = data_frame['tweet'].apply(lambda x: re.sub(r'"+', '', x))

  # Remove arabic signs
  data_frame['tweet'] = data_frame['tweet'].apply(lambda x: re.sub(r'([@A-Za-z0-9_ـــــــــــــ]+)|[^\w\s]|#|http\S+', '', x))

  # Remove repeated letters like "الللللللللللللللله" to "الله"
  data_frame['tweet'] = data_frame['tweet'].apply(lambda x: x[0:2] + ''.join([x[i] for i in range(2, len(x)) if x[i]!=x[i-1] or x[i]!=x[i-2]]))

  # remove stop words
  data_frame['tweet'] = data_frame['tweet'].apply(lambda x: '' if x in arabic_stop_words else x)

  from nltk.stem.isri import ISRIStemmer
  df['tweet']=df['tweet'].apply(lambda x:ISRIStemmer().stem(x))

  return data_frame


In [None]:
!pip install emoji

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# st =  Stemmer.Stemmer('arabic')
import string,emoji
def data_cleaning (text):
  text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
  text = re.sub(r'^http?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
  text = re.sub(r"http\S+", "", text)
  text = re.sub(r"https\S+", "", text)
  text = re.sub(r'\s+', ' ', text)
  text = re.sub("(\s\d+)","",text)
  text = re.sub(r"$\d+\W+|\b\d+\b|\W+\d+$", "", text)
  text = re.sub("\d+", " ", text)
  text = ar.strip_tashkeel(text)
  text = ar.strip_tatweel(text)
  text = text.replace("#", " ");
  text = text.replace("@", " ");
  text = text.replace("_", " ");
  translator = str.maketrans('', '', string.punctuation)
  text = text.translate(translator)
  em = text
  em_split_emoji = emoji.get_emoji_regexp().split(em)
  em_split_whitespace = [substr.split() for substr in em_split_emoji]
  em_split = functools.reduce(operator.concat, em_split_whitespace)
  text = " ".join(em_split)
  text = re.sub(r'(.)\1+', r'\1', text)
  # text_stem = " ".join([st.stemWord(i) for i in text.split()])
  # text = text +" "+ text_stem
  text = text.replace("آ", "ا")
  text = text.replace("إ", "ا")
  text = text.replace("أ", "ا")
  text = text.replace("ؤ", "و")
  text = text.replace("ئ", "ي")

  return text

In [None]:
df['tweet']=df['tweet'].apply(lambda x: data_cleaning(x))
df=data_preprocessing(df)
df



Unnamed: 0,tweet,class
0,ان الذين يعيشون على الارض ليسوا ملايكة بل بشر ...,pos
1,كل سنة وانتم طيبين,pos
2,و انتهى مشوار الخواجة,neg
3,مش عارف ابتدى مذاكره منين,neg
4,اختصروا الطريق بدلا من اختيار المنصف ثم الانقل...,neg
...,...,...
2054,الجمال مبيحتاح اي مكياج لناعم وله خشن جمل الطا...,neu
2055,نتمني وجود الفنانة رنا سماحة افضل فنانة صاعدة ...,neu
2056,ولد الهدى فالكاينات ضياء وفم الزمان تبسم وسناء...,pos
2057,انت متناقض جدا يا صلاح,neg


In [None]:
!git clone https://github.com/aub-mind/arabert.git

fatal: destination path 'arabert' already exists and is not an empty directory.


In [None]:
from arabert.preprocess import ArabertPreprocessor

model_name = "aubmindlab/bert-large-arabertv02-twitter"
arabert_prep = ArabertPreprocessor(model_name=model_name)

df['tweet']=df['tweet'].apply(lambda x: arabert_prep.preprocess(x))


# text = "ولن نبالغ إذا قلنا: إن 'هاتف' أو 'كمبيوتر المكتب' في زمننا هذا ضروري"
# arabert_prep.preprocess(text)
# # "و+ لن نبالغ إذا قل +نا : إن ' هاتف ' أو ' كمبيوتر ال+ مكتب ' في زمن +نا هذا ضروري"

## Label Encoder

In [None]:
from sklearn import preprocessing
# Apply label encoding over the labels
lable_encoder = preprocessing.LabelEncoder()
encoded_labels =lable_encoder.fit_transform(df["class"])
df['class']=encoded_labels
df

Unnamed: 0,tweet,class
0,ان الذين يعيشون على الارض ليسوا ملايكة بل بشر ...,2
1,كل سنة وانتم طيبين,2
2,و انتهى مشوار الخواجة,0
3,مش عارف ابتدى مذاكره منين,0
4,اختصروا الطريق بدلا من اختيار المنصف ثم الانقل...,0
...,...,...
2054,الجمال مبيحتاح اي مكياج لناعم وله خشن جمل الطا...,1
2055,نتمني وجود الفنانة رنا سماحة افضل فنانة صاعدة ...,1
2056,ولد الهدى فالكاينات ضياء وفم الزمان تبسم وسناء...,2
2057,انت متناقض جدا يا صلاح,0


In [None]:
df['length']=df['tweet'].apply(lambda x:len(x.split(' ')))
df['length'].max()

39

## Train Test Split

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_validation, y_train, y_validation=train_test_split(df['tweet'], df['class'], test_size=0.2, random_state=seed)
X_validation

1298    معناها مش المعني الظاهر معناها نفسها في الجنة ...
591                                                     ا
1318                                            هاكل اخير
1067                                لميس الحديدي تودع عبر
29                            انا نهي سنفورة القهوة اديرو
                              ...                        
1033    صباح اورد من احمد انا بحب سكس انا بحب الزمالك ...
674     حياة بالقرب من اله حياة مطمينة محفوفة بالتوفيق...
1771                         عليش نسال فيكن معناها ي بصله
322                                         شارع الجاردنز
1299                          كان نفسي اتولد مخلص ه تعليم
Name: tweet, Length: 412, dtype: object

# Trying some machine learning models

## TF_IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
def tfidf_ngram(n_gram,X_train,X_val):
    vectorizer = TfidfVectorizer(ngram_range=(n_gram,n_gram))
    x_train_vec = vectorizer.fit_transform(X_train)
    x_test_vec = vectorizer.transform(X_val)
    return x_train_vec,x_test_vec

In [None]:
# Applying tfidf with 1-gram, 2-gram and 3-gram
tfidf_1g_transformation_train,tfidf_1g_transformation_validation= tfidf_ngram(1,X_train,X_validation)
tfidf_2g_transformation_train,tfidf_2g_transformation_validation= tfidf_ngram(2,X_train,X_validation)

## Machine learning models

In [None]:
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

models=[SVC(),XGBClassifier(),RandomForestClassifier(),DecisionTreeClassifier(),LogisticRegression()]
for m in models :
    m.fit(tfidf_2g_transformation_train,y_train)
    print(m.score(tfidf_2g_transformation_train,y_train))
    print(m.score(tfidf_2g_transformation_validation,y_validation))

0.9848208864602307
0.3592233009708738
0.44383727990285365
0.34951456310679613
0.9848208864602307
0.3616504854368932
0.9848208864602307
0.3567961165048544
0.98421372191864
0.3786407766990291


# Trying to use some pre-trained models from hugging face website 

## Install transformers

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Model and Tokenizer initialization

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

#============= Initialize Arabic Bert =============#
#load your pre_trained model with all its weights
# model_name= 'aubmindlab/bert-base-arabertv02'
model_name='UBC-NLP/MARBERT' #top
# model_name='asafaya/bert-base-arabic'
# model_name='AraBERTv0.2-Twitter-base'
# model_name='aubmindlab/bert-large-arabertv2'
# model_name='aubmindlab/bert-base-arabertv02-twitter'
# model_name='aubmindlab/bert-large-arabertv02-twitter'
# model_name='aubmindlab/aragpt2-base'

# model_name='aubmindlab/bert-base-arabertv2'
tokenizer =AutoTokenizer.from_pretrained(model_name)

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)
# model=AutoModel.from_pretrained(model_name,output_hidden_states=True)

Some weights of the model checkpoint at UBC-NLP/MARBERT were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at U

You can uncomment ay of the other models to get differnet accuraces

In [None]:
# Tokenize the sentences using bert tokenizer
df["bert_tokens"] = df.tweet.apply(lambda x: tokenizer(x).tokens())
df["bert_tokens_ids"] = df.tweet.apply(lambda x: tokenizer(x).tokens())
df["encoded"] = df.tweet.apply(lambda x: tokenizer.encode_plus(x,return_tensors='pt')['input_ids'])
df

Unnamed: 0,tweet,class,length,bert_tokens,bert_tokens_ids,encoded
0,ان الذين يعيشون على الارض ليسوا ملايكة بل بشر ...,2,25,"[[CLS], ان, الذين, يعيشون, على, الارض, ليسوا, ...","[[CLS], ان, الذين, يعيشون, على, الارض, ليسوا, ...","[[tensor(2), tensor(1946), tensor(2468), tenso..."
1,كل سنة وانتم طيبين,2,4,"[[CLS], كل, سنة, وانتم, طيبين, [SEP]]","[[CLS], كل, سنة, وانتم, طيبين, [SEP]]","[[tensor(2), tensor(2009), tensor(3171), tenso..."
2,و انتهى مشوار الخواجة,0,4,"[[CLS], و, انتهى, مشوار, الخواجة, [SEP]]","[[CLS], و, انتهى, مشوار, الخواجة, [SEP]]","[[tensor(2), tensor(144), tensor(7609), tensor..."
3,مش عارف ابتدى مذاكره منين,0,5,"[[CLS], مش, عارف, ابتدى, مذاكره, منين, [SEP]]","[[CLS], مش, عارف, ابتدى, مذاكره, منين, [SEP]]","[[tensor(2), tensor(2093), tensor(3323), tenso..."
4,اختصروا الطريق بدلا من اختيار المنصف ثم الانقل...,0,20,"[[CLS], اختصر, ##وا, الطريق, بدلا, من, اختيار,...","[[CLS], اختصر, ##وا, الطريق, بدلا, من, اختيار,...","[[tensor(2), tensor(22181), tensor(1958), tens..."
...,...,...,...,...,...,...
2054,الجمال مبيحتاح اي مكياج لناعم وله خشن جمل الطا...,1,10,"[[CLS], الجمال, مبيح, ##تاح, اي, مكياج, لنا, #...","[[CLS], الجمال, مبيح, ##تاح, اي, مكياج, لنا, #...","[[tensor(2), tensor(4770), tensor(68899), tens..."
2055,نتمني وجود الفنانة رنا سماحة افضل فنانة صاعدة ...,1,11,"[[CLS], نتمني, وجود, الفنانة, رنا, سماحة, افضل...","[[CLS], نتمني, وجود, الفنانة, رنا, سماحة, افضل...","[[tensor(2), tensor(39939), tensor(3715), tens..."
2056,ولد الهدى فالكاينات ضياء وفم الزمان تبسم وسناء...,2,16,"[[CLS], ولد, الهدى, فالك, ##اينات, ضياء, وف, #...","[[CLS], ولد, الهدى, فالك, ##اينات, ضياء, وف, #...","[[tensor(2), tensor(3735), tensor(4880), tenso..."
2057,انت متناقض جدا يا صلاح,0,5,"[[CLS], انت, متناقض, جدا, يا, صلاح, [SEP]]","[[CLS], انت, متناقض, جدا, يا, صلاح, [SEP]]","[[tensor(2), tensor(2030), tensor(27008), tens..."


## Padding and attention mask

In [None]:
from keras_preprocessing.sequence import pad_sequences

# Set the maximum sequence length. The longest sequence in our training set is 47, but we'll leave room on the end anyway.
# In the original paper, the authors used a length of 512.
MAX_LEN = 64
# Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in df['bert_tokens']]
# Pad our input tokens
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
# Create attention masks
attention_masks = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
  seq_mask = [float(i>0) for i in seq]
  attention_masks.append(seq_mask)

In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split

# Use train_test_split to split our data into train and validation sets for training
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, encoded_labels,
                                                            random_state=seed, test_size=0.1)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=seed, test_size=0.1)
# Convert all of our data into torch tensors, the required datatype for our model

train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)
# Select a batch size for training. For fine-tuning BERT on a specific task, the authors recommend a batch size of 16 or 32
batch_size = 64

# Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for loop,
# with an iterator the entire dataset does not need to be loaded into memory

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_dataloader = DataLoader(train_data, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_dataloader = DataLoader(validation_data, batch_size=batch_size)

## Set optimizer parameters

In [None]:
import torch.optim as optim

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],'weight_decay_rate': 0.01},
                                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],'weight_decay_rate': 0.0}]
# This variable contains all of the hyperparemeter information our training loop needs
# optimizer = optim.BertAdam(optimizer_grouped_parameters,lr=2e-5,warmup=.1)
# optimizer = optim.AdamW(optimizer_grouped_parameters,lr=5e-6)
optimizer = optim.AdamW(optimizer_grouped_parameters,lr=.00001)

# Training

In [None]:
from tqdm import trange
import numpy as np
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)
t = []

# Store our loss and accuracy for plotting

train_loss_set = []

# Number of training epochs
epochs = 11

# Transfer the model to GPU
model.to("cuda")

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):


  # Training

  # Set our model to training mode (as opposed to evaluation mode)
  model.train()

  # Tracking variables
  tr_loss = 0
  nb_tr_examples, nb_tr_steps = 0, 0

  # Train the data for one epoch
  for step, batch in enumerate(train_dataloader):
    # Add batch to GPU
    b_input_ids, b_input_mask, b_labels = batch
    b_labels = b_labels.type(torch.LongTensor)   # casting to long
    # Clear out the gradients (by default they accumulate)
    optimizer.zero_grad()

    # Forward pass
    loss = model(b_input_ids.to("cuda"), token_type_ids=None, attention_mask=b_input_mask.to("cuda"), labels=b_labels.to("cuda"))["loss"]
    train_loss_set.append(loss.item())

    # Backward pass
    loss.backward()

    # Update parameters and take a step using the computed gradient
    optimizer.step()


    # Update tracking variables
    tr_loss += loss.item()
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1

  print("Train loss: {}".format(tr_loss/nb_tr_steps))

  # Validation

  # Put model in evaluation mode to evaluate loss on the validation set
  model.eval()

  # Tracking variables
  eval_loss, eval_accuracy = 0, 0
  nb_eval_steps, nb_eval_examples = 0, 0

  # Evaluate data for one epoch
  for batch in validation_dataloader:
    # Add batch to GPU
    # batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    b_labels = b_labels.type(torch.LongTensor)   # casting to long
    # Telling the model not to compute or store gradients, saving memory and speeding up validation
    with torch.no_grad():
      # Forward pass, calculate logit predictions
      logits = model(b_input_ids.to("cuda"), token_type_ids=None, attention_mask=b_input_mask.to("cuda"))

    # Move logits and labels to CPU
    logits = logits["logits"].detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    tmp_eval_accuracy = flat_accuracy(logits, label_ids)

    eval_accuracy += tmp_eval_accuracy
    nb_eval_steps += 1

  print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
  if (eval_accuracy/nb_eval_steps) > 0.77 :
    break

Epoch:   0%|          | 0/11 [00:00<?, ?it/s]

Train loss: 1.0185067386462772


Epoch:   9%|▉         | 1/11 [00:20<03:21, 20.18s/it]

Validation Accuracy: 0.6456473214285714
Train loss: 0.6970230464277596


Epoch:  18%|█▊        | 2/11 [00:39<02:59, 19.95s/it]

Validation Accuracy: 0.7589285714285714
Train loss: 0.45289019777857026


Epoch:  27%|██▋       | 3/11 [00:59<02:37, 19.67s/it]

Validation Accuracy: 0.7254464285714286
Train loss: 0.302816536919824


Epoch:  36%|███▋      | 4/11 [01:18<02:16, 19.52s/it]

Validation Accuracy: 0.7449776785714286
Train loss: 0.2339779290145841


Epoch:  45%|████▌     | 5/11 [01:38<01:57, 19.51s/it]

Validation Accuracy: 0.7215401785714286
Train loss: 0.1515888745157883


Epoch:  55%|█████▍    | 6/11 [01:57<01:37, 19.54s/it]

Validation Accuracy: 0.7511160714285714
Train loss: 0.09625541007724302


Epoch:  64%|██████▎   | 7/11 [02:17<01:18, 19.52s/it]

Validation Accuracy: 0.7059151785714286
Train loss: 0.07377016281002555


Epoch:  73%|███████▎  | 8/11 [02:36<00:58, 19.48s/it]

Validation Accuracy: 0.7215401785714286
Train loss: 0.06426888524458327


Epoch:  82%|████████▏ | 9/11 [02:55<00:38, 19.46s/it]

Validation Accuracy: 0.7254464285714286
Train loss: 0.0689966644577939


Epoch:  91%|█████████ | 10/11 [03:15<00:19, 19.47s/it]

Validation Accuracy: 0.7667410714285714
Train loss: 0.0621599309017946


Epoch: 100%|██████████| 11/11 [03:34<00:00, 19.54s/it]

Validation Accuracy: 0.7511160714285714





# Prepare testset with the same preprocessing

In [None]:
#============= Read CSV and apply data preperation =============#
df_submit = pd.read_csv("Twitter_test.csv")

df_submit["tweet"] = df_submit.tweet.apply(lambda x: data_cleaning(x))
df_submit=data_preprocessing(df_submit)

df_submit['tweet']=df_submit['tweet'].apply(lambda x: arabert_prep.preprocess(x))

# Tokenize the sentences using bert tokenizer
df_submit["bert_tokens"] = df_submit.tweet.apply(lambda x: tokenizer(x).tokens())



In [None]:
bert_tokens_submit = df_submit["bert_tokens"]

In [None]:
# Set the maximum sequence length. The longest sequence in our training set is 47, but we'll leave room on the end anyway. 
# In the original paper, the authors used a length of 512.
MAX_LEN = 64
# Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
input_ids_submit = [tokenizer.convert_tokens_to_ids(x) for x in bert_tokens_submit]
# Pad our input tokens
input_ids_submit = pad_sequences(input_ids_submit, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
# Create attention masks
attention_masks_submit = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids_submit:
  seq_mask = [float(i>0) for i in seq]
  attention_masks_submit.append(seq_mask)

In [None]:
# Convert all of our data into torch tensors, the required datatype for our model
inputs_submit = torch.tensor(input_ids_submit)
masks_submit = torch.tensor(attention_masks_submit)

# Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for loop, 
# with an iterator the entire dataset does not need to be loaded into memory
batch_size = 64
submit_data = TensorDataset(inputs_submit, masks_submit)

# do not use shuffle, we need the preds to be in same order
submit_dataloader = DataLoader(submit_data, batch_size=batch_size)#, shuffle=True)

In [None]:
# Put the model in an evaluation state
model.eval()

# Transfer model to GPU
model.to("cuda")

outputs = []
for input, masks in submit_dataloader:
  torch.cuda.empty_cache() # empty the gpu memory

  # Transfer the batch to gpu
  input = input.to('cuda')
  masks = masks.to('cuda')

  # Run inference on the batch
  output = model(input, attention_mask=masks)["logits"]

  # Transfer the output to CPU again and convert to numpy
  output = output.cpu().detach().numpy()

  # Store the output in a list
  outputs.append(output)

# Concatenate all the lists within the list into one list
outputs = [x for y in outputs for x in y]

# Inverse transform the label encoding
pred_flat = np.argmax(outputs, axis=1).flatten()
output_labels = lable_encoder.inverse_transform(pred_flat)

In [None]:
submission = pd.DataFrame({"Id":np.arange(1, len(output_labels)+1), "class":output_labels})
# save (submission)
submission.to_csv("submission30.csv", index=False)