# Predictions by fine-tuning transformer models

## 0. Imports and downloads

In [31]:
# install useful libraries if necessary
#!pip install transformers
#!pip install datasets
#!pip install evaluate


# import libraries

import numpy as np
import pandas as pd
import transformers 
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer 
from sklearn.model_selection import train_test_split
from datasets import Dataset
import evaluate

## 1. Pre-process the data into a convenient form

In [32]:
# load data
def read_tweets(path):
    """Read `path` and return one string per line, without the trailing newline."""
    # `with` closes the file handle once the lines are read. rstrip('\n') only
    # removes a newline, so a final line that lacks one keeps its last
    # character (slicing with [:-1] would have dropped it).
    with open(path) as tweet_file:
        return [line.rstrip('\n') for line in tweet_file]

train_neg = read_tweets('train_neg.txt')
train_pos = read_tweets('train_pos.txt')

# assemble the tweets and their labels (0 for train_neg.txt, 1 for train_pos.txt);
# the labels are sized from the data actually read so that X and y always line up,
# then everything is separated with a train/test split
X = train_neg + train_pos
y = [0]*len(train_neg) + [1]*len(train_pos)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# convert the splits into the list-of-records form read by the transformers library Trainer
dataset_train = Dataset.from_list([{'label': label, 'text': text} for label, text in zip(y_train, X_train)])
dataset_test = Dataset.from_list([{'label': label, 'text': text} for label, text in zip(y_test, X_test)])

In [None]:
# Hugging Face checkpoints we tried to fine-tune.
# Results differed between them; the best one is the line left uncommented.
# The results for every candidate are given in the report.

# The models : 
#MODEL = 'roberta-base'
#MODEL = 'distilbert-base-uncased-finetuned-sst-2-english'
#MODEL = 'bert-base-uncased'
#MODEL = 'ProsusAI/finbert'
MODEL = 'finiteautomata/bertweet-base-sentiment-analysis'

# Load the tokenizer and the pretrained sequence-classification model for MODEL.
# NOTE(review): this checkpoint presumably ships a 3-way (NEG/NEU/POS) head while
# the labels built in this notebook are binary (0/1) — confirm, and consider
# loading with num_labels=2 if a 2-way head is intended.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# Before training, the raw tweets have to be turned into model inputs.
# The function below does that for one batch of examples.
def tokenize_function(examples):
    """Tokenize the "text" field of `examples` with the module-level `tokenizer`.

    The tokenizer is called with `max_length` padding and truncation enabled,
    and its result is returned unchanged.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True, padding="max_length")

# Tokenize both splits in batches: the train split first, then the test split.
tokenized_dataset_train, tokenized_dataset_test = (
    split.map(tokenize_function, batched=True)
    for split in (dataset_train, dataset_test)
)

## 2. Build the model trainer

In [172]:
# Accuracy is the metric used to evaluate the performance of our classifier.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level accuracy `metric`.

    `eval_pred` unpacks into (logits, labels). The predicted class of each
    example is the index of its largest logit along the last axis. Returns
    whatever `metric.compute` returns for those predictions and labels.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    return metric.compute(references=gold, predictions=predicted)

In [181]:
# Build the Trainer used for fine-tuning.

# Training configuration: outputs are written under "test_trainer" and the
# evaluation strategy is set to "epoch".
# NOTE(review): newer transformers releases appear to rename this argument to
# `eval_strategy` — confirm against the installed version before upgrading.
training_args = TrainingArguments(
    output_dir="test_trainer",
    evaluation_strategy="epoch",
)

# Only a fixed-seed shuffled subset of each split is used:
# 5000 training examples and 500 evaluation examples.
train_subset = tokenized_dataset_train.shuffle(seed=6).select(range(5000))
eval_subset = tokenized_dataset_test.shuffle(seed=6).select(range(500))

# The Trainer object from the transformers library ties together the model,
# the configuration, the two subsets and the accuracy callback.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_subset,
    eval_dataset=eval_subset,
    compute_metrics=compute_metrics,
)

In [182]:
# Fine-tune the model. As the last expression of the cell, the value returned
# by trainer.train() is displayed as the cell output.
trainer.train()

The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 5000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1875
  Number of trainable parameters = 134902275


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3961,0.335579,0.866
2,0.243,0.494954,0.858
3,0.1403,0.600331,0.88


Saving model checkpoint to test_trainer/checkpoint-500
Configuration saved in test_trainer/checkpoint-500/config.json
Model weights saved in test_trainer/checkpoint-500/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 500
  Batch size = 8
Saving model checkpoint to test_trainer/checkpoint-1000
Configuration saved in test_trainer/checkpoint-1000/config.json
Model weights saved in test_trainer/checkpoint-1000/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *

TrainOutput(global_step=1875, training_loss=0.2257679463704427, metrics={'train_runtime': 436.3492, 'train_samples_per_second': 34.376, 'train_steps_per_second': 4.297, 'total_flos': 986675316480000.0, 'train_loss': 0.2257679463704427, 'epoch': 3.0})

## 3. Submit results to AIcrowd

In [183]:
# load test data and tokenize it
# `with` closes the file handle once the lines are read. rstrip('\n') only removes
# a newline, so a final line that lacks one keeps its last character.
with open('test_data.txt') as test_file:
    TEST = [tweet.rstrip('\n') for tweet in test_file]
# NOTE(review): each line is used verbatim as the tweet text; if the lines of
# test_data.txt carry an "<id>," prefix it should be stripped first — confirm.

# The Dataset needs a label column; these zeros are placeholders sized from the
# data actually read (the real labels of the test tweets are unknown here).
XX, yy = TEST, [0]*len(TEST)
DATASET = Dataset.from_list([{'label': label, 'text': text} for label, text in zip(yy, XX)])
TOK_DATASET = DATASET.map(tokenize_function, batched=True)

  0%|          | 0/10 [00:00<?, ?ba/s]

In [184]:
# Run the fine-tuned model on the tokenized test data.
pred = trainer.predict(TOK_DATASET)

# Only columns 0 and 1 of the predictions are read: a tweet gets label 1 when
# its column-1 score is strictly greater than its column-0 score, else label 0.
scores = pred.predictions
pred_label = (scores[:, 0] < scores[:, 1]).astype(int)

# Map the {0, 1} labels onto {-1, 1}.
classification = pred_label * 2 - 1

The following columns in the test set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 10000
  Batch size = 8


In [186]:
# We convert these predictions to a csv file to submit it to AIcrowd.
# Ids start at 1 and are sized from the predictions themselves, so the two
# columns always have the same length whatever the size of the test set.
DF = pd.DataFrame.from_dict({'Id': range(1, len(classification) + 1), 'Prediction': classification.tolist()})
DF.to_csv('submission_BERT.csv', index = False)