In [None]:
#https://towardsdatascience.com/multi-label-classification-using-bert-roberta-xlnet-xlm-and-distilbert-with-simple-transformers-b3e0cda12ce5

In [1]:
%%capture
!pip install simpletransformers
!pip install emoji

In [2]:
# import some necessary sub libraries
import pandas as pd
from simpletransformers.classification import ClassificationModel
from sklearn.metrics import f1_score, accuracy_score
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
import os
import tarfile
import re
from sklearn.metrics import classification_report
import numpy as np
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt     
#from google.colab import drive
#drive.mount('/content/drive')
from google.colab import files

def dict_to_df(d):
    df=pd.DataFrame(d.items())
    df.set_index(0, inplace=True)
    return df

def pack_model(model_path='',file_name=''):
  files = [files for root, dirs, files in os.walk(model_path)][0]
  with tarfile.open(file_name+ '.tar.gz', 'w:gz') as f:
    for file in files:
      f.add(f'{model_path}/{file}')

def unpack_model(model_name=''):
  tar = tarfile.open(f"{model_name}.tar.gz", "r:gz")
  tar.extractall()
  tar.close()

def prepare_results(result_dict):
  result_dict["f1_macro"] = result_dict["f1"]["f1_macro"]
  result_dict["f1_micro"] = result_dict["f1"]["f1_micro"]
  result_dict["f1_weighted"] = result_dict["f1"]["f1_weighted"]

  result_dict["precision_macro"] = result_dict["precision"]["precision_macro"]
  result_dict["precision_micro"] = result_dict["precision"]["precision_micro"]
  result_dict["precision_weighted"] = result_dict["precision"]["precision_weighted"]

  result_dict["recall_macro"] = result_dict["recall"]["recall_macro"]
  result_dict["recall_micro"] = result_dict["recall"]["recall_micro"]
  result_dict["recall_weighted"] = result_dict["recall"]["recall_weighted"]

  del result_dict["recall"]
  del result_dict["precision"]
  del result_dict["f1"]

  p_per_class = result_dict["prfs"][0]
  r_per_class = result_dict["prfs"][1]
  f_per_class = result_dict["prfs"][2]
  sample_per_class = result_dict["prfs"][3]

  i = 0
  for number in p_per_class:
    result_dict["precision_class_" + str(i)] = number
    i = i + 1
  
  i = 0
  for number in r_per_class:
    result_dict["recall_class_" + str(i)] = number
    i = i + 1
  
  i = 0
  for number in f_per_class:
    result_dict["f1_score_class_" + str(i)] = number
    i = i + 1
  
  i = 0
  for number in sample_per_class:
    result_dict["sample_class_" + str(i)] = number
    i = i + 1
  
  del result_dict["prfs"]

from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, precision_recall_fscore_support, balanced_accuracy_score

def f1_multiclass(labels, preds):
  f1_score_info = {
      "f1_micro": f1_score(labels, preds, average='micro'),
      "f1_macro": f1_score(labels, preds, average='macro'),
      "f1_weighted": f1_score(labels, preds, average='weighted')
  }
  return f1_score_info

def p_multiclass(labels, preds):
  precision_info = {
      "precision_micro": precision_score(labels, preds, average='micro'),
      "precision_macro": precision_score(labels, preds, average='macro'),
      "precision_weighted": precision_score(labels, preds, average='weighted')
  }
  return precision_info

def recall_multiclass(labels, preds):
  recall_info = {
      "recall_micro": recall_score(labels, preds, average='micro'),
      "recall_macro": recall_score(labels, preds, average='macro'),
      "recall_weighted": recall_score(labels, preds, average='weighted')
  }
  return recall_info

from tensorflow.python.client import device_lib
device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 17348303255548158969
 xla_global_id: -1, name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 13266321408
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 17485820582257226359
 physical_device_desc: "device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5"
 xla_global_id: 416903419]

In [3]:
# Read in data
train = pd.read_csv("tweets-train.csv")
test = pd.read_csv("tweets-test.csv")
val = pd.read_csv("tweets-valid.csv")

# convert labels: 0 = neutral, 1 = positive, 2 = negative
train["label"] = train["label"].apply(lambda x: 2 if x == -1 else (1 if x == 1 else 0))
test["label"] = test["label"].apply(lambda x: 2 if x == -1 else (1 if x == 1 else 0))
val["label"] = val["label"].apply(lambda x: 2 if x == -1 else (1 if x == 1 else 0))

# check for empty rows
train = train[train['label'].notna()]
test = test[test['label'].notna()]

train = train[["tweet", "label"]]
test = test[["tweet", "label"]]
val = val[["tweet", "label"]]

train.head()

Unnamed: 0,tweet,label
0,ज्येष्ठ पत्रकार अनंत दीक्षित यांच्या निधनाचे...,2
1,सर्वोच्च न्यायालयाचे निर्देश डावलून पुणे पोल...,2
2,उद्धव ठाकरेंनी भाजपासोबत युती करून शिवसैनिका...,2
3,आपला समाज खूप मोठा आहे. त्यात अनेक घटक अंतर्...,1
4,बलात्काराचा बदला बलात्काराने घेतला पाहिजे हे म...,2


In [None]:
print(len(train))
print(len(test))
print(len(val))

12114
2250
1500


In [4]:
# Concat the Validation- and Train-Set 
train = train.append(val)
len(train)

13614

In [5]:
corona_cleaned = pd.read_csv("Corona_NLP_train_translated_and_cleaned.csv")
corona_cleaned = corona_cleaned[["translated", "Sentiment"]]
corona_cleaned['Sentiment'] = corona_cleaned['Sentiment'].map({'Extremely Negative':2,"Negative":2,"Neutral":0,"Positive":1,'Extremely Positive':1})
corona_cleaned.rename(columns = {'translated':'tweet', 'Sentiment':'label'}, inplace = True)
corona_cleaned.head()

Unnamed: 0,tweet,label
0,कोविड 19 साथीच्या रोगामुळे प्रभावित लोक आणि सम...,1.0
1,कोरोनाव्हायरस ईकॉमर्सच्या मर्यादांमुळे स्टॉकिं...,0.0
2,एमएफएस टॉयलेट पेपर अत्यंत किमतीत विकत आहेत ते ...,2.0
3,सुपरमार्केट पेक्षा जास्त स्वच्छता जोडणे आवडते ...,1.0
4,तुम्ही घाबरून अन्न आणि पुरवठा खरेदी करण्याआधी ...,2.0


In [6]:
corona_cleaned["label"].value_counts()

1.0    17999
2.0    15364
0.0     7558
Name: label, dtype: int64

In [7]:
train = train.append(corona_cleaned)
len(train)

54539

In [8]:
data = train

In [9]:
data.label.value_counts()

1.0    22537
2.0    19902
0.0    12096
Name: label, dtype: int64

In [10]:
# set the Hyperparameters for the model

from sklearn.model_selection import KFold
import torch
cuda_available = torch.cuda.is_available()

#models to use:
#bert-base-multilingual-cased
#ai4bharat/indic-bert (albert)
#xlm-roberta-base
#l3cube-pune/marathi-bert
#l3cube-pune/marathi-albert-v2
#l3cube-pune/marathi-roberta

model_name = "bert-base-multilingual-cased"
model_type = "bert"
label_count = 3

# nun zum Modell
# Hyperparameter
train_args ={"reprocess_input_data": True,
             "learning_rate": 2e-5,
             #"evaluate_during_training": True, #in this case not, as we use Kfold Cross Validation
             "num_train_epochs": 3,
             "overwrite_output_dir":True,
             "train_batch_size": 16,
             "eval_batch_size": 32,
             #"max_seq_length": 128,
             #"manual_seed": 42 # don't need this here as the random state is set with the Kfold Cross Validation
             }

In [None]:
# Set the names for creating and downloading the files
ansatz = "41K"
trained_model = "mBERT"

In [None]:
# prepare cross validation
n=5
target_names = ['0, Neutral', '1, Positive', '2, Negative']
kf = KFold(n_splits=n, random_state=1, shuffle=True)

index_count = 0

for train_index, val_index in kf.split(data):
  print(F"Loop Number: {index_count+1}")
  index_count += 1
    # splitting Dataframe (dataset not included)
  train_df = data.iloc[train_index]
  val_df = data.iloc[val_index]
    # Defining Model
  model = ClassificationModel(model_type, model_name, num_labels=label_count,args=train_args,use_cuda=cuda_available)
    # train the model
  model.train_model(train_df)
    # validate the model and save results
  result, model_outputs, wrong_predictions = model.eval_model(test, acc=accuracy_score, recall=recall_multiclass, precision=p_multiclass, bal_acc=balanced_accuracy_score, f1=f1_multiclass, prfs=precision_recall_fscore_support)
  prepare_results(result)
  result_df = dict_to_df(result)
  result_df = result_df.sort_values(0)
  result_df.to_csv(F"kfold_result_{trained_model}_{ansatz}{index_count}.csv")
  
    # Print Classification Report and create Confusion Matrix
  print(classification_report(np.argmax(model_outputs, axis = 1), test.label.values,target_names=target_names))
  cm = confusion_matrix(np.argmax(model_outputs, axis = 1), test.label.values)
  ax = plt.subplot()
  cm_plot = sns.heatmap(cm, annot=True, fmt='g', ax=ax,cmap='Greens');
  # labels, title and ticks
  ax.set_xlabel('Predicted labels');ax.set_ylabel('True labels'); 
  ax.set_title(F'Confusion Matrix of Run {index_count}'); 
  ax.xaxis.set_ticklabels(['Neutral', 'Positive', 'Negative']); ax.yaxis.set_ticklabels(['Neutral', 'Positive', 'Negative']);
  cm_plot.figure.savefig(F"confusion_matrix_{trained_model}_{ansatz}{index_count}.png")

Loop Number: 1


Downloading:   0%|          | 0.00/718 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04G [00:00<?, ?B/s]

Some weights of the model checkpoint at l3cube-pune/marathi-roberta were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at l3cube-pune/marathi-roberta and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weigh

Downloading:   0%|          | 0.00/384 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/16.3M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

  f"use_multiprocessing automatically disabled as {model_type}"
  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/681 [00:00<?, ?it/s]

In [None]:
import time
# sleep to allow to download more files at once

In [None]:
# download all result files
files.download(F"kfold_result_{trained_model}_{ansatz}1.csv")

In [None]:
time.sleep(5)

In [None]:
files.download(F"kfold_result_{trained_model}_{ansatz}2.csv")

In [None]:
time.sleep(5)

In [None]:
files.download(F"kfold_result_{trained_model}_{ansatz}3.csv")

In [None]:
time.sleep(5)

In [None]:
files.download(F"kfold_result_{trained_model}_{ansatz}4.csv")

In [None]:
time.sleep(5)

In [None]:
files.download(F"kfold_result_{trained_model}_{ansatz}5.csv")

In [None]:
file1 = pd.read_csv(F"kfold_result_{trained_model}_{ansatz}1.csv")
file2 = pd.read_csv(F"kfold_result_{trained_model}_{ansatz}2.csv")
file3 = pd.read_csv(F"kfold_result_{trained_model}_{ansatz}3.csv")
file4 = pd.read_csv(F"kfold_result_{trained_model}_{ansatz}4.csv")
file5 = pd.read_csv(F"kfold_result_{trained_model}_{ansatz}5.csv")

file2.head()

In [None]:
from functools import reduce

names = file1["0"].tolist()
results = reduce(lambda a, b: a.add(b, fill_value=0), [file1["1"], file2["1"], file3["1"], file4["1"], file5["1"]]).tolist()

results_divided = []
for i in results:
  res = i/5
  results_divided.append(res)

d = {'Metrics':names,'Overall Results':results_divided}
overall_df = pd.DataFrame(d)
overall_df


In [None]:
overall_df.to_csv(F"overall_kfold_result_{trained_model}_{ansatz}.csv", encoding="utf-8")

In [None]:
files.download(F"overall_kfold_result_{trained_model}_{ansatz}.csv")

In [None]:
time.sleep(5)
files.download(F"confusion_matrix_{trained_model}_{ansatz}1.png")
time.sleep(5)
files.download(F"confusion_matrix_{trained_model}_{ansatz}2.png")
time.sleep(5)
files.download(F"confusion_matrix_{trained_model}_{ansatz}3.png")
time.sleep(5)
files.download(F"confusion_matrix_{trained_model}_{ansatz}4.png")
time.sleep(5)
files.download(F"confusion_matrix_{trained_model}_{ansatz}5.png")