In [1]:
from transformers import BertModel,BertTokenizer,FlaubertTokenizer, FlaubertModel,AutoTokenizer, BertForSequenceClassification , FlaubertForSequenceClassification
from transformers.modeling_utils import SequenceSummary
from tensorboardX import SummaryWriter
import torch
from torch import nn
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss
from torch.nn.utils.rnn import pack_padded_sequence
import sys
import re
from models import BasicBertForClassification
from train import train_noFeatures
from preprocessing.text_preprocessing import TextPreprocessing
from preprocessing.feature_enginering import FeaturesExtraction
from transformers import  AutoModel
 

  re.IGNORECASE | re.VERBOSE)
  re.VERBOSE | re.IGNORECASE)
  UrlMatch_re = re.compile(UrlMatch_expression, re.VERBOSE | re.IGNORECASE)
  EmailMatch_re = re.compile(EmailMatch_expression, re.VERBOSE | re.IGNORECASE)


In [2]:
class FocalLoss2(nn.Module):
    """
    This is a implementation of Focal Loss with smooth label cross entropy supported which is proposed in
    'Focal Loss for Dense Object Detection. (https://arxiv.org/abs/1708.02002)'
        Focal_Loss= -1*alpha*(1-pt)*log(pt)

    Params:
        :param num_class:
        :param alpha: (tensor) 3D or 4D the scalar factor for this criterion
        :param gamma: (float,double) gamma > 0 reduces the relative loss for well-classified examples (p>0.5) putting more
                        focus on hard misclassified example
        :param smooth: (float,double) smooth value when cross entropy
        :param balance_index: (int) balance class index, should be specific when alpha is float
        :param size_average: (bool, optional) By default, the losses are averaged over each loss element in the batch.
    """

    def __init__(self, num_class, alpha=None, gamma=1, balance_index=-1, smooth=None, size_average=False):
        super(FocalLoss2, self).__init__()
        self.num_class = num_class
        self.alpha = alpha
        self.gamma = gamma
        self.smooth = smooth
        self.size_average = size_average

        if self.alpha is None:
            self.alpha = torch.ones(self.num_class, 1)
        elif isinstance(self.alpha, (list, np.ndarray)):
            assert len(self.alpha) == self.num_class
            self.alpha = torch.FloatTensor(alpha).view(self.num_class, 1)
            self.alpha = self.alpha / self.alpha.sum()
        elif isinstance(self.alpha, float):
            alpha = torch.ones(self.num_class, 1)
            alpha = alpha * (1 - self.alpha)
            alpha[balance_index] = self.alpha
            self.alpha = alpha
        else:
            raise TypeError('Not support alpha type')

        if self.smooth is not None:
            if self.smooth < 0 or self.smooth > 1.0:
                raise ValueError('smooth value should be in [0,1]')

    def forward(self, logit, target):

        #logit = F.softmax(input, dim=1)
        logit=torch.nn.functional.softmax(logit,dim=1)
        if logit.dim() > 2:
            # N,C,d1,d2 -> N,C,m (m=d1*d2*...)
            logit = logit.view(logit.size(0), logit.size(1), -1)
            logit = logit.permute(0, 2, 1).contiguous()
            logit = logit.view(-1, logit.size(-1))
        target = target.view(-1, 1)

        # N = input.size(0)
        # alpha = torch.ones(N, self.num_class)
        # alpha = alpha * (1 - self.alpha)
        # alpha = alpha.scatter_(1, target.long(), self.alpha)
        epsilon = 1e-6
        alpha = self.alpha.to(logit.device)

        idx = target.cpu().long()

        one_hot_key = torch.FloatTensor(target.size(0), self.num_class).zero_()
        one_hot_key = one_hot_key.scatter_(1, idx, 1)
        one_hot_key = one_hot_key.to(logit.device)

        if self.smooth:
            one_hot_key = torch.clamp(one_hot_key, self.smooth / (self.num_class - 1), 1.0 - self.smooth)
        pt = (one_hot_key * logit).sum(1) + epsilon
        logpt = pt.log()

        gamma = self.gamma

        alpha = alpha[idx]
        loss = -1 * alpha * torch.pow((1 - pt), gamma) * logpt

        if self.size_average:
            loss = loss.mean()
        else:
            loss = loss.sum()
        return loss


In [3]:
writer = SummaryWriter('runs/test')

In [4]:
import pandas as pd
df = pd.read_csv("/data/aboumada/Data/3_Datasets/full_df_noFeatures_Preapred.csv")
#df=df[df.classe2 != 'Poubelle']
len(df)

12826

In [5]:
ls /data/aboumada/Data/3_Datasets

13events_df_noFeatures.csv
Untitled.ipynb
Untitled1.ipynb
all_unlabeled.txt
anonym_final_en.csv
anonym_final_en_1.csv
[0m[38;5;27mback[0m/
corpus2.csv
[38;5;34mcorpus_2non_annotated.csv[0m*
corpus_PSY_Features
corpus_PSY_Features.csv
corpus_PSY_Preapred_Features.csv
corpus_PSY_Prepared_Features.csv
[38;5;34mcorpus_annote_RepPer - corpus_annote_RepPer.csv[0m*
data_clas
data_clas_export.pkl
data_lm
data_lm_export.pkl
df_train.csv
duplicates
duplicates.csv
final_en_1.csv
final_en_2.csv
full_df_Features.csv
full_df_noFeatures.csv
full_df_noFeatures2.csv
full_df_noFeatures_Preapred.csv
[38;5;27mg_data[0m/
g_data.csv
[38;5;27mmodels[0m/
moumene testing 2.ipynb
original_df_test.csv
original_df_train.csv
processd_final_en_1.csv
[38;5;34mracism-testing-testing_multi_label.csv[0m*
[38;5;34mtous-final_sansdoublon.xml[0m*
[38;5;34mtrain_multi_label.csv[0m*


In [6]:
text_preprocessing = TextPreprocessing(df,"TEXT")
text_preprocessing.fit_transform()

In [7]:
df

Unnamed: 0,CAT,ID,event,TEXT,nb_retweets,nb_likes,num_tweets,following,followers,likes,lists,CAT3,CAT2,text_clean,processed_text
0,Degats-Materiels,1.054066e+18,Aude,"Inondations dans l'Aude:dégâts ""de l'ordre de ...",-0.785795,-0.894291,0.956674,1.387893,0.349577,1.078412,-0.657592,Message-InfoUrgent,Message-Utilisable,"inondations dans l'aude:dégâts ""de l'ordre de ...",Inondations dans l Aude dégâts de l ordre de n...
1,Avertissement-conseil,9.267310e+17,Autre,Département Hérault en vigilance Jaune pluie ️...,1.647024,1.586924,-0.469193,1.005712,0.814903,1.892032,1.030431,Message-InfoUrgent,Message-Utilisable,département hérault en vigilance jaune pluie ️...,Département Hérault en vigilance Jaune pluie ️...
2,Avertissement-conseil,1.054363e+18,Aude,[#CodesCourtage] Inondations tragiques dans l’...,-0.785795,-0.894291,-0.829161,0.098955,0.104824,-0.498565,-0.657592,Message-InfoUrgent,Message-Utilisable,[#codescourtage] inondations tragiques dans l’...,CodesCourtage Inondations tragiques dans l Au...
3,AutresMessages,9.587054e+17,Autre,Merci à @FDSEA77 pour ces photos prises par #d...,1.755725,1.741995,-0.165393,0.863903,1.192329,0.537415,1.030431,Message-InfoNonUrgent,Message-Utilisable,merci à @fdseanumnum pour ces photos prises pa...,Merci à pour ces photos prises par drones qui ...
4,Avertissement-conseil,8.014572e+17,Autre,"[️ALERTE MÉTÉO⚠️] Vigilance #orange ""orages, p...",1.647024,1.476086,0.560304,0.570291,1.373599,1.748246,-0.657592,Message-InfoUrgent,Message-Utilisable,"[️ale e météo⚠️] vigilance #orange ""orages, pl...",️ALERTE MÉTÉO⚠️ Vigilance orange orages pluie...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12821,Message-NonUtilisable,7.000477e+17,Ulrika,J'aime une vidéo @YouTube de @davidlpokemon - ...,-0.785795,-0.894291,-0.255538,-1.511671,-3.149651,-0.687065,-0.657592,Message-NonUtilisable,Message-NonUtilisable,j'aime une vidéo @youtube de @davidlpokemon - ...,J aime une vidéo de Ouverture de numero Boos...
12822,Message-NonUtilisable,6.980546e+17,Ulrika,Ce soir je dors en #Bretagne :-)\nje dors en B...,-0.785795,-0.894291,-0.858333,1.028186,-0.244243,-0.386106,1.589166,Message-NonUtilisable,Message-NonUtilisable,ce soir je dors en #bretagne :-) je dors en br...,Ce soir je dors en Bretagne je dors en Bretagn...
12823,Message-NonUtilisable,6.989351e+17,Ulrika,Tempête Ulrika sur le Bassin d'Arcachon Andern...,-0.785795,-0.894291,0.461589,1.187429,-0.015061,-0.138041,-0.657592,Message-NonUtilisable,Message-NonUtilisable,tempête ulrika sur le bassin d'arcachon andern...,Tempête Ulrika sur le Bassin d Arcachon Andern...
12824,Message-NonUtilisable,6.987924e+17,Ulrika,Mi è piaciuto un video di @YouTube: http://you...,-0.785795,-0.894291,-0.632697,1.381086,-0.168708,-0.173275,-0.657592,Message-NonUtilisable,Message-NonUtilisable,mi è piaciuto un video di @youtube: tempête...,Mi è piaciuto un video di Tempête en Mer d I...


In [8]:
from sklearn.model_selection import train_test_split

#events_test= ['EffondrementMarseille']

events_test= ['Bruno', 'Eleanor']

df_test = df[df.event.isin(events_test)]

df_train = df[~df.event.isin(events_test)]

#df_train , df_test = train_test_split(df,random_state=1, test_size=0.2)

#df_train=df_en[['text_clean','CAT']]

In [9]:
df_train['processed_text'].iloc[600]

'Nouveau risque d inondation en hongrie un village évacué'

In [10]:
def get_sentences_labels(df,text_column='text_clean',label_column='CAT',cat_labels=None):
    dic_cat_labels = cat_labels if cat_labels is not None else {x:value for x,value in enumerate(df[label_column].unique())}
    print(dic_cat_labels)
    dic_labels_to_cat = {value:x for x,value in dic_cat_labels.items() }
    #df[text_column]= df[text_column].map(lambda text_clean : re.sub('["#$%&()*+,-./:;<=>@[\]^_`{|}~\n\t’\']', '', text_clean))
    df2 = df[label_column].map(dic_labels_to_cat)
    sentences = df[text_column].values
    labels = df2.values.astype(int)
    return sentences,labels,dic_cat_labels

def custom_sentences_labels(df,dic_cat,dic_cat2,text_column='text_clean',label_column='CAT'):
    dic_cat_labe = dic_cat
    dic_cat_labe2 = dic_cat2
    #df['texte']= df['texte'].map(lambda text_clean : re.sub('["#$%&()*+,-./:;<=>@[\]^_`{|}~\n\t’\']', '', text_clean))
    dic_labels_to_cat = {v: k for k, v in dic_cat_labe.items()}
    dic_labels_to_cat2 = {v: k for k, v in dic_cat_labe2.items()}
    sentences = df[text_column].values
    df_cat = df['classe1'].map(dic_labels_to_cat)
    labels_CAT = df_cat.values.astype(int)
    df_cat2 = df['classe2'].map(dic_labels_to_cat2)
    labels_CAT2 = df_cat2.values.astype(int)

    return sentences,(labels_CAT,labels_CAT2),dic_cat_labe

dic_cat_labels_CAT = {0: 'Poubelle', 1: 'UsageDetourne', 2: 'UsageMedical'}
dic_cat_labels_CAT3 = {0: 'Poubelle', 1: 'opinionNegative', 2: 'opinionPositive',3:'sansOpinion-ou-mixte'}

sentences_train,labels_train,dic_cat_labels=get_sentences_labels(df_train,text_column='processed_text',label_column='CAT')
sentences_test,labels_test,dic_cat_labels=get_sentences_labels(df_test,text_column='processed_text',label_column='CAT',cat_labels=dic_cat_labels)

#sentences_train,labels_train,_=custom_sentences_labels(df_train,dic_cat_labels_CAT,dic_cat_labels_CAT3,text_column='texte',label_column='CAT')
#sentences_test,test_labels,_=custom_sentences_labels(df_test,dic_cat_labels_CAT,dic_cat_labels_CAT3,text_column='texte',label_column='CAT')

{0: 'Degats-Materiels', 1: 'Avertissement-conseil', 2: 'AutresMessages', 3: 'Message-NonUtilisable', 4: 'Soutiens', 5: 'Degats-Humains', 6: 'Critiques'}
{0: 'Degats-Materiels', 1: 'Avertissement-conseil', 2: 'AutresMessages', 3: 'Message-NonUtilisable', 4: 'Soutiens', 5: 'Degats-Humains', 6: 'Critiques'}


In [11]:
'''crisis_names = ['irma','bruno','aude','harvey','eleanor','corse-fione','beryl−guadeloupe','corse','egon','susanna','ulrika','reunion−berguitta','marseille','effondrementmarseille','guadeloupe','corse','immeuble','martinique','saint martin','berguitta']
crisis_scrap = ['marseille','bruno','crue', 'crues', 'aude', 'carcassonne', 'trèbes', 'trebes','corse', 'corsica', 'hautecorse', 'haute-corse','crue','béryl', 'beryl', 'guadeloupe', 'ondetropicale','réunion', 'reunion', 'lareunion', 'fakir', 'laréunion','réunion', 'reunion', 'lareunion',' berguitta',' laréunion','corse', 'fionn', 'corsica', 'ana','irma','ouraganIRMA', 'saintmartin', 'stmartin', 'saintbarthelemy', 'saintbarth', 'stbarth','harvey', 'martinique', 'guadeloupe','egon','ulrika', 'vendée','bretagne','susanna']
crisis_scrap=crisis_scrap+crisis_names
print(crisis_scrap)
for i in range(len(sentences_train)):
    big_regex = re.compile('|'.join(map(re.escape, crisis_scrap)))
    sentences_train[i] = big_regex.sub(" ", sentences_train[i])'''

'crisis_names = [\'irma\',\'bruno\',\'aude\',\'harvey\',\'eleanor\',\'corse-fione\',\'beryl−guadeloupe\',\'corse\',\'egon\',\'susanna\',\'ulrika\',\'reunion−berguitta\',\'marseille\',\'effondrementmarseille\',\'guadeloupe\',\'corse\',\'immeuble\',\'martinique\',\'saint martin\',\'berguitta\']\ncrisis_scrap = [\'marseille\',\'bruno\',\'crue\', \'crues\', \'aude\', \'carcassonne\', \'trèbes\', \'trebes\',\'corse\', \'corsica\', \'hautecorse\', \'haute-corse\',\'crue\',\'béryl\', \'beryl\', \'guadeloupe\', \'ondetropicale\',\'réunion\', \'reunion\', \'lareunion\', \'fakir\', \'laréunion\',\'réunion\', \'reunion\', \'lareunion\',\' berguitta\',\' laréunion\',\'corse\', \'fionn\', \'corsica\', \'ana\',\'irma\',\'ouraganIRMA\', \'saintmartin\', \'stmartin\', \'saintbarthelemy\', \'saintbarth\', \'stbarth\',\'harvey\', \'martinique\', \'guadeloupe\',\'egon\',\'ulrika\', \'vendée\',\'bretagne\',\'susanna\']\ncrisis_scrap=crisis_scrap+crisis_names\nprint(crisis_scrap)\nfor i in range(len(senten

In [12]:
from sklearn.model_selection import train_test_split
from bertInput import BertInput

bert_input= BertInput(AutoTokenizer.from_pretrained('flaubert-base-cased'))


X_train = bert_input.fit_transform(sentences_train)
X_test = bert_input.fit_transform(sentences_test)


In [13]:
len(X_train[1])

11386

In [14]:
features = ['num_tweets','following','followers','likes','lists','nb_retweets','nb_likes']

In [15]:
# Use 90% for training and 10% for validation.
train_inputs, validation_inputs, train_labels, validation_labels,train_features,validation_features,train_masks,validation_masks = train_test_split(X_train[0], labels_train,df_train[features].astype(float).values.tolist(),X_train[1],random_state=1, test_size=0.2)
# Do the same for the masks.
#train_masks, validation_masks= train_test_split(,random_state=1, test_size=0.2)

test_inputs = X_test[0]
test_masks = X_test[1]
test_features = df_test[features].astype(float).values.tolist()
test_labels = labels_test

In [16]:
import torch 
# Convert all inputs and labels into torch tensors, the required datatype 
# for our model.
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
test_inputs = torch.tensor(test_inputs)

train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
test_labels = torch.tensor(test_labels)


train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)
test_masks = torch.tensor(test_masks)

train_features = torch.tensor(train_features)
validation_features = torch.tensor(validation_features)
test_features = torch.tensor(test_features)



In [17]:
print(len(train_labels))

9108


In [18]:
def get_label_callback(dataset,idx):
    return dataset[idx][3].item()

In [19]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# The DataLoader needs to know our batch size for training, so we specify it 
# here.
# For fine-tuning BERT on a specific task, the authors recommend a batch size of
# 16 or 32.

batch_size = 16

# Create the DataLoader for our training set.
train_data = TensorDataset(train_inputs,train_masks,train_features,train_labels)
train_sampler = RandomSampler(train_data)
#train_sampler = ImbalancedDatasetSampler(train_data,callback_get_label=get_label_callback)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size,drop_last=True )

# Create the DataLoader for our validation set.
validation_data = TensorDataset(validation_inputs,validation_masks ,validation_features,validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)


# Create the DataLoader for our test set.
test_data = TensorDataset(test_inputs,test_masks,test_features, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)


In [20]:
print(len(test_data[0]))

4


In [21]:
#model =FlaubertBertForSequenceClassification.from_pretrained('bert_fine_tuned_all_fr',num_labels = 7)
#model.bert.embeddings.requires_grad = False

In [22]:
base_model = AutoModel.from_pretrained("moumeneb1/flaubert-base-cased-ecology_crisis")

In [23]:
model = BasicBertForClassification(base_model,7)
model.cuda()

BasicBertForClassification(
  (bert): FlaubertModel(
    (position_embeddings): Embedding(512, 768)
    (embeddings): Embedding(68729, 768, padding_idx=2)
    (layer_norm_emb): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (attentions): ModuleList(
      (0): MultiHeadAttention(
        (q_lin): Linear(in_features=768, out_features=768, bias=True)
        (k_lin): Linear(in_features=768, out_features=768, bias=True)
        (v_lin): Linear(in_features=768, out_features=768, bias=True)
        (out_lin): Linear(in_features=768, out_features=768, bias=True)
      )
      (1): MultiHeadAttention(
        (q_lin): Linear(in_features=768, out_features=768, bias=True)
        (k_lin): Linear(in_features=768, out_features=768, bias=True)
        (v_lin): Linear(in_features=768, out_features=768, bias=True)
        (out_lin): Linear(in_features=768, out_features=768, bias=True)
      )
      (2): MultiHeadAttention(
        (q_lin): Linear(in_features=768, out_features=768, bias=Tr

In [24]:
from sklearn.metrics import f1_score,recall_score,precision_score

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

# Function to calculate the f1_score of our predictions vs labels
def flat_f1(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, pred_flat, average='macro')

def flat_recall(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return recall_score(labels_flat, pred_flat, average='macro')

def flat_precision(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return precision_score(labels_flat, pred_flat, average='macro')

In [25]:
from transformers import AdamW,get_linear_schedule_with_warmup

optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )


# Number of training epochs (authors recommend between 2 and 4)
epochs = 4

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs 

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

In [26]:
import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [27]:
import numpy as np
criterion = nn.CrossEntropyLoss()
train_noFeatures(model,train_dataloader,validation_dataloader,epochs,torch.device('cuda'),optimizer,scheduler,criterion,writer)
print("")
print("Training complete!")


Training...
  Batch    40  of    569.    Elapsed: 0:00:08.
  Batch    80  of    569.    Elapsed: 0:00:16.
  Batch   120  of    569.    Elapsed: 0:00:24.
  Batch   160  of    569.    Elapsed: 0:00:32.
  Batch   200  of    569.    Elapsed: 0:00:40.
  Batch   240  of    569.    Elapsed: 0:00:48.
  Batch   280  of    569.    Elapsed: 0:00:55.
  Batch   320  of    569.    Elapsed: 0:01:04.
  Batch   360  of    569.    Elapsed: 0:01:12.
  Batch   400  of    569.    Elapsed: 0:01:20.
  Batch   440  of    569.    Elapsed: 0:01:28.
  Batch   480  of    569.    Elapsed: 0:01:36.
  Batch   520  of    569.    Elapsed: 0:01:44.
  Batch   560  of    569.    Elapsed: 0:01:52.

  Average training loss: 0.62
  Training epcoh took: 0:01:54

Running Validation...


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
save currently the best model to [tmp]
save model parameters to [tmp]


  Accuracy: 0.83
  F1: 0.62
  Recall: 0.63
  Precision: 0.63
  Validation took: 0:00:09

Training...
  Batch    40  of    569.    Elapsed: 0:00:08.
  Batch    80  of    569.    Elapsed: 0:00:16.
  Batch   120  of    569.    Elapsed: 0:00:24.
  Batch   160  of    569.    Elapsed: 0:00:32.
  Batch   200  of    569.    Elapsed: 0:00:41.
  Batch   240  of    569.    Elapsed: 0:00:49.
  Batch   280  of    569.    Elapsed: 0:00:57.
  Batch   320  of    569.    Elapsed: 0:01:06.
  Batch   360  of    569.    Elapsed: 0:01:14.
  Batch   400  of    569.    Elapsed: 0:01:22.
  Batch   440  of    569.    Elapsed: 0:01:30.
  Batch   480  of    569.    Elapsed: 0:01:38.
  Batch   520  of    569.    Elapsed: 0:01:46.
  Batch   560  of    569.    Elapsed: 0:01:54.

  Average training loss: 0.39
  Training epcoh took: 0:01:56

Running Validation...


save currently the best model to [tmp]
save model parameters to [tmp]


  Accuracy: 0.85
  F1: 0.65
  Recall: 0.65
  Precision: 0.67
  Validation took: 0:00:09

Training...
  Batch    40  of    569.    Elapsed: 0:00:08.
  Batch    80  of    569.    Elapsed: 0:00:16.
  Batch   120  of    569.    Elapsed: 0:00:24.
  Batch   160  of    569.    Elapsed: 0:00:32.
  Batch   200  of    569.    Elapsed: 0:00:41.
  Batch   240  of    569.    Elapsed: 0:00:49.
  Batch   280  of    569.    Elapsed: 0:00:57.
  Batch   320  of    569.    Elapsed: 0:01:05.
  Batch   360  of    569.    Elapsed: 0:01:13.
  Batch   400  of    569.    Elapsed: 0:01:21.
  Batch   440  of    569.    Elapsed: 0:01:29.
  Batch   480  of    569.    Elapsed: 0:01:37.
  Batch   520  of    569.    Elapsed: 0:01:45.
  Batch   560  of    569.    Elapsed: 0:01:53.

  Average training loss: 0.26
  Training epcoh took: 0:01:55

Running Validation...
  Accuracy: 0.84
  F1: 0.64
  Recall: 0.65
  Precision: 0.66
  Validation took: 0:00:07

Training...
  Batch    40  of    569.    Elapsed: 0:00:08.
  Batch 

In [28]:
writer.add_text('Train', 'This is an lstm')

In [29]:
# Prediction on test set

print('Predicting labels for {:,} test sentences...'.format(len(test_dataloader)))

# Put model in evaluation mode
model.eval()


# Tracking variables 
predictions_cat,predictions_cat3,predictions_cat2 , true_labels_cat,true_labels_cat2  = [], [],[],[],[]

# Predict 
for batch in test_dataloader:
    # Add batch to GPU
    batch = tuple(t.to(torch.device("cuda")) for t in batch)

    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask,b_features,b_labels_cat = batch

    # Telling the model not to compute or store gradients, saving memory and 
    # speeding up prediction
    with torch.no_grad():
        # Forward pass, calculate logit predictions
        outputs = model((b_input_ids,b_input_mask,b_features))
        logits_cat = outputs[0]

    # Move logits and labels to CPU
    logits_cat = logits_cat.detach().cpu().numpy()
    label_ids_cat = b_labels_cat.to('cpu').numpy()
    predictions_cat.extend(logits_cat)
    true_labels_cat.extend(label_ids_cat)
    

    



print('    DONE.')



Predicting labels for 90 test sentences...
    DONE.


In [30]:
import numpy as np
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

pred_flat_cat = np.argmax(predictions_cat, axis=1)
true_labels_cat=[dic_cat_labels.get(x) for x in true_labels_cat]
pred_flat_cat = [dic_cat_labels.get(x) for x in pred_flat_cat]


cr= classification_report(true_labels_cat,pred_flat_cat,digits=4)
print(accuracy_score(pred_flat_cat,true_labels_cat))
print(cr)

0.86875
                       precision    recall  f1-score   support

       AutresMessages     0.0811    0.2000    0.1154        15
Avertissement-conseil     0.6983    0.8571    0.7696       189
       Degats-Humains     0.6000    0.7778    0.6774        27
     Degats-Materiels     0.4773    0.7000    0.5676        30
Message-NonUtilisable     0.9595    0.8861    0.9213      1176
             Soutiens     0.3333    0.6667    0.4444         3

             accuracy                         0.8688      1440
            macro avg     0.5249    0.6813    0.5826      1440
         weighted avg     0.8980    0.8688    0.8801      1440



In [31]:
model.save("Crisis_Binary_flaubert_base.pth")

save model parameters to [Crisis_Binary_flaubert_base.pth]


In [32]:
model = BasicBertForClassification.load("Pycho_sentiment_bert_adepted.pth")

FileNotFoundError: [Errno 2] No such file or directory: 'Pycho_sentiment_bert_adepted.pth'

In [None]:
tokenizer = ATokenizer.from_pretrained('bert-base-multilingual-cased')
sentences = df["texte"]
bert_input = BertInput(tokenizer)
sentences = bert_input.fit_transform(sentences)
input_ID = torch.tensor(sentences[0])
input_MASK = torch.tensor(sentences[1])
print(len(sentences))

input_ID = torch.tensor(sentences[0])
input_MASK = torch.tensor(sentences[1])

tensor_dataset = TensorDataset(
    input_ID, input_MASK)

dataloader = DataLoader(
    tensor_dataset, batch_size=1, shuffle=False, num_workers=4)

pred = []
for index, batch in enumerate(dataloader):
    output = model(batch)
    label_index = np.argmax(output[0].cpu().detach().numpy())
    print(index)
    pred.append(labels_dict.get(label_index))
df['prediction'] = pred

In [None]:
name_to_cat = {v:k for k,v in dic_cat_labels_CAT.items()}

In [None]:
liste = []
for _,row in df_test.iterrows():
    x_input = []
    x_input = berts_input([row['text_clean']],BertTokenizer.from_pretrained('bert-base-multilingual-cased'))
    x_input = torch.tensor(x_input)
    x_input = x_input.to(torch.device('cuda'))
    output = model(x_input)
    output[0].cpu
    output[0].cpu()
    pred_flat = np.argmax(output[0].cpu().detach().numpy())
    if pred_flat!=name_to_cat[row['CAT']]:
        print('Annotation human:',row['CAT'])
        print('Annotation Machine:',dic_cat_labels_CAT[pred_flat] )
        print(row['TEXT'])
        liste.append([row['TEXT'],row['CAT'],dic_cat_labels_CAT[pred_flat]])
        print('\n')


In [None]:
df_resultat = pd.DataFrame(liste,columns=['Text','Human annotation','Machine Annotation'])

In [None]:
df_resultat.to_csv('error.csv',index=False)

In [None]:
x_input = torch.tensor(x_input)


In [None]:
# Three classes
x_input = x_input.to(F)

In [None]:
output = model(x_input)

In [None]:
output[0].cpu

In [None]:
output[0].cpu()

In [None]:
pred_flat = np.argmax(output[0].cpu().detach().numpy())

In [None]:
pred_flat

In [None]:
x=df_test['text_clean'].values[0]

In [None]:
df

In [None]:
max_len = max(len(s) for s in sentences)

In [None]:
from models import BasicBertForClassification, BertFeaturesForSequenceClassification
from transformers import AutoTokenizer

# When you add a model or a domain to your app Just import your model and add the path to it
models_dic = {
    "crisis_binary": {
        "bert_base_cased": {
            "model": BasicBertForClassification,
            "path": "dqdsqdq",
        },
        "flaubert_base_cased": {
            "model": BasicBertForClassification,
            "path": "dqshdbjq"
        },
        "flaubert_base_features": {
            "model": BertFeaturesForSequenceClassification,
            "path": "dqshdbjq",
            "features": ['nbretweet', 'nblike']
        },
        "labels_dic": {
            0: 'Message-Utilisable',
            1: 'Message-NonUtilisable'
        }
    },
    "crisis_Three_Class": {
        "bert_base_cased": {
            "model": BasicBertForClassification,
            "path": "dqdsqdq"
        },
        "flaubert_base_cased": {
            "model": BasicBertForClassification,
            "path": "dqshdbjq"
        },
        "flaubert_base_features": {
            "model": BasicBertForClassification,
            "path": "dqshdbjq",
            "fatures": ["dqsdqsd", ""]
        },
        "labels_dic": {
            0: 'Message-InfoUrgent',
            1: 'Message-InfoNonUrgent',
            2: 'Message-NonUtilisable'}
    },
    "crisis_MultiClass": {
        "bert_base_cased": {
            "model": BasicBertForClassification,
            "path": "dqdsqdq"
        },
        "flaubert_base_cased": {
            "model": BasicBertForClassification,
            "path": "dqshdbjq"
        },
        "flaubert_base_features": {
            "model": BasicBertForClassification,
            "path": "dqshdbjq",
            "fatures": ["dqsdqsd", ""]
        },
        "labels_dic": {
            0: 'Degats-Materiels',
            1: 'Avertissement-conseil',
            2: 'AutresMessages',
            3: 'Message-NonUtilisable',
            4: 'Soutiens',
            5: 'Degats-Humains',
            6: 'Critiques'}
    },
    "psycho_sentiment": {
        "bert_base_cased": {
            "model": BasicBertForClassification,
            "path": "dqdsqdq"
        },
        "flaubert_adapted_features": {
            "model": BertFeaturesForSequenceClassification,
            "path": "flaubert_classification.pth",
            "tokenizer_base" : "flaubert-base-cased",
            "features" : ['nbretweet','nblike'],
        },
        "labels_dic":{
            0: 'opinionNegative', 
            1: 'sansOpinion-ou-mixte', 
            2: 'opinionPositive'}
    },
    "psycho_use": {
        "model": BasicBertForClassification,
        "path": ",qldks,qdl"
    }
}


def get_model(domain, model_name):
    model = models_dic[domain][model_name]["model"].load(
        models_dic[domain][model_name]["path"])

    if "features" in models_dic[domain][model_name]:
        features = models_dic[domain][model_name]["features"]
    else:
        features = []
    Tokenizer = AutoTokenizer.from_pretrained(
        models_dic[domain][model_name]["tokenizer_base"])

    return model, Tokenizer, models_dic[domain]["labels_dic"], features


In [None]:

featuresExtrator = FeaturesExtraction(df, "texte")
featuresExtrator.fit_transform()

# Preprocessing
text_preprocessing = TextPreprocessing(df, "texte")
text_preprocessing.fit_transform()

# Load model ,Tokenizer , labels_dict , features

model, tokenizer, labels_dict, features = get_model(
    "psycho_sentiment", "flaubert_adapted_features")

print(features)
# get text
sentences = df_test["processed_text"]
bert_input = BertInput(tokenizer)
sentences = bert_input.fit_transform(sentences)
input_ID = torch.tensor(sentences[0])
input_MASK = torch.tensor(sentences[1])
print(len(sentences))
if features:
    features_column = df_test[features].values.astype(float).tolist()
    features_column = torch.tensor(features_column)
    tensor_dataset = TensorDataset(input_ID,input_MASK,features_column)
else:
    tensor_dataset = TensorDataset(sentences)

dataloader = DataLoader(
    tensor_dataset, batch_size=1, shuffle=False, num_workers=4)



In [None]:

# Create an empty list
sample_list = []
# Iterate over sequence of numbers from 0 to 9
for i in range(10):
    # Append each number at the end of list
    sample_list.append(i)
sample_list

In [None]:
pred = []
for index,batch in enumerate(dataloader):
    output = model(batch)
    label_index = np.argmax(output[0].cpu().detach().numpy())
    pred.append(labels_dict.get(label_index))
    df_test['predictions']=pred

In [None]:

df_test

In [None]:
liste = []
for _,row in df_test.iterrows():
    x_input = []
    x_input = bert_input.fit_transform([row['processed_text']])
    input_ID = torch.tensor(x_input[0][0])
    input_MASK = torch.tensor(x_input[1][0])
    features_c = row[features].values.astype(float).tolist()
    #features_c = torch.tensor(features_column)
    output = model((input_ID,input_MASK,features_c,))
    output[0].cpu
    output[0].cpu()
    pred_flat = np.argmax(output[0].cpu().detach().numpy())
    print(pred_flat)

In [None]:
sentences_text= df_test['processed_text']
sentences_text.iloc[120]

In [None]:
test_labels[29]

#### 