# Projeto final de Deep Learning 
## Equipe: Denilson Pedro, Gregory Lira, Lincoln Wallace
### Professor: Tiago Maritan 



## Análise de sentimentos: Impacto da tradução nos conjuntos de dados. 

In [37]:
!pip3 install transformers
!pip3 install dl-translate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Import das bibliotecas:

In [38]:
import tensorflow as tf
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from sklearn.preprocessing import OneHotEncoder
import re
from nltk import word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords, wordnet
import nltk
from sklearn.model_selection import train_test_split
import dl_translate as dlt
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Modelo de classificação de Tweets, treinado com 23 milhões de exemplos em inglês, sobre covid.

In [39]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
  
tokenizerR = AutoTokenizer.from_pretrained("rabindralamsal/BERTsent")

modelR = TFAutoModelForSequenceClassification.from_pretrained("rabindralamsal/BERTsent")

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0
All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at rabindralamsal/BERTsent.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


## Exemplo de classicação com o modelo citado acima:

In [40]:
example_tweet = "The NEET exams show our Govt in a poor light: unresponsiveness to genuine concerns; admit cards not delivered to aspirants in time; failure to provide centres in towns they reside, thus requiring unnecessary & risky travels. What a disgrace to treat our #Covid warriors like this!"
    
input = tokenizerR.encode(example_tweet, return_tensors="tf")
output = modelR.predict(input)[0]
prediction = tf.nn.softmax(output, axis=1).numpy()
sentiment = np.argmax(prediction)
    
print(prediction)
print(sentiment)

[[0.97267216 0.02368474 0.00364307]]
0


## Importação dos conjuntos de dados que foram usados.

In [6]:
df_portuguese = pd.read_csv("./export_TweetSentBR.csv")
completo = pd.read_csv("./datasetcompolaridade.csv")

## Tratamento dos dados:

In [7]:
completo.drop(columns = ['Unnamed: 0'], inplace = True)

In [8]:
completo['tweets_translater_unicamp'] = completo['tweets_translater_unicamp'].replace('','')
completo['tweets_translater_face'] = completo['tweets_translater_face'].replace('"','')

## Conjunto de dados original:

In [9]:
df_portuguese

Unnamed: 0,id,id_twitter,text,sentiment
0,1343,863044774588272640,Que coisa linda! O Programa #encontro estava m...,1
1,1344,865583716088766467,"Por mais #Encontro com as Irmãs Galvão, adorei...",1
2,1345,865063232201011201,Mr. CATRA @OficialMrCatra lançando sua nova mú...,1
3,1346,864668391008763905,quem viu aquela lutadora modela barbuda tatuad...,0
4,1347,865572794016378882,Tô passada com esse cara.... quanta merda pode...,-1
...,...,...,...,...
11560,12908,864636619000877056,eu ja to aqui pronto pro #MasterChefBR mas ain...,-1
11561,12909,863581588713603072,MALUCO! Uma coisa que eu não tenho coragem é e...,-1
11562,12910,864831041349054464,#MaisVoce @ANAMARIABRAGA está linda @RedeGlobo,1
11563,12911,863042798575951872,"Que orgulho de ti, @sportrecife! #Encontro",1


## Pré-processamento das frases:

In [10]:
def pre_X(frases):
    lista = []
    
    for frase in frases:
        lista.append(frase)
        
    return lista

def pre_Y(number):
    lista = []
    
    for numb in number:
        lista.append(numb)
    
    return lista


In [11]:
def set_array(frases):
    
    vocab = []
    palavras = []
    for frase in frases:
        
        text_array = remove_user(frase)
        text_array = Tokenize(text_array)
        text_array = text_array.split(' ')
        for i in range(len(text_array)):
            vocab.append(text_array[i])
    
    
        
    return vocab

def Tokenize(f):     ## Pre-processando a frase
    
    ## Colocando em minusculo
    ## Retirando a pontuaçao
    ## Retirando as StopWords
    
    f = f.lower().replace('\n', '').replace('-','').replace('#','').replace('.','').replace(',','').replace('!','').replace('r\n','').replace('  ','')
    token = RegexpTokenizer(r"\w+")
    f = token.tokenize(f)
    
    stop_words = set(stopwords.words('portuguese'))
    
    new_word = [word for word in f if not word in stop_words]
    
    return ' '.join(new_word)

def remove_user(frase):

    return re.sub('@\w+','',frase)

def translater_frases(frase):
    text_hi = frase
    translater = mt.translate(text_hi, source=dlt.lang.PORTUGUESE, target=dlt.lang.ENGLISH)
    
    return translater

def new_classifier(frase):
    input = tokenizerR.encode(frase, return_tensors="tf")
    output = modelR.predict(input)[0]
    prediction = tf.nn.softmax(output, axis=1).numpy()
    sentiment = np.argmax(prediction)
    
    return sentiment 


## Tratamento do conjunto de dados original:

In [12]:
df_remove = df_portuguese[df_portuguese['sentiment'] == '-']
df_portuguese = df_portuguese.drop(df_remove.index)
df_portuguese = df_portuguese.reset_index()
df_portuguese = df_portuguese.drop(columns=['index'])
df_portuguese['sentiment'] = df_portuguese['sentiment'].apply(lambda x: int(x))
Tweet = df_portuguese['text']
polarity = np.asarray(df_portuguese['sentiment'])

In [13]:
df_portuguese['text'] = df_portuguese['text'].apply(remove_user)
df_portuguese.drop(columns = ['id', 'id_twitter'], inplace = True)

## Conjunto de dados após o tratamento:

In [14]:
df_portuguese

Unnamed: 0,text,sentiment
0,Que coisa linda! O Programa #encontro estava m...,1
1,"Por mais #Encontro com as Irmãs Galvão, adorei...",1
2,Mr. CATRA lançando sua nova música PPK CHORA ...,1
3,quem viu aquela lutadora modela barbuda tatuad...,0
4,Tô passada com esse cara.... quanta merda pode...,-1
...,...,...
11523,eu ja to aqui pronto pro #MasterChefBR mas ain...,-1
11524,MALUCO! Uma coisa que eu não tenho coragem é e...,-1
11525,#MaisVoce está linda,1
11526,"Que orgulho de ti, ! #Encontro",1


## Modelos de tradução usados, são eles o Unicamp dl translation e dl translater m2m100 do Facebook:

In [41]:
tokenizer = AutoTokenizer.from_pretrained("unicamp-dl/translation-pt-en-t5")
model = AutoModelForSeq2SeqLM.from_pretrained("unicamp-dl/translation-pt-en-t5")
pten_pipeline = pipeline('text2text-generation', model=model, tokenizer=tokenizer)
mt = dlt.TranslationModel()

## Tratamento dos dados após as traduções:

In [16]:
completo = completo.dropna().reset_index().drop(columns = ['index'])
completo = completo.drop(columns= ['tweets_translater_unicamp'])

## Conjunto de dados completo:

In [17]:
completo

Unnamed: 0,tweets_original,tweets_translater_face,polaridade
0,Que coisa linda! O Programa #encontro estava m...,What a beautiful thing! The #touch program was...,1.0
1,"Por mais #Encontro com as Irmãs Galvão, adorei...","For more I met with the Galvan Sisters, I love...",1.0
2,Mr. CATRA lançando sua nova música PPK CHORA ...,Mr. CATRA releases his new song PPK CHORA on k...,1.0
3,quem viu aquela lutadora modela barbuda tatuad...,Who saw that fighter model tattooed? #MasterCh...,0.0
4,Tô passada com esse cara.... quanta merda pode...,How much shit can come out of someone’s mouth ...,-1.0
...,...,...,...
11505,a animação da filha da vanessa da mata é de se...,The animation of the daughter of the vanessa o...,1.0
11506,eu ja to aqui pronto pro #MasterChefBR mas ain...,I'm ready to go here for #MasterChefBR but I'm...,-1.0
11507,MALUCO! Uma coisa que eu não tenho coragem é e...,One thing I don’t have the courage is that nor...,-1.0
11508,#MaisVoce está linda,#MaisVoce is beautiful,1.0


In [18]:
classificacao = pd.DataFrame( data = {'classificaçãorabin_facetranslater': completo['tweets_translater_face'].apply(new_classifier)})

## Data frame com as classificações dos textos após as traduções com os modelos do Hugging e BERTsent:

In [19]:
classificacao

Unnamed: 0,classificaçãorabin_facetranslater
0,2
1,2
2,1
3,1
4,0
...,...
11505,1
11506,0
11507,0
11508,2


## Replace para comparação das polaridades.

In [20]:
classificacao['classificaçãorabin_facetranslater'] = classificacao['classificaçãorabin_facetranslater'].replace(0, -1).replace(1,0).replace(2,1)

In [21]:
classificacao

Unnamed: 0,classificaçãorabin_facetranslater
0,1
1,1
2,0
3,0
4,-1
...,...
11505,0
11506,-1
11507,-1
11508,1


## Acuracia da origem com classificador do rabin e traduzido com o tradutor do Facebook

In [22]:
soma = 0
for i in range (len(completo)):
      if(int(completo['polaridade'][i]) == int(classificacao['classificaçãorabin_facetranslater'][i])):
        soma+=1

soma/len(classificacao)

0.47002606429192006

## Rede conv1d:

In [31]:
vectorize_layer = TextVectorization(
 max_tokens=15000,
 output_mode='int',
 output_sequence_length=len(max(completo['tweets_original'])))

vocab = set_array(completo['tweets_original'])

vectorize_layer.adapt(np.unique(vocab))
len(vectorize_layer.get_vocabulary())

14727

In [32]:
one = OneHotEncoder(sparse=False)

In [33]:
X_pp = completo['tweets_original'].apply(Tokenize)
y_pp = completo['polaridade']
y_pp = one.fit_transform(y_pp.values.reshape(-1,1))
X_train, X_test, y_train, y_test = train_test_split(X_pp,y_pp, test_size= 0.3)

In [44]:
model = tf.keras.Sequential([
    vectorize_layer,
    tf.keras.layers.Embedding(
        input_dim=len(vectorize_layer.get_vocabulary()),
        output_dim=64,
        mask_zero=True),
    
    tf.keras.layers.Conv1D(8,3, activation='relu', kernel_regularizer=tf.keras.regularizers.l1(0.01)),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.MaxPooling1D(2),
    tf.keras.layers.Flatten(),
    
    tf.keras.layers.Dense(4, activation='relu'),
    tf.keras.layers.Dense(3, activation='softmax')   
])

model.compile(
    optimizer= tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.binary_crossentropy,
    metrics=['accuracy']
)

fit = model.fit(X_train, y_train, epochs=40, batch_size=20, validation_data=(X_test , y_test))
y_pred = model.predict(X_test)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [47]:
print(classification_report(np.argmax(y_test, axis=1), np.argmax(y_pred, axis=1)))

              precision    recall  f1-score   support

           0       0.39      0.52      0.45      1029
           1       0.00      0.00      0.00       867
           2       0.54      0.71      0.61      1557

    accuracy                           0.48      3453
   macro avg       0.31      0.41      0.35      3453
weighted avg       0.36      0.48      0.41      3453



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [49]:
X_pi = completo['tweets_translater_face'].apply(Tokenize)
y_pi = completo['polaridade']
y_pi = one.fit_transform(y_pi.values.reshape(-1,1))
X_train, X_test, y_train, y_test = train_test_split(X_pi,y_pi, test_size= 0.3)

In [50]:
model = tf.keras.Sequential([
    vectorize_layer,
    tf.keras.layers.Embedding(
        input_dim=len(vectorize_layer.get_vocabulary()),
        output_dim=64,
        mask_zero=True),
    
    tf.keras.layers.Conv1D(8,3, activation='relu', kernel_regularizer=tf.keras.regularizers.l1(0.01)),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.MaxPooling1D(2),
    tf.keras.layers.Flatten(),
    
    tf.keras.layers.Dense(4, activation='relu'),
    tf.keras.layers.Dense(3, activation='softmax')   
])

model.compile(
    optimizer= tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.binary_crossentropy,
    metrics=['accuracy']
)

fit = model.fit(X_pi, y_pi, epochs=40, batch_size=20, validation_data=(X_test , y_test))
y_predpi = model.predict(X_test)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [51]:
print(classification_report(np.argmax(y_test, axis=1), np.argmax(y_predpi, axis=1)))

              precision    recall  f1-score   support

           0       0.37      0.86      0.51       991
           1       0.00      0.00      0.00       901
           2       0.72      0.52      0.60      1561

    accuracy                           0.48      3453
   macro avg       0.36      0.46      0.37      3453
weighted avg       0.43      0.48      0.42      3453



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
