In [None]:
# DEPS: third-party packages this notebook needs.
# NOTE(review): no versions are pinned, so a fresh install may pull releases
# that differ from the ones the notebook was written against.

%pip install pandas numpy nltk joblib scikit-learn tensorflow snscrape

# Load data

In [None]:
# Relative path of the folder that holds the dataset archive, the POS tagger
# pickle and the abbreviations module, and that receives the exported artifacts.
data_folder = '../data/'

In [None]:
import zipfile

# Unpack the dataset archive. extractall() is called without a target path,
# so the files land in the current working directory.
archive_path = data_folder+'twittes_data.zip'
with zipfile.ZipFile(archive_path, 'r') as archive:
    archive.extractall()

In [None]:
import pandas as pd

# Load the comments dataset from the current working directory.
# Later cells read its 'text' and 'toxic' columns.
df = pd.read_csv('comentarios_toxicos_ptBR.csv')

# Bare last expression so the notebook renders the frame.
df

# Analyzing the data

In [None]:
# Class balance: how many rows carry label 0 (non-toxic) and label 1 (toxic).
toxic_flags = df['toxic']
print('Não Tóxicos: ', int((toxic_flags == 0).sum()))
print('Tóxicos: ', int((toxic_flags == 1).sum()))

# Preprocessing

In [None]:
import re 
import nltk
# NLTK resources are downloaded at run time, interleaved with the imports
# that depend on them.
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('wordnet')
# NOTE(review): 'floresta' and the `wn` alias below are not referenced by any
# cell visible in this notebook - confirm whether they are still needed.
nltk.download('floresta')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk import word_tokenize
import joblib
import sys  
# Put data_folder on the import path so the local abbreviations_synonyms
# module can be imported.
sys.path.insert(1, data_folder)
from abbreviations_synonyms import abbreviations_synonyms_dict

# Pre-trained POS tagger used by remove_proper_nouns.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code - only load a tagger file obtained from a trusted source.
tagger = joblib.load(data_folder+'POS_tagger_bigram.pkl') # https://github.com/inoueMashuu/POS-tagger-portuguese-nltk/tree/master/trained_POS_taggers

def clean_text(text):
  """Strip @mentions and every character that is not a letter or a space.

  Args:
    text: raw tweet/comment text.

  Returns:
    The text with whitespace-separated tokens starting with '@' dropped,
    all non-letter characters removed and surrounding spaces trimmed.
    Accented Latin letters (e.g. in "não", "você") are preserved.
  """
  # split() without arguments breaks on any whitespace run, so a mention that
  # follows a newline or tab is still recognised, and words on adjacent lines
  # are not glued together once the line break is removed below.
  words = [word for word in text.split() if not word.startswith('@')]
  text = ' '.join(words)
  # Keep ASCII letters plus the Latin-1 accented letters used in Portuguese.
  # U+00D7 (multiplication sign) and U+00F7 (division sign) are deliberately
  # left out of the ranges because they are not letters.
  text = re.sub(r"[^A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u00FF ]+", '', text)
  return text.strip()

def replace_synonyms_abbreviations(text):
  """Expand abbreviations/synonyms found in the text as whole words.

  Every key of abbreviations_synonyms_dict is interpolated, unescaped, into a
  word-boundary regex and each match is replaced by the mapped value. Entries
  are applied one after another in dictionary order, so the output of one
  replacement can be rewritten by a later entry.

  Args:
    text: text to rewrite.

  Returns:
    The text after all replacements have been applied.
  """
  for pattern_source, replacement in abbreviations_synonyms_dict.items():
    whole_word = re.compile(r"\b" + f"{pattern_source}" + r"\b")
    text = whole_word.sub(replacement, text)
  return text

def remove_stop_words(text):
  """Drop Portuguese stop words from a space-separated string.

  Args:
    text: text whose words are separated by single spaces.

  Returns:
    The remaining words joined by single spaces. Empty tokens produced by
    consecutive spaces are kept as-is.
  """
  # A set gives constant-time membership tests; the list returned by NLTK
  # would otherwise be scanned linearly for every word.
  stopwords_pt = set(stopwords.words('portuguese'))

  kept_words = [word for word in text.split(' ') if word not in stopwords_pt]
  return ' '.join(kept_words)

def lemmatization_nltk(text):
  """Tokenize the text, lemmatize each token and lower-case the result.

  Tokenization uses nltk.word_tokenize with language='portuguese'; each token
  then goes through WordNetLemmatizer.lemmatize.
  NOTE(review): confirm that WordNetLemmatizer is effective for Portuguese
  words - it may leave most of them unchanged.

  Args:
    text: text to lemmatize.

  Returns:
    The lower-cased lemmas joined by single spaces.
  """
  lemmatizer = WordNetLemmatizer()
  lemmas = []
  for token in nltk.word_tokenize(text, language='portuguese'):
    lemmas.append(lemmatizer.lemmatize(token).lower())
  return ' '.join(lemmas)

def remove_proper_nouns(text):
  """Remove every token that the POS tagger labels as 'NPROP'.

  Args:
    text: text to filter; it is tokenized with word_tokenize before tagging.

  Returns:
    The tokens whose tag is not 'NPROP', joined by single spaces.
  """
  tagged_tokens = tagger.tag(word_tokenize(text))
  kept_tokens = [token for token, pos_tag in tagged_tokens if pos_tag != 'NPROP']
  return ' '.join(kept_tokens)

def normalize_text(text):
  """Run the whole normalization pipeline on one raw text.

  Steps, in order: clean_text, remove_proper_nouns, lower-casing,
  replace_synonyms_abbreviations, remove_stop_words, lemmatization_nltk.

  Args:
    text: raw tweet/comment text.

  Returns:
    The normalized text.
  """
  # Lower-casing happens only after proper nouns are removed because
  # capitalization influences remove_proper_nouns.
  without_proper_nouns = remove_proper_nouns(clean_text(text))
  expanded = replace_synonyms_abbreviations(without_proper_nouns.lower())
  return lemmatization_nltk(remove_stop_words(expanded))


In [None]:
# our own normalization
# Drop rows without text. .copy() makes the filtered frame independent of the
# frame it was sliced from, so the column assignment below does not write to
# a slice (which triggers pandas' SettingWithCopyWarning).
df = df[df['text'].notna()].copy()
# The normalized text is stored next to the raw text, in 'text_norm2'.
df['text_norm2'] = df['text'].apply(normalize_text)

In [None]:
# Preview the first 50 rows to compare the raw text with its normalized form.
df.head(50)

# Training the neural network

In [None]:
# Import functions from sklearn library
from sklearn.model_selection import train_test_split

# Splitting the data into training and testing sets:
# 20% of the rows go to the test set; random_state fixes the shuffle so the
# split is reproducible across runs.
train_data, test_data = train_test_split(df, test_size=0.2,random_state=16)
print("Train Data size:", len(train_data))
print("Test Data size", len(test_data))

In [None]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()

# Column that holds the text produced by normalize_text.
tweets_column='text_norm2'

# The vocabulary is fitted on the training split only.
tokenizer.fit_on_texts(train_data[tweets_column])
word_index = tokenizer.word_index
# Prints the whole vocabulary mapping.
print(word_index)

In [None]:
# One more than the number of known words.
# NOTE(review): presumably because Tokenizer indices start at 1 and index 0 is
# left for padding - confirm against the Keras Tokenizer documentation.
vocab_size = len(tokenizer.word_index) + 1
print("Vocabulary Size :", vocab_size)

In [None]:
from keras.utils import pad_sequences

# Encode each text as its sequence of word indices, then hand the sequences
# to pad_sequences() with maxlen=30 so every row has the same length.
train_sequences = tokenizer.texts_to_sequences(train_data[tweets_column])
test_sequences = tokenizer.texts_to_sequences(test_data[tweets_column])
x_train = pad_sequences(train_sequences, maxlen=30)
x_test = pad_sequences(test_sequences, maxlen=30)

In [None]:
# Targets: the 'toxic' column of each split.
y_train = train_data['toxic']
y_test = test_data['toxic']

# Sanity check on the number of labels in each split.
print(y_train.shape)
print(y_test.shape)

In [None]:
from keras import Sequential
from keras.layers import Dense,SimpleRNN,Embedding,Flatten, LSTM, Bidirectional

# Architecture: a 2-dimensional embedding over the vocabulary, two stacked
# SimpleRNN layers of 15 units (the first returns the full sequence so the
# second can consume it) and a single sigmoid output unit.
model = Sequential([
    Embedding(vocab_size, output_dim=2, input_length=30),
    SimpleRNN(15, return_sequences=True),
    SimpleRNN(15),
    Dense(1, activation='sigmoid'),
])

model.summary()

In [None]:
# Binary classification setup: Adam optimizer, binary cross-entropy loss,
# accuracy reported as 'acc'.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# NOTE(review): the test split doubles as validation data here, so the
# reported validation metrics are computed on the held-out test set.
history = model.fit(x_train, y_train, epochs=3, validation_data=(x_test,y_test))

# Testing the model

In [None]:
import snscrape.modules.twitter as sntwitter

# Retrieve tweets from one account, keeping at most max_tweets of them.
username = "biologia_braba"
max_tweets = 25

scraper = sntwitter.TwitterSearchScraper(f"from:{username}")
tweets = []
for tweet in scraper.get_items():
    # Stop as soon as the quota is reached.
    if len(tweets) >= max_tweets:
        break
    tweets.append(tweet)

# Only the raw text of each tweet is needed downstream.
content_tweets = []
for tweet in tweets:
    content_tweets.append(tweet.rawContent)
print(content_tweets)

In [None]:
import numpy as np

# Toxicity computation: the tweets go through the same normalization,
# tokenizer and padding length as the training data.
tweets_normalized = [normalize_text(tweet) for tweet in content_tweets]
tweets_tokens = pad_sequences(tokenizer.texts_to_sequences(tweets_normalized), maxlen = 30)  

y = model(tweets_tokens, training=False)
# The final Dense(1) layer yields one value per tweet in a trailing axis of
# size 1. Flatten it so each score is a true scalar: calling float() on a
# length-1 array is deprecated in recent NumPy releases.
scores = np.asarray(y).reshape(-1)

for normalized_tweet, original_tweet, score in zip(tweets_normalized, content_tweets, scores):
  print(normalized_tweet)
  print(original_tweet)
  print(f"Toxidade: {float(score)*100}%")
  print('------------------------------------------')

print(f"Média de toxicidade do twitter: {np.mean(y)*100}")

# Export model

In [None]:
import io
import json 

# Persist the trained model under data_folder/exported/.
model.save(data_folder+'exported/twitter_toxicity_model')
# Persist the fitted tokenizer next to it so inference code can encode text
# with the same vocabulary.
# NOTE(review): to_json() output is passed through json.dumps again before
# being written. If to_json() already returns a JSON string, the file holds a
# JSON-encoded string and readers must json.load it before rebuilding the
# tokenizer - confirm against the loading code before changing this.
tokenizer_json = tokenizer.to_json()
with io.open(data_folder+'exported/tokenizer.json', 'w', encoding='utf-8') as f:
  f.write(json.dumps(tokenizer_json, ensure_ascii=False))