<a href="https://colab.research.google.com/github/HugoLeda/ML-Olympiad-Toxic-Language-PTBR-Detection/blob/main/Application.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [34]:
import pickle
import re

import joblib
import nltk
import pandas as pd
import requests
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from scipy.sparse import hstack

In [35]:
# Fetch the NLTK stop-word corpus read by `stopwords.words(...)` further down.
# The call returns True and only reports "already up-to-date" when the corpus is present.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [36]:
# NOTE(review): this installs the `wget` *Python package*, but none of the cells
# visible here import it — the downloads below go through the shell `wget`
# command (`!wget ...`). Presumably this cell is unnecessary; confirm before removing.
!pip install wget



In [37]:
def remove_repeated_chars(text):
  """Collapse each run of three or more identical characters into one.

  Runs of exactly two characters are kept as they are. The `.` in the
  pattern does not match a newline, so repeated line breaks are untouched.
  """
  repeated_run = r"(.)\1{2,}"
  single_char = r"\1"
  return re.sub(pattern=repeated_run, repl=single_char, string=text)

In [38]:
def cleanText(text):
  """Strip tweet-specific noise from a raw string.

  Steps, in order:
    1. drop every non-ASCII character;
    2. remove @mentions;
    3. remove http:// and https:// links;
    4. remove the placeholder tokens "rt", "user" and "https" (case-insensitive);
    5. collapse runs of 3+ identical characters (remove_repeated_chars);
    6. collapse any whitespace run into a single space and trim the ends.

  Args:
    text: raw tweet text (str).

  Returns:
    The cleaned string, without leading/trailing whitespace.
  """
  # NOTE(review): this removes emojis but ALSO accented letters
  # ("não" -> "no"). Left as-is because changing it would alter the features
  # fed to the already-trained model — confirm against the training pipeline.
  text = text.encode('ascii', 'ignore').decode('ascii')
  text = re.sub(r'@\w+', '', text)  # remove user mentions
  # Fixed: the pattern used to be r'htttps?//\S+' (three t's, missing colon),
  # which could never match a real URL, so links were left in the text.
  # NOTE(review): if training used the old pattern on raw URLs, verify that
  # removing links here does not shift the model's inputs.
  text = re.sub(r'https?://\S+', '', text)
  text = re.sub(r'\b(rt|user|https)\b', '', text, flags=re.IGNORECASE)  # remove placeholder words
  text = remove_repeated_chars(text)
  # Whitespace is normalised last so that the removals above cannot leave
  # double spaces behind (previously it ran before the word removal).
  text = re.sub(r'\s+', ' ', text)

  return text.strip()

In [39]:
def removeStopWords(text):
  """Lower-case `text`, strip punctuation and drop Portuguese stop words.

  Every character that is neither a word character nor whitespace is removed
  before the text is split on whitespace. The surviving words are returned
  joined by single spaces.
  """
  portuguese_stops = set(stopwords.words('portuguese'))
  normalized = re.sub(r'[^\w\s]', '', text.lower())

  kept = (
    token
    for token in normalized.split()
    if token.lower() not in portuguese_stops
  )
  return ' '.join(kept)

In [40]:
def portugueseStemmer(text):
  """Stem every whitespace-separated word of `text` for Portuguese.

  Uses NLTK's SnowballStemmer('portuguese').

  Args:
    text: string to stem (str).

  Returns:
    The stemmed words joined by single spaces ('' for empty input).
  """
  stemmer = SnowballStemmer('portuguese')
  # The text is split once; the previous version split it twice and kept the
  # first result in an unused local.
  return ' '.join(stemmer.stem(word) for word in text.split())

In [41]:
!wget https://raw.githubusercontent.com/HugoLeda/ML-Olympiad-Toxic-Language-PTBR-Detection/main/assets/toxic_words.csv

--2024-06-09 23:57:10--  https://raw.githubusercontent.com/HugoLeda/ML-Olympiad-Toxic-Language-PTBR-Detection/main/assets/toxic_words.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 70725 (69K) [text/plain]
Saving to: ‘toxic_words.csv.1’


2024-06-09 23:57:11 (884 KB/s) - ‘toxic_words.csv.1’ saved [70725/70725]



In [42]:
!wget https://raw.githubusercontent.com/HugoLeda/ML-Olympiad-Toxic-Language-PTBR-Detection/main/assets/non_toxic_words.csv

--2024-06-09 23:57:11--  https://raw.githubusercontent.com/HugoLeda/ML-Olympiad-Toxic-Language-PTBR-Detection/main/assets/non_toxic_words.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 70725 (69K) [text/plain]
Saving to: ‘non_toxic_words.csv.1’


2024-06-09 23:57:11 (898 KB/s) - ‘non_toxic_words.csv.1’ saved [70725/70725]



In [43]:
# Vocabulary lists, each taken from the `word` column of its CSV file.
# NOTE(review): the wget logs in this notebook report the same size
# (70725 bytes) for both CSV files — confirm they are not the same content.
toxic_words = pd.read_csv('toxic_words.csv')['word'].tolist()

# `df` stays bound to the non-toxic frame after this cell.
df = pd.read_csv('non_toxic_words.csv')
non_toxic_words = df['word'].tolist()

In [44]:
def count_neutral_words(text):
  """Count the words of `text` found in both vocabularies.

  A whitespace-separated word is counted when it is present in
  `non_toxic_words` and also in `toxic_words`; repeated words count each time.
  """
  return sum(
    1
    for token in text.split()
    if token in non_toxic_words and token in toxic_words
  )

In [45]:
def count_toxic_words(text):
  """Count the whitespace-separated words of `text` present in `toxic_words`.

  Repeated words are counted once per occurrence.
  """
  matches = [token for token in text.split() if token in toxic_words]
  return len(matches)

In [46]:
def count_non_toxic_words(text):
  """Count the whitespace-separated words of `text` present in `non_toxic_words`.

  Repeated words are counted once per occurrence.
  """
  matches = [token for token in text.split() if token in non_toxic_words]
  return len(matches)

In [47]:
# Directory holding the serialized artifacts. Change this single constant to
# load them from another location.
# NOTE(review): the path points into /content/drive, but no cell visible here
# mounts Google Drive — presumably done manually beforehand; confirm.
MODEL_DIR = '/content/drive/MyDrive/Fatec/Aprendizado de Maquina/O plano'

# SECURITY: joblib.load unpickles the file, which can execute arbitrary code.
# Only load artifacts from a source you trust.
model = joblib.load(f'{MODEL_DIR}/lr_model.pkl')
tfidf_vectorizer = joblib.load(f'{MODEL_DIR}/tfidf_vectorizer.pkl')

In [48]:
def makePredict(df):
  """Classify every row of `df` and store the prediction in `df['label']`.

  The frame is modified in place:
    * `text` is overwritten with its cleaned, stop-word-free, stemmed form;
    * `count_toxic_words`, `count_non_toxic_words`, `count_neutral_words`,
      `count_char` and `count_words` are added;
    * `label` is created/overwritten with the output of `model.predict`.

  Args:
    df: pandas DataFrame with a `text` column of strings. A pre-existing
      `label` column is not required.

  Returns:
    The same DataFrame object that was passed in. Callers that ignore the
    return value and read the mutated frame keep working.
  """
  # Nothing to classify: return before touching the vectorizer/model, which
  # may not accept zero rows.
  if len(df) == 0:
    return df

  # Text normalisation, in the same order as before.
  df['text'] = df['text'].apply(cleanText)
  df['text'] = df['text'].apply(removeStopWords)
  df['text'] = df['text'].apply(portugueseStemmer)

  # Hand-crafted numeric features computed on the normalised text.
  df['count_toxic_words'] = df['text'].apply(count_toxic_words)
  df['count_non_toxic_words'] = df['text'].apply(count_non_toxic_words)
  df['count_neutral_words'] = df['text'].apply(count_neutral_words)
  df['count_char'] = df['text'].apply(len)
  df['count_words'] = df['text'].apply(lambda x: len(x.split()))

  # The column order is kept exactly as in the original code.
  # NOTE(review): presumably it must match the order used at training time — confirm.
  X_other_features = df[['count_toxic_words', 'count_non_toxic_words', 'count_neutral_words', 'count_char', 'count_words']]
  X_text_tfidf = tfidf_vectorizer.transform(df['text'])

  # TF-IDF columns first, then the numeric features.
  X = hstack([X_text_tfidf, X_other_features])

  df['label'] = model.predict(X)

  return df

In [52]:
print('Verificar se tweets podem ser tóxicos\n-------------------------------------')
print('\n\nPara parar a execução digite "s"')

# Collect tweets until the user types the stop sentinel ("s" or "S",
# surrounding whitespace ignored). Blank entries are skipped.
tweets = []
while True:
  text = input('Digite um tweet para adicionar a lista de verificação: ')
  if text.strip().upper() == 'S':
    break
  if text.strip():
    tweets.append(text)

df = pd.DataFrame({
  'id': range(len(tweets)),
  'text': tweets,
  'label': ''
})

# Only call the model when at least one tweet was entered; stopping right
# away would otherwise send an empty frame to the vectorizer/model.
if tweets:
  makePredict(df)

print('\n')
# makePredict overwrites df['text'], so the original wording is taken from
# `tweets`; labels are paired with tweets by position.
for tweet, label in zip(tweets, df['label']):
  label_description = 'tóxico' if label == 1 else 'não tóxico'
  print(f'O tweet: "{tweet}", é classificado como: "{label_description}"')

Verificar se tweets podem ser tóxicos
-------------------------------------


Para parar a execução digite "s"
Digite um tweet para adicionar a lista de verificação: A China acabou de dar um grande passo à frente em IA.  Eles lançaram um modelo de texto para vídeo chamado KLING, e as pessoas estão enlouquecendo por isso.  Aqui estão 10 exemplos imperdíveis:  1. Um homem chinês senta à mesa e come macarrão com hashis
Digite um tweet para adicionar a lista de verificação: • Hoje é dia de 'O que ele te diria?' lido com o teu signo solar
Digite um tweet para adicionar a lista de verificação: Faz um carinho nesse baralho Van 🥲
Digite um tweet para adicionar a lista de verificação: De Camavinga para Mbappé... 🔥   Vem logo temporada 2024/25.
Digite um tweet para adicionar a lista de verificação: e a Vaidebet acaba de chegar em 900 mil seguidores de forma RELÂMPAGO.   depois da rescisão com o SCCP eles já tinham ficado abaixo de 700 mil seguidores e em apenas alguns minutos ganhaeam mais de 20