<a href="https://colab.research.google.com/github/HugoLeda/ML-Olympiad-Toxic-Language-PTBR-Detection/blob/main/Training_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import re
import nltk
import joblib
import pandas as pd
import numpy as np

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [3]:
!pip install wget

Collecting wget
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9656 sha256=52b7b74a2eb6687eab97cf17b1d6b7da5afe117b8a601e859ce9f697042ff243
  Stored in directory: /root/.cache/pip/wheels/8b/f1/7f/5c94f0a7a505ca1c81cd1d9208ae2064675d97582078e6c769
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [4]:
!wget https://raw.githubusercontent.com/HugoLeda/ML-Olympiad-Toxic-Language-PTBR-Detection/main/assets/train.csv

--2024-06-08 19:14:53--  https://raw.githubusercontent.com/HugoLeda/ML-Olympiad-Toxic-Language-PTBR-Detection/main/assets/train.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1573871 (1.5M) [text/plain]
Saving to: ‘train.csv’


2024-06-08 19:14:54 (20.6 MB/s) - ‘train.csv’ saved [1573871/1573871]



In [5]:
df = pd.read_csv('train.csv')
df.shape

(16800, 2)

In [6]:
df.columns

Index(['text', 'label'], dtype='object')

In [7]:
df.head()

Unnamed: 0,text,label
0,"rt @user olha quem chegouuuuu, nossos queridin...",0
1,veio umas teorias muito loucas na minha cabeça...,1
2,@user @user 😂😂😂😂mais nao tinha falado ontem qu...,0
3,rt @user quer ser filha da puta logo comigo qu...,1
4,vai besta 😂😂😂😂 casquei com a ultima foto,1


In [8]:
#clean tweets
def remove_repeated_chars(text):
 return re.sub(r"(.)\1{2,}", r"\1", text)

def cleanText(text):
  text = text.encode('ascii', 'ignore').decode('ascii') #remove emojis
  text = re.sub(r'@\w+', '', text) # remove users mentions
  text = re.sub(r'htttps?//\S+', '', text) #remove links
  text = re.sub(r'\s+', ' ', text) #remove extra spaces
  text = re.sub(r'\b(rt|user|https)\b', '', text, flags=re.IGNORECASE) #remove some words
  text = remove_repeated_chars(text)

  return text.strip()

In [9]:
#remove stop words
def removeStopWords(text):
  stopWords = set(stopwords.words('portuguese'))
  text = text.lower()
  text = re.sub(r'[^\w\s]', '', text)

  words = text.split()

  filteredWords = [word for word in words if word.lower() not in stopWords]

  cleanedText = ' '.join(filteredWords)

  return cleanedText

In [10]:
def portugueseStemmer(text):
  stemmer = SnowballStemmer('portuguese')

  words = text.split()
  stemmedWords = [stemmer.stem(word) for word in text.split()]
  stemmedText = ' '.join(stemmedWords)

  return stemmedText

In [11]:
df['text'] = df['text'].apply(cleanText)
df['text'] = df['text'].apply(removeStopWords)
df['text'] = df['text'].apply(portugueseStemmer)

In [12]:
df.head()

Unnamed: 0,text,label
0,olha cheg queridinh vem dir fem 1015 masc 2540...,0
1,vei umas teor louc cab agor pqp to assust,1
2,nao fal ontem ia patrocin nad pud viol moral f...,0
3,quer filh put log comig 50x pior k fic sapatinh,1
4,vai best casqu ultim fot,1


In [13]:
def get_toxic_words(df):
  toxic_words_list = set()
  for text in df[df['label'] == 1]['text'].str.split():
    toxic_words_list.update(text)
  return toxic_words_list

def get_non_toxic_words(df):
  toxic_words_list = set()
  for text in df[df['label'] == 1]['text'].str.split():
    toxic_words_list.update(text)
  return toxic_words_list

In [14]:
toxic_words = get_toxic_words(df)
non_toxic_words = get_toxic_words(df)

In [15]:
def count_neutral_words(text):
  count = 0
  for word in text.split():
    if ((word in non_toxic_words) and (word in toxic_words)):
      count += 1
  return count

In [16]:
def count_toxic_words(text):
  count = 0
  for word in text.split():
    if word in toxic_words:
      count += 1
  return count

In [17]:
def count_non_toxic_words(text):
  count = 0
  for word in text.split():
    if word in non_toxic_words:
      count += 1
  return count

In [18]:
df['count_toxic_words'] = df['text'].apply(count_toxic_words)
df['count_non_toxic_words'] = df['text'].apply(count_non_toxic_words)
df['count_neutral_words'] = df['text'].apply(count_neutral_words)

df['count_char'] = df['text'].apply(lambda x: len(x))
df['count_words'] = df['text'].apply(lambda x: len(x.split()))

In [19]:
df.head(20)

Unnamed: 0,text,label,count_toxic_words,count_non_toxic_words,count_neutral_words,count_char,count_words
0,olha cheg queridinh vem dir fem 1015 masc 2540...,0,5,5,5,63,11
1,vei umas teor louc cab agor pqp to assust,1,9,9,9,41,9
2,nao fal ontem ia patrocin nad pud viol moral f...,0,10,10,10,59,11
3,quer filh put log comig 50x pior k fic sapatinh,1,10,10,10,47,10
4,vai best casqu ultim fot,1,5,5,5,24,5
5,sei oq chocant botafoguens botafoguens fal tor...,1,8,8,8,55,8
6,pois man cois chat porr,0,5,5,5,23,5
7,odei sent dio algum pq t trs porr tel celul po...,1,14,14,14,60,14
8,ganh rasp cab tcoucfwfabm sorteiochipart,0,2,2,2,40,5
9,vlw man tmj,0,3,3,3,11,3


In [20]:
X_text = df['text']
X_other_features = df[['count_toxic_words', 'count_non_toxic_words', 'count_neutral_words', 'count_char', 'count_words']]

In [21]:
vectorizer = TfidfVectorizer(ngram_range=(1, 2), smooth_idf=True)
matriz_tfidf = vectorizer.fit_transform(df['text']).toarray()

In [22]:
tfidf_vectorizer = TfidfVectorizer()
X_text= tfidf_vectorizer.fit_transform(X_text)

In [23]:
from scipy.sparse import hstack
X = hstack([X_text, X_other_features])
y = df['label']

In [24]:
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

In [25]:
df.head()

Unnamed: 0,text,label,count_toxic_words,count_non_toxic_words,count_neutral_words,count_char,count_words
0,olha cheg queridinh vem dir fem 1015 masc 2540...,0,5,5,5,63,11
1,vei umas teor louc cab agor pqp to assust,1,9,9,9,41,9
2,nao fal ontem ia patrocin nad pud viol moral f...,0,10,10,10,59,11
3,quer filh put log comig 50x pior k fic sapatinh,1,10,10,10,47,10
4,vai best casqu ultim fot,1,5,5,5,24,5


In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [27]:
models = {
    'Random Forest': RandomForestClassifier(),
    'Logistic Regression': LogisticRegression(max_iter=600),
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier()
}

results = []

for name, model in models.items():
  model.fit(X_train, y_train)
  y_pred_train = model.predict(X_train)
  y_pred_test = model.predict(X_test)

  accuracy_train = accuracy_score(y_train, y_pred_train)
  accuracy_test = accuracy_score(y_test, y_pred_test)

  report_train = classification_report(y_train, y_pred_train, output_dict=True)
  report_test = classification_report(y_test, y_pred_test, output_dict=True)

  results.append({
      'Model': name,
      'Train accuracy': accuracy_train,
      'Test accuracy': accuracy_test,
      **report_train['weighted avg'],
      **report_test['weighted avg']
  })

df_results = pd.DataFrame(results)

df_results_sorted = df_results.sort_values(by='Train accuracy', ascending=False)

def style_format(val):
  if val == df_results_sorted.iloc[0]['Model']:
    color = 'green'
  elif val == df_results_sorted.iloc[1]['Model']:
    color = 'yellow'
  elif val == df_results_sorted.iloc[2]['Model']:
    color = 'red'
  else:
    color = 'black'
  return f'color: {color}; text-align: left;'

styled_df = df_results_sorted.style.applymap(style_format, subset=['Model'])

styled_df

Unnamed: 0,Model,Train accuracy,Test accuracy,precision,recall,f1-score,support
0,Random Forest,0.993676,0.761905,0.76118,0.761905,0.760551,3360
4,Decision Tree,0.993676,0.731548,0.731792,0.731548,0.73166,3360
1,Logistic Regression,0.883259,0.836012,0.838274,0.836012,0.836445,3360
3,KNN,0.81131,0.722024,0.72833,0.722024,0.723016,3360
2,SVM,0.677232,0.661012,0.659205,0.661012,0.659599,3360


In [28]:
lr_model = LogisticRegression(max_iter=600)

In [29]:
lr_model.fit(X_train, y_train)

In [30]:
y_pred_train = lr_model.predict(X_train)
y_pred_test = lr_model.predict(X_test)

In [31]:
print("ClassificationReport (Training Set):\n", classification_report(y_train, y_pred_train))

ClassificationReport (Training Set):
               precision    recall  f1-score   support

           0       0.93      0.86      0.89      7544
           1       0.84      0.91      0.87      5896

    accuracy                           0.88     13440
   macro avg       0.88      0.89      0.88     13440
weighted avg       0.89      0.88      0.88     13440



In [32]:
print("ClassificationReport (Test Set):\n", classification_report(y_test, y_pred_test))

ClassificationReport (Test Set):
               precision    recall  f1-score   support

           0       0.87      0.83      0.85      1881
           1       0.79      0.85      0.82      1479

    accuracy                           0.84      3360
   macro avg       0.83      0.84      0.83      3360
weighted avg       0.84      0.84      0.84      3360



In [33]:
accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy_test = accuracy_score(y_test, y_pred_test)
percentage_difference = (accuracy_train-accuracy_test) * 100
print("Diferença Percentual:", percentage_difference, "%")

Diferença Percentual: 4.724702380952383 %


In [34]:
joblib.dump(lr_model, 'lr_model.pkl')

['lr_model.pkl']

In [35]:
df = pd.DataFrame({
    'id': range(len(y_pred_test)),
    'label': y_pred_test
})

df.to_csv('sample_submission.csv', index=False, header=True)