In [122]:
!python3 -m spacy download en_core_web_lg

Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [255]:
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
import yake
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords


In [124]:
nlp = spacy.load("en_core_web_lg")

In [256]:
nltk.download("stopwords")
stp_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/luankaio/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [265]:
def remover_stopwords(textos):
    textos_sem_stopwords = []
    for texto in textos:
        # Processar o texto com spaCy
        doc = nlp(texto)
        
        # Filtrar tokens que não são stopwords
        tokens_sem_stopwords = [token.text for token in doc if not token.is_stop]
        
        # Juntar os tokens de volta em um texto sem stopwords
        texto_sem_stopwords = ' '.join(tokens_sem_stopwords)
        
        textos_sem_stopwords.append(texto_sem_stopwords)
    
    return textos_sem_stopwords

In [262]:
data = pd.read_csv('spam.csv')

In [263]:
texto = data['Message']
previ = data['Category']

In [266]:
texto = remover_stopwords(texto)

In [267]:
# Função para calcular os pesos TF-IDF
def calcular_tfidf(textos):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(textos)
    words = vectorizer.get_feature_names_out()
    tfidf_weights = tfidf_matrix.toarray()
    return words, tfidf_weights

In [268]:
# Função para calcular os pesos YAKE
def calcular_yake(textos):
    kw_extractor = yake.KeywordExtractor(lan="en")
    yake_weights = []
    for texto in textos:
        keywords = kw_extractor.extract_keywords(texto)
        yake_weights.append({kw: score for kw, score in keywords})
    return yake_weights

In [269]:
# Processar os textos com spaCy para lematização
lemmatized_texts = [" ".join([token.lemma_ for token in nlp(texto)]) for texto in texto]

In [270]:
# Calcular os pesos TF-IDF
words, tfidf_weights = calcular_tfidf(lemmatized_texts)

In [271]:
# Calcular os pesos YAKE
yake_weights = calcular_yake(texto)

In [272]:
if len(textos) != len(yake_weights):
    raise ValueError("O número de textos não corresponde ao número de pesos YAKE calculados.")


In [273]:
word_indices = {word: idx for idx, word in enumerate(words)}

In [274]:
combined_weights = np.zeros((len(textos), len(words)))

In [369]:
for i in range(len(textos)):
    for word, tfidf_score in zip(words, tfidf_weights[i]):
        yake_score = yake_weights[i].get(word, 0)
        combined_weights[i][word_indices[word]] = tfidf_score * (1-yake_score)  # Combina os pesos (1 - yake_score) porque YAKE scores são inversamente proporcionais

In [370]:
X_train, X_val, y_train, y_val = train_test_split(tfidf_weights, previ, test_size=0.1, random_state=42)


In [371]:
floresta = RandomForestClassifier(n_estimators=500, random_state=42)

In [372]:
%%time
floresta.fit(X_train, y_train)

CPU times: user 1min 26s, sys: 253 ms, total: 1min 27s
Wall time: 1min 27s


In [373]:
previsoes = floresta.predict(X_test)

In [374]:
combined_weights

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [375]:
tfidf_weights

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [376]:
yake_weights

[{'jurong point': 0.050888965170882816,
  'crazy': 0.06588837669267192,
  'jurong': 0.2200560909321288,
  'point': 0.2200560909321288,
  'buffet': 0.4949246952252326,
  'Cine': 0.5880798524606783,
  'wat': 0.5880798524606783,
  'bugis': 0.68796622281612,
  'great': 0.68796622281612,
  'world': 0.68796622281612,
  'amore': 0.7625957055371981,
  'Cine got amore': 0.8131286944405928,
  'amore wat': 0.8131286944405928,
  'bugis n great': 0.8986050855370177,
  'great world': 0.8986050855370177},
 {'lar': 0.2005079697193566,
  'Joking wif': 0.4456055016437946,
  'wif u oni': 0.4456055016437946,
  'Joking': 0.46553351027698087,
  'oni': 0.46553351027698087,
  'wif': 0.6621399683957523},
 {'Cup final tkts': 0.0017407816134897426,
  'std txt rate': 0.002049543875182285,
  'receive entry question': 0.002960943606284211,
  'wkly comp win': 0.0034253872289706886,
  'win FA Cup': 0.01600240428718544,
  'Cup final': 0.01600240428718544,
  'wkly comp': 0.018818151346519908,
  'final tkts': 0.01881815

In [377]:
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = floresta.predict(X_val)

accuracy_score(y_test, previsoes)#apenas mensagens


0.9976076555023924