Importação de Bibliotecas

In [None]:
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import spacy
from nltk.stem import PorterStemmer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Read the annotated tweet corpus from disk and report how many rows it holds
corpus = pd.read_csv('annotated.csv')
num_tweets = corpus.shape[0]
print(num_tweets)

# NLP tooling: NLTK's Porter stemmer and spaCy's small English pipeline
stemmer = PorterStemmer()
nlp = spacy.load('en_core_web_sm')

# Default stop-word set shipped with the loaded spaCy language
stop_words = nlp.Defaults.stop_words

Pré-processamento


In [None]:

# Preprocessing function
def preprocess_text(text):
    """Normalize one tweet into a space-separated string of stemmed lemmas.

    Steps, in order: strip http/https URLs, drop every character that is
    neither a word character nor whitespace, lowercase, tokenize with the
    spaCy pipeline ``nlp``, discard stop words, punctuation and whitespace
    tokens, then lemmatize each remaining token and stem the lemma with
    ``stemmer``.

    Args:
        text: Raw tweet text. Any non-string value (for example the NaN that
            pandas uses for an empty CSV cell) is treated as empty text.

    Returns:
        The processed tokens joined by single spaces; ``''`` when the input
        is not a string or no token survives the filters.
    """
    # Guard first: re.sub raises TypeError on NaN/None
    if not isinstance(text, str):
        return ''

    text = re.sub(r'https?:\/\/\S+', '', text)  # Remove URLs
    text = re.sub(r'[^\w\s]', '', text)  # Remove symbols and special characters
    text = text.lower()

    # Filter against the module-level `stop_words` set (spaCy defaults);
    # the text is already lowercased, so a plain membership test is enough.
    processed_tokens = [
        stemmer.stem(token.lemma_)  # lemmatize, then stem the lemma
        for token in nlp(text)
        if token.text not in stop_words
        and not token.is_punct
        and not token.is_space
    ]
    return ' '.join(processed_tokens)

# Clean every tweet and store the result in a new column beside the raw text
corpus['preprocessed_text'] = corpus['text'].map(preprocess_text)

# The remaining preprocessing and model-training steps stay the same

def count_entities(text):
    """Return how many named entities the spaCy pipeline finds in ``text``.

    Args:
        text: Text to analyze. Any non-string value (for example NaN from an
            empty CSV cell) is treated as containing no entities.

    Returns:
        The number of entity spans in ``nlp(text).ents``; 0 for non-string
        input.
    """
    # nlp() only accepts strings, so a missing value counts as zero entities
    if not isinstance(text, str):
        return 0
    doc = nlp(text)
    return len(doc.ents)

# Count entities on the raw tweet text, not on `preprocessed_text`: the
# preprocessed column has been lowercased, stripped of punctuation and stop
# words, and stemmed, which removes the casing and context the NER component
# works from and makes the counts unreliable.
corpus['entity_count'] = corpus['text'].apply(count_entities)

Preparação de dados para o modelo

In [None]:
# Inputs are the cleaned tweets; targets are the annotation labels
X = corpus['preprocessed_text']
y = corpus['annotation']

# Hold out 20% of the rows for evaluation, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# TF-IDF vectorization: vocabulary and IDF weights are fitted on the training split only
tfidf_vectorizer = TfidfVectorizer(max_df=0.9, min_df=3, stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# LDA topic model with 5 components, fitted on the training TF-IDF matrix
lda = LatentDirichletAllocation(n_components=5, random_state=42)
X_train_topics = lda.fit_transform(X_train_tfidf)
X_test_topics = lda.transform(X_test_tfidf)

# Final feature matrix = dense TF-IDF columns + topic weights + entity count.
# column_stack turns the 1-D entity-count vector into a single trailing column.
X_train_final = np.column_stack((
    X_train_tfidf.toarray(),
    X_train_topics,
    corpus.loc[X_train.index, 'entity_count'].to_numpy(),
))
X_test_final = np.column_stack((
    X_test_tfidf.toarray(),
    X_test_topics,
    corpus.loc[X_test.index, 'entity_count'].to_numpy(),
))

Treinamento do Modelo

In [None]:
# Disabled alternative classifier, left commented out:
# clf = MultinomialNB()
# clf.fit(X_train_final, y_train)
# y_pred = clf.predict(X_test_final)

# SVM model with a linear kernel; other kernels such as 'rbf' or 'poly' can be tried here
svm_clf = SVC(kernel='linear').fit(X_train_final, y_train)

# Predicted labels for the held-out split
y_pred = svm_clf.predict(X_test_final)

Métricas

In [None]:
# Evaluation metrics on the held-out split. The scoring helpers come from
# sklearn.metrics, imported with the other dependencies in the first cell.
accuracy = accuracy_score(y_test, y_pred)
# Macro averaging: each class contributes equally regardless of its frequency
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')
# Rows correspond to true labels, columns to predicted labels
conf_matrix = confusion_matrix(y_test, y_pred)

print(f'Precisão: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1-Score: {f1:.2f}')
print(f'Acurácia: {accuracy:.2f}')

Gráficos

In [None]:
# Tick labels must follow the order confusion_matrix uses for its rows and
# columns: the sorted set of labels that occur in y_test or y_pred.
# corpus['annotation'].unique() returns labels in order of first appearance
# (and over the whole corpus), which can mislabel the axes.
class_labels = np.unique(np.concatenate([np.asarray(y_test), np.asarray(y_pred)]))

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_xlabel('Previsão do Modelo')
ax.set_ylabel('Realidade')
ax.set_title('Matriz de Confusão')
plt.show()
