In [None]:
!pip install tpot

In [None]:
import joblib
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.multioutput import MultiOutputClassifier
import warnings
# warnings.filterwarnings('ignore')
from tpot import TPOTClassifier

# Train for binary classification

In [None]:
# Binary "offensive / not offensive" model:
# read the CSV, encode the label as 0/1, split, vectorize the comments with
# TF-IDF, run a TPOT search, report held-out accuracy and persist the artifacts.
df = pd.read_csv('/kaggle/input/hatebr/hate-br.csv')

# Encode the label column: 'TRUE' -> 1, 'FALSE' -> 0 (case-insensitive).
# Any other value maps to NaN, so stop here with a clear message instead of
# failing later inside the model search.
df['offensive_language'] = df['offensive_language'].astype(str).str.upper().map({'TRUE': 1, 'FALSE': 0})
if df['offensive_language'].isna().any():
    raise ValueError(
        "Column 'offensive_language' contains values other than TRUE/FALSE; "
        "cannot encode them as 0/1."
    )

X_text = df['instagram_comments']
y = df['offensive_language']

# Split the raw text BEFORE fitting the vectorizer so that the vocabulary and
# IDF weights are learned from training rows only (no test-set leakage).
text_train, text_test, y_train, y_test = train_test_split(
    X_text, y, test_size=0.2, random_state=42
)

vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

tpot = TPOTClassifier(
    generations=5,
    population_size=20,
    verbosity=2,
    random_state=42,
    n_jobs=-1,
    config_dict='TPOT sparse'
)

tpot.fit(X_train, y_train)

# Held-out accuracy of the best pipeline found by the search.
y_pred = tpot.predict(X_test)
print("Acurácia:", accuracy_score(y_test, y_pred))

# Export the winning pipeline as Python source, then persist the fitted
# pipeline together with the vectorizer it was trained with.
tpot.export('melhor_pipeline.py')

joblib.dump(tpot.fitted_pipeline_, 'modelo_ofensivo.pkl')
joblib.dump(vectorizer, 'vetorizador.pkl')

In [None]:
# Reload the persisted TF-IDF vectorizer and binary classifier.
# NOTE(review): joblib.load unpickles arbitrary objects; only load artifacts
# from a source you trust.
vectorizer = joblib.load('/kaggle/input/automl-binary/vetorizador.pkl')
modelo = joblib.load('/kaggle/input/automl-binary/modelo_ofensivo.pkl')

# Smoke test: classify one hand-written comment.
novo_comentario = ["Essa pessoa é nojenta!"]
X_novo = vectorizer.transform(novo_comentario)
pred = modelo.predict(X_novo)

# The first (and only) prediction is shown as a boolean.
eh_ofensivo = bool(pred[0])
print("É ofensivo?", eh_ofensivo)

É ofensivo? True


In [None]:
# Evaluate the loaded binary model on the held-out rows.
# `modelo` and `vectorizer` now refer to the artifacts loaded from disk, so the
# held-out comments are re-vectorized with that same vectorizer instead of
# reusing a matrix produced by the in-session one; this keeps the feature
# columns consistent with the model being evaluated.
# `y_test` keeps the original row labels of `df`, which selects the matching text.
X_holdout = vectorizer.transform(X_text.loc[y_test.index])

y_pred = modelo.predict(X_holdout)
print(classification_report(y_test, y_pred, target_names=['Não ofensivo', 'Ofensivo']))

# Train for multilabel classification

In [None]:
# Multilabel data preparation: read the CSV, pick the label columns, split,
# then build TF-IDF features.
df = pd.read_csv('/kaggle/input/unified-hate/unified-hate.csv')

# One binary target per hate-speech category.
label_cols = ["insult", "obscene", "ideology", "lgbtqphobia", "racism", "sexism", "xenophobia"]

# NOTE(review): the label columns are used as stored in the CSV (no TRUE/FALSE
# re-encoding is applied) — presumably they are already 0/1; confirm.
X_text = df['text']
y = df[label_cols]

# Split the raw text BEFORE fitting the vectorizer so that the vocabulary and
# IDF weights are learned from training rows only (no test-set leakage).
text_train, text_test, y_train, y_test = train_test_split(
    X_text, y, test_size=0.2, random_state=42
)

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

In [None]:
# Search settings for the TPOT estimator wrapped below.
tpot_params = dict(
    generations=5,
    population_size=20,
    verbosity=2,
    random_state=42,
    n_jobs=-1,
    config_dict='TPOT sparse',
)
tpot = TPOTClassifier(**tpot_params)

# Wrap TPOT so that one estimator is fitted per label column.
modelo = MultiOutputClassifier(tpot)
modelo.fit(X_train, y_train)

y_pred = modelo.predict(X_test)

# Per-label report: column `idx` of y_pred is compared with label_cols[idx].
for idx in range(len(label_cols)):
    label = label_cols[idx]
    print(f"\nMétricas para: {label}")
    print(classification_report(y_test[label], y_pred[:, idx]))

In [None]:
# Persist the fitted pipeline of each per-label estimator as modelo_<label>.pkl.
# estimators_ is walked in step with label_cols, so position k of one
# corresponds to position k of the other.
for label, estimador in zip(label_cols, modelo.estimators_):
    tpot_pipeline = estimador.fitted_pipeline_
    joblib.dump(tpot_pipeline, f'modelo_{label}.pkl')

In [22]:
# Persist the current `vectorizer`; the return value of joblib.dump (the list
# of written file names) is left as the cell's displayed output.
joblib.dump(value=vectorizer, filename='vetorizador_multilabel.pkl')

['vetorizador_multilabel.pkl']

In [None]:

# Label order must match the order used when the per-label models were saved.
label_cols = ["insult", "obscene", "ideology", "lgbtqphobia", "racism", "sexism", "xenophobia"]

# NOTE(review): joblib.load unpickles arbitrary objects; only load artifacts
# from a source you trust.
vectorizer = joblib.load('/kaggle/input/automl-multilabel/vetorizador_multilabel.pkl')

# Load one fitted pipeline per label, in label_cols order.
pipelines = []
for label in label_cols:
    pipelines.append(joblib.load(f'/kaggle/input/automl-multilabel/modelo_{label}.pkl'))

# Rebuild a MultiOutputClassifier shell around the loaded pipelines.
# NOTE(review): with estimator=None, `modelo.predict` may be rejected by some
# scikit-learn versions — confirm before relying on this wrapper.
modelo = MultiOutputClassifier(estimator=None)
modelo.estimators_ = pipelines


In [None]:
# Batch inference: label every tweet in the spreadsheet with the per-label
# pipelines and export id, tweet and the predicted labels.
base = "lula"
path = "/kaggle/input/2500-samples/lula_2500k.xlsx"
# Output file stem. The extension is appended per format below, so the Excel
# and CSV exports go to different files instead of sharing one (empty) path.
save_path = f"{base}_predicoes"

# 'id' is read as text — presumably to keep long numeric ids exact; confirm.
df = pd.read_excel(path, dtype={'id': str})

# Drop rows without text and renumber the remaining rows 0..n-1. Without the
# reset, the gaps left by dropna would make the column-wise concat below pair
# tweets with the wrong predictions and introduce NaN rows.
df = df.dropna(subset=['tweet']).reset_index(drop=True)

X = vectorizer.transform(df['tweet'].astype(str))

# One prediction vector per label; transpose to get one row per tweet.
y_pred = [pipeline.predict(X) for pipeline in pipelines]
y_pred_array = np.array(y_pred).T

# Share df's index explicitly so the concat aligns row-for-row.
df_predicoes = pd.DataFrame(y_pred_array, columns=label_cols, index=df.index)

df_resultado = pd.concat([df[['id', 'tweet']], df_predicoes], axis=1)
df_resultado["id"] = df_resultado["id"].astype(str)

df_resultado.to_excel(f"{save_path}.xlsx", index=False)
df_resultado.to_csv(f"{save_path}.csv", index=False)