In [4]:
import os
import pandas as pd
from functions import get_all_origins, find_pattern_for_quantity, convert_to_grams, relation_qnt_preco, remove_spaces, clean_text
import re
import numpy as np
import ast

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

back_words_contained_in_the_title = "back_words_contained_in_the_title"
back_words_not_contained_in_the_title = "back_words_not_contained_in_the_title"
back_words_without_title = "back_words_without_title"

# df = pd.read_csv("./back_words.csv")

def analisys(file, palavras):
    df = get_all_origins(file)
    df['back_word'] = df['back_word'].str.replace('\d+', '', regex=True)
    df = df[df['back_word'].str.contains('[a-zA-Z]', regex=True)]
    print(df.info())

    grouped_df = df.groupby(['back_word', "row_number"]).size().reset_index(name='count')
    grouped_df = grouped_df[grouped_df['back_word'].isin(palavras)]
    print(grouped_df.sort_values("count", ascending=False))

df = get_all_origins(back_words_contained_in_the_title)
df.head()

Unnamed: 0,ref,back_word,location,subject,row_number
0,3fc276b6,real,36,creatina,1
1,3fc276b6,para,36,creatina,2
2,3fc276b6,ganhos,36,creatina,3
3,3fc276b6,reais,36,creatina,4
4,3fc276b6,a,36,creatina,5


In [5]:
import pandas as pd
import numpy as np

def load_brazilian_portuguese_words():
    personal_pronouns = ["Eu", "Tu", "Ele", "Ela", "Nós", "Vós", "Eles", "Elas", "Mim", "Ti", "Si", "Consigo"]
    oblique_pronouns = ["Me", "Te", "Se", "Nos", "Vos", "O", "A", "Lhe", "Os", "As", "Nos", "Vos", "Se", "Convosco", "Lhes", "Contigo"]
    demonstrative_pronouns = ["Este", "Esse", "Aquele", "Esta", "Essa", "Aquela", "Isto", "Isso", "Aquilo", "Estes", "Esses", "Aqueles", "Estas", "Essas", "Aquelas", "Iste"]
    possessive_pronouns = ["Meu", "Teu", "Seu", "Nosso", "Vosso", "Seu", "Minha", "Tua", "Sua", "Nossa", "Vossa", "Sua", "Meus", "Teus", "Seus", "Nossos", "Vossos", "Minhas", "Tuas", "Suas", "Nossas", "Vossas"]
    indefinite_pronouns = ["Alguém", "Ninguém", "Todo", "Algum", "Nenhum", "Outro", "Muito", "Pouco", "Tanto", "Cada", "Algo", "Tudo", "Nada", "Cada um", "Qualquer", "Poucos", "Muitos", "Vários", "Outrem"]
    relative_pronouns = ["Que", "Qual", "Quem", "Onde", "Cujo", "O qual", "Cuja", "Quanto"]
    interrogative_pronouns = ["Quem", "O que", "Qual", "Quanto", "Onde", "Quando", "Como", "Por que", "Qualquer coisa", "Quanto a"]
    prepositions = ["A", "Ante", "Até", "Após", "Com", "Contra", "De", "Desde", "Em", "Entre", "Para", "Por", "Perante", "Sem", "Sob", "Sobre", "Trás", "Conforme", "Contudo", "Durante", "Exceto", "Mediant", "Menos", "Salvo", "Segundo", "Visto"]
    BRAZIL_PRONOUNS = personal_pronouns + oblique_pronouns + demonstrative_pronouns + possessive_pronouns + indefinite_pronouns + relative_pronouns + interrogative_pronouns + prepositions

    conectores = ['e', 'ou', 'nem', 'mas', 'porque', 'como', 'apesar', 'além', 'entretanto', 'porém', 'todavia', 'logo', 'portanto', 'assim', 'contudo', 'embora', 'ainda', 'também', 'quer', 'seja', 'isto', 'aquilo']

    palavra = [palavra.lower() for palavra in BRAZIL_PRONOUNS + conectores]
    conjugacoes = np.genfromtxt('conjugações.txt', dtype=str)
    dicionario = np.genfromtxt('palavras.txt', dtype=str)
    
    return np.unique(np.concatenate((palavra, conjugacoes, dicionario)))

def preprocess_numbers(df):
    df['back_word'] = df['back_word'].str.replace('\d+', '', regex=True)
    df = df[df['back_word'].str.contains('[a-zA-Z]', regex=True)]
    return df

def pivot_data(df):
    pivot_df = df.pivot_table(index=['ref', 'location', 'subject'], columns='row_number', values='back_word', aggfunc=lambda x: ' '.join(x)).reset_index()
    pivot_df['back_words'] = pivot_df[[1, 2, 3, 4, 5]].apply(lambda x: ','.join(x.dropna()), axis=1)
    result_df = pivot_df[['ref', 'back_words', 'location', 'subject']].rename(columns={'back_words': 'back_word_1, back_word_2, back_word_3, back_word_4, back_word_5'})
    result_df[['back_word_1', 'back_word_2', 'back_word_3', 'back_word_4', 'back_word_5']] = result_df['back_word_1, back_word_2, back_word_3, back_word_4, back_word_5'].str.split(',', expand=True, n=2)
    result_df = result_df.drop(columns=['back_word_1, back_word_2, back_word_3, back_word_4, back_word_5'])
    
    return result_df

def shift_words(df):
    def shift_words_to_right(row):
        words = [row['back_word_1'], row['back_word_2'], row['back_word_3'], row['back_word_4'], row['back_word_5']]
        filtered_words = [w for w in words if w is not None]
        none_filled = [None] * (3 - len(filtered_words))
        return none_filled + filtered_words

    for index, row in df.iterrows():
        new_words = shift_words_to_right(row)
        df.at[index, 'back_word_1'], df.at[index, 'back_word_2'], df.at[index, 'back_word_3'], df.at[index, 'back_word_4'], df.at[index, 'back_word_5'] = new_words[0], new_words[1], new_words[2], new_words[3], new_words[4]
    return df

def normalize_terms(df, dataset, terms):
    mask = np.isin(dataset, terms)
    filtered_data_set = dataset[mask]
    filtered_data_set = np.insert(filtered_data_set, 0, "0")

    for col in ["back_word_1", "back_word_2", "back_word_3", "back_word_4", "back_word_5"]:
        df[col] = df[col].apply(lambda x: np.where(filtered_data_set == x)[0][0] if x in filtered_data_set else x)
        df[col] = df[col].where(~df[col].isnull(), other="0")

    df = df.astype(str)
    return df

def gen_all_terms(df):
    terms = list(df['back_word'].values)
    return np.array(list(set(terms)))

def get_all_terms_from_dfs(dfs):
    terms = []
    for df in dfs:
        terms.extend(list(df['back_word'].values))
    return np.array(list(set(terms)))

def preprocess_dataframe(df, terms):
    df = preprocess_numbers(df)
    df = pivot_data(df)
    df = shift_words(df)
    df = normalize_terms(df, dataset, terms)
    return df

def process_dataframes(df_contained, df_not_contained):
    terms_1 = gen_all_terms(df_contained)
    df_contained = preprocess_dataframe(df_contained, terms_1)
    df_contained['title'] = 1

    terms_2 = gen_all_terms(df_not_contained)
    df_not_contained = preprocess_dataframe(df_not_contained, terms_2)
    df_not_contained['title'] = 0

    df = pd.concat([df_contained, df_not_contained])

    df["location"] = df["location"].astype(int)
    df["title"] = df["title"].astype(str)

    combined_terms = np.concatenate((terms_1, terms_2))
    terms = np.unique(combined_terms)
    df = normalize_terms(df, dataset, terms)
    return df

dataset = load_brazilian_portuguese_words()
df_contained_in_the_title = get_all_origins(back_words_contained_in_the_title)
df_not_contained_in_the_title = get_all_origins(back_words_not_contained_in_the_title)
df = process_dataframes(df_contained_in_the_title, df_not_contained_in_the_title)

df.info()
df.to_csv("./back_words.csv", index=False)

ValueError: Columns must be same length as key

In [3]:
df = pd.read_csv("./back_words.csv")
df.head()

df["ref"] = df["ref"].astype(str)
df["location"] = df["location"].astype(int)
df["target"] = df["title"].astype(str)

df = df.drop(columns=['title', 'ref', 'subject', 'back_word_1'])

df.info()

df.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69372 entries, 0 to 69371
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   location     69372 non-null  int64 
 1   back_word_2  69372 non-null  object
 2   back_word_3  69347 non-null  object
 3   target       69372 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.1+ MB


Unnamed: 0,location,back_word_2,back_word_3,target
0,243,214,824,1
1,9,0,1791,1
2,11,2222,2628,1
3,64,651,160,1
4,134,b,1,1


In [149]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

le = LabelEncoder()
for col in ['back_word_1', 'back_word_2', 'back_word_3', 'target']:
    df[col] = le.fit_transform(df[col])

# Separar em X (recursos) e y (alvo)
X = df.drop('target', axis=1)
y = df['target']

# Dividir os dados em treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Treinar o modelo Decision Tree
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# Prever no conjunto de teste
y_pred = model.predict(X_test)

# Avaliar o modelo
accuracy = accuracy_score(y_test, y_pred)
print(f'Acurácia: {accuracy}')

Acurácia: 0.9511280905355727


In [150]:
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

def train_and_evaluate_models(X_train, X_test, y_train, y_test):
    # Definir os modelos
    models = {
        'Extra Trees': ExtraTreesClassifier(random_state=42),
        'Random Forest': RandomForestClassifier(random_state=42),
        'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
        'DecisionTree': DecisionTreeClassifier(random_state=42)
    }

    # Treinar cada modelo e avaliar a acurácia
    accuracies = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        accuracies[name] = accuracy
        print(f'{name} Acurácia: {accuracy}')

    return accuracies

train_and_evaluate_models(X_train, X_test, y_train, y_test)

Extra Trees Acurácia: 0.94838895696677
Random Forest Acurácia: 0.9462985655589995
XGBoost Acurácia: 0.9332516398760181


{'Extra Trees': 0.94838895696677,
 'Random Forest': 0.9462985655589995,
 'XGBoost': 0.9332516398760181}