In [1]:
import os
import pandas as pd
from functions import get_all_origins, find_pattern_for_quantity, convert_to_grams, relation_qnt_preco, remove_spaces, clean_text
import re
import numpy as np
import ast

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

back_words_contained_in_the_title = "back_words_contained_in_the_title"
back_words_not_contained_in_the_title = "back_words_not_contained_in_the_title"
back_words_without_title = "back_words_without_title"


In [4]:
import pandas as pd
import numpy as np

def load_brazilian_portuguese_words():
    personal_pronouns = ["Eu", "Tu", "Ele", "Ela", "Nós", "Vós", "Eles", "Elas", "Mim", "Ti", "Si", "Consigo"]
    oblique_pronouns = ["Me", "Te", "Se", "Nos", "Vos", "O", "A", "Lhe", "Os", "As", "Nos", "Vos", "Se", "Convosco", "Lhes", "Contigo"]
    demonstrative_pronouns = ["Este", "Esse", "Aquele", "Esta", "Essa", "Aquela", "Isto", "Isso", "Aquilo", "Estes", "Esses", "Aqueles", "Estas", "Essas", "Aquelas", "Iste"]
    possessive_pronouns = ["Meu", "Teu", "Seu", "Nosso", "Vosso", "Seu", "Minha", "Tua", "Sua", "Nossa", "Vossa", "Sua", "Meus", "Teus", "Seus", "Nossos", "Vossos", "Minhas", "Tuas", "Suas", "Nossas", "Vossas"]
    indefinite_pronouns = ["Alguém", "Ninguém", "Todo", "Algum", "Nenhum", "Outro", "Muito", "Pouco", "Tanto", "Cada", "Algo", "Tudo", "Nada", "Cada um", "Qualquer", "Poucos", "Muitos", "Vários", "Outrem"]
    relative_pronouns = ["Que", "Qual", "Quem", "Onde", "Cujo", "O qual", "Cuja", "Quanto"]
    interrogative_pronouns = ["Quem", "O que", "Qual", "Quanto", "Onde", "Quando", "Como", "Por que", "Qualquer coisa", "Quanto a"]
    prepositions = ["A", "Ante", "Até", "Após", "Com", "Contra", "De", "Desde", "Em", "Entre", "Para", "Por", "Perante", "Sem", "Sob", "Sobre", "Trás", "Conforme", "Contudo", "Durante", "Exceto", "Mediant", "Menos", "Salvo", "Segundo", "Visto"]
    BRAZIL_PRONOUNS = personal_pronouns + oblique_pronouns + demonstrative_pronouns + possessive_pronouns + indefinite_pronouns + relative_pronouns + interrogative_pronouns + prepositions

    conectores = ['e', 'ou', 'nem', 'mas', 'porque', 'como', 'apesar', 'além', 'entretanto', 'porém', 'todavia', 'logo', 'portanto', 'assim', 'contudo', 'embora', 'ainda', 'também', 'quer', 'seja', 'isto', 'aquilo']

    palavra = [palavra.lower() for palavra in BRAZIL_PRONOUNS + conectores]
    conjugacoes = np.genfromtxt('conjugações.txt', dtype=str)
    dicionario = np.genfromtxt('palavras.txt', dtype=str)
    
    return np.unique(np.concatenate((palavra, conjugacoes, dicionario)))

def preprocess_numbers(df):
    df['back_word'] = df['back_word'].str.replace('\d+', '', regex=True)
    df = df[df['back_word'].str.contains('[a-zA-Z]', regex=True)]
    return df

def pivot_data(df):
    pivot_df = df.pivot_table(index=['ref', 'location', 'subject'], columns='row_number', values='back_word', aggfunc=lambda x: ' '.join(x)).reset_index()
    pivot_df['back_words'] = pivot_df[[1, 2, 3]].apply(lambda x: ','.join(x.dropna()), axis=1)
    result_df = pivot_df[['ref', 'back_words', 'location', 'subject']].rename(columns={'back_words': 'back_word_1, back_word_2, back_word_3'})
    result_df[['back_word_1', 'back_word_2', 'back_word_3']] = result_df['back_word_1, back_word_2, back_word_3'].str.split(',', expand=True, n=2)
    result_df = result_df.drop(columns=['back_word_1, back_word_2, back_word_3'])
    
    return result_df

def shift_words(df):
    def shift_words_to_right(row):
        words = [row['back_word_1'], row['back_word_2'], row['back_word_3']]
        filtered_words = [w for w in words if w is not None]
        none_filled = [None] * (3 - len(filtered_words))
        return none_filled + filtered_words

    for index, row in df.iterrows():
        new_words = shift_words_to_right(row)
        df.at[index, 'back_word_1'], df.at[index, 'back_word_2'], df.at[index, 'back_word_3'] = new_words[0], new_words[1], new_words[2]
    return df

def normalize_terms(df, dataset, terms):
    mask = np.isin(dataset, terms)
    filtered_data_set = dataset[mask]
    filtered_data_set = np.insert(filtered_data_set, 0, "0")

    for col in ["back_word_1", "back_word_2", "back_word_3"]:
        df[col] = df[col].apply(lambda x: np.where(filtered_data_set == x)[0][0] if x in filtered_data_set else x)
        df[col] = df[col].where(~df[col].isnull(), other="0")

    df = df.astype(str)
    return df

def gen_all_terms(df):
    terms = list(df['back_word'].values)
    return np.array(list(set(terms)))

def get_all_terms_from_dfs(dfs):
    terms = []
    for df in dfs:
        terms.extend(list(df['back_word'].values))
    return np.array(list(set(terms)))

def preprocess_dataframe(df, terms):
    df = preprocess_numbers(df)
    df = pivot_data(df)
    df = shift_words(df)
    df = normalize_terms(df, dataset, terms)
    return df

def process_dataframes(df_contained, df_not_contained):
    terms_1 = gen_all_terms(df_contained)
    df_contained = preprocess_dataframe(df_contained, terms_1)
    df_contained['title'] = 1

    terms_2 = gen_all_terms(df_not_contained)
    df_not_contained = preprocess_dataframe(df_not_contained, terms_2)
    df_not_contained['title'] = 0

    df = pd.concat([df_contained, df_not_contained])

    df["location"] = df["location"].astype(int)
    df["title"] = df["title"].astype(int)
    df = df.drop(columns=['ref', 'subject'])

    combined_terms = np.concatenate((terms_1, terms_2))
    terms = np.unique(combined_terms)
    df = normalize_terms(df, dataset, terms)
    return df


dataset = load_brazilian_portuguese_words()
df_contained_in_the_title = get_all_origins(back_words_contained_in_the_title)
df_not_contained_in_the_title = get_all_origins(back_words_not_contained_in_the_title)
df = process_dataframes(df_contained_in_the_title, df_not_contained_in_the_title)

df.info()



<class 'pandas.core.frame.DataFrame'>
Index: 69362 entries, 0 to 45758
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   location     69362 non-null  object
 1   back_word_1  69362 non-null  object
 2   back_word_2  69362 non-null  object
 3   back_word_3  69362 non-null  object
 4   title        69362 non-null  object
dtypes: object(5)
memory usage: 3.2+ MB
None


In [5]:
df.head()

row_number,location,back_word_1,back_word_2,back_word_3,title
0,243,1739.0,3010.0,468,1
1,9,,,3263,1
2,11,,4723.0,4024,1
3,64,689.0,5389.0,1883,1
4,134,468.0,1579.0,1883,1
