# Importações

In [None]:
# Environment setup: install Unidecode, upgrade spaCy and download the small
# Portuguese spaCy model (pt_core_news_sm) that the stop-word functions load.
!pip install Unidecode
!pip install -U spacy
!python -m spacy download pt_core_news_sm

2023-09-05 21:35:39.565506: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Collecting pt-core-news-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/pt_core_news_sm-3.6.0/pt_core_news_sm-3.6.0-py3-none-any.whl (13.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m28.1 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('pt_core_news_sm')


In [None]:
import pandas as pd
import re
import ast
import spacy
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from unidecode import unidecode
from wordcloud import WordCloud

# Fetch the NLTK stop-word corpus; the Portuguese list is read when the stop-word list is built.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Definição de funções

### Pré-processamento do léxico

In [None]:
def lexicon_preprocess(lexicon_file, regex):
  '''Load the lexicon and normalise it the same way the submissions are normalised.

  The file is read as latin1 text and split on commas; each comma-separated
  entry is one lexicon term. Every entry is lower-cased, transliterated to
  ASCII with ``unidecode`` (which removes accents), has every match of
  ``regex`` replaced by a space, and is finally split on spaces into a list
  of words.

  Parameters:
    lexicon_file: path of the comma-separated lexicon text file.
    regex: pattern whose matches are replaced by a single space.

  Returns:
    A DataFrame with one column, ``terms``, holding one list of words per
    lexicon entry. Entries with no word left after cleaning (for example the
    line break after a trailing comma) are dropped, because an empty term
    cannot be matched against a text.
  '''

  with open(lexicon_file, "r", encoding='latin1') as file:
    raw_terms = file.read().split(',')

  terms = []
  for raw_term in raw_terms:
    # Same order of operations as the submissions: lower -> unidecode -> regex.
    # Without this, accented or capitalised terms never match the texts.
    normalized = re.sub(regex, ' ', unidecode(raw_term.lower()))
    words = [word for word in normalized.split(' ') if word != '']
    if words:
      terms.append(words)

  return pd.DataFrame({'terms': terms})

### Uniformização dos submissions




In [None]:
def submissions_standardization(submissions_file, title_column, text_column, regex):

    '''Read the submissions CSV and add a tokenised ``full_text`` column.

    The title and body columns are converted to strings and joined with a
    single space. The joined text is lower-cased, transliterated to ASCII
    with ``unidecode``, has every match of ``regex`` replaced by a space and
    is then split on single spaces, so consecutive separators produce empty
    string tokens in the resulting lists.

    Parameters:
        submissions_file: path of the comma-separated submissions file.
        title_column: name of the column holding the post title.
        text_column: name of the column holding the post body.
        regex: pattern whose matches are replaced by a single space.

    Returns:
        The DataFrame read from ``submissions_file`` with ``title_column`` and
        ``text_column`` converted to strings (missing values become ``''``)
        and a new ``full_text`` column holding one list of tokens per row.
    '''

    df_submissions = pd.read_csv(submissions_file, sep = ',')

    # Missing cells are read as NaN; a plain str() would turn them into the
    # literal word "nan" and leak it into the tokens of every body-less post.
    for column in (title_column, text_column):
        df_submissions[column] = df_submissions[column].apply(
            lambda value: '' if pd.isna(value) else str(value))

    joined = df_submissions[[title_column, text_column]].apply(" ".join, axis=1)
    df_submissions["full_text"] = joined.apply(
        lambda text: re.sub(regex, ' ', unidecode(text.lower())).split(' '))

    return df_submissions

### Traduzir gírias da internet para o português formal

In [None]:
def internet_to_portuguese(df, column):
  '''Replace internet abbreviations in tokenised texts with formal words.

  Each cell of ``df[column]`` is expected to be a list of tokens. A token that
  is a key of the slang table is replaced by its expansion; any other token is
  kept unchanged. An expansion made of several words (e.g. ``'pfv'`` ->
  ``'por favor'``) is inserted as several tokens, one per word, so that later
  token-by-token steps see the individual words.

  Parameters:
    df: DataFrame holding the tokenised texts.
    column: name of the column with the token lists.

  Returns:
    A Series aligned with ``df`` containing the translated token lists.
    ``df`` itself is not modified.
  '''
  # Expansions are written without accents, like the keys and like the
  # unidecoded text they are merged into.
  internt_text_dict = {
    'vc': 'voce',
    'pq': 'porque',
    'blz': 'beleza',
    'tbm': 'tambem',
    'td': 'tudo',
    'flw': 'falou',
    'gnt': 'gente',
    'qnd': 'quando',
    'qd': 'quando',
    'vdd': 'verdade',
    'mt': 'muito',
    'ctz': 'certeza',
    'bjo': 'beijo',
    't+': 'ate mais',
    'tb': 'tambem',
    'nt': 'nao',
    'n' : 'nao',
    'pqna': 'pequena',
    'cmg': 'comigo',
    'qdo': 'quando',
    'dps': 'depois',
    'msm': 'mesmo',
    'mta': 'muita',
    'mtos': 'muitos',
    'vlw': 'valeu',
    'dsclp': 'desculpa',
    'nd': 'nada',
    'bj': 'beijo',
    'qse': 'quase',
    'sqn': 'so que nao',
    'tbém': 'tambem',
    # Form that 'tbém' takes once the text has been transliterated to ASCII.
    'tbem': 'tambem',
    'flws': 'falou',
    'eh': 'e',
    'obg': 'obrigado',
    'kk': 'risos',
    'vl': 'vale',
    'fzr': 'fazer',
    'pfv': 'por favor',
    'sq': 'so que',
    'ngm': 'ninguem',
    'sdds': 'saudades',
    'bomdia': 'bom dia',
    'bm': 'bem',
    'tdb': 'tudo bem',
    'eae': 'e ai',
    'qto': 'quanto',
    'to': 'estou',
    'cm': 'com',
    'q': 'que',
    'dsclpa': 'desculpa',
    'aff': 'exasperacao',
    'agr': 'agora',
    'bjss': 'beijos',
    'vamo': 'vamos',
    'mtu': 'muito',
    'axo': 'acho',
    'smp': 'sempre',
    'qqr': 'qualquer',
    'fzd': 'fazendo',
    'tmj': 'tamo junto',
    'qq': 'qualquer',
    'pf': 'por favor',
    'sdd': 'saudade',
    's': 'sim',
    'so': 'so',
    'soq': 'so que',
    'xau': 'tchau'
  }
  # Pre-split every expansion once so multi-word expansions become several tokens.
  expansions = {slang: formal.split(' ') for slang, formal in internt_text_dict.items()}

  def translate(tokens):
    translated = []
    for word in tokens:
      translated.extend(expansions.get(word, [word]))
    return translated

  clean_column = df[column].apply(translate)

  return clean_column

### Contagem de palavras do léxico por submission

In [None]:
def check_for_term(df_lexico, column_lexico, df_submissions, column_submissions):
  '''Count, for every text, how many lexicon terms occur in it.

  A term is a list of words. It occurs in a text (a list of tokens) when its
  words appear as consecutive tokens, in the same order, anywhere in the text.
  Each term adds at most 1 to the count of a text, however many times it
  occurs. Terms with no words never match.

  Parameters:
    df_lexico: DataFrame holding the lexicon.
    column_lexico: name of the column of ``df_lexico`` with the word lists.
    df_submissions: DataFrame holding the tokenised texts.
    column_submissions: name of the column of ``df_submissions`` with the
      token lists.

  Returns:
    A list of integers, one per row of ``df_submissions`` in row order.
  '''
  terms = [list(term) for term in df_lexico[column_lexico]]

  total_list = []
  for text in df_submissions[column_submissions]:
    tokens = list(text)

    # Index every token position once so each term only inspects the places
    # where its first word actually occurs (all of them, not just the first).
    positions = {}
    for index, token in enumerate(tokens):
      positions.setdefault(token, []).append(index)

    qtd_total = 0
    for term in terms:
      if not term:
        continue
      size = len(term)
      # A slice running past the end of the text is shorter than the term and
      # therefore compares unequal, so no explicit bounds check is needed.
      if any(tokens[start:start + size] == term for start in positions.get(term[0], ())):
        qtd_total += 1
    total_list.append(qtd_total)
  return total_list

### Separar df_amostra e df_controle

In [None]:
def split_groups(df, column, n_cutoff):
  '''Split a DataFrame into sample and control groups by a numeric cutoff.

  Parameters:
    df: DataFrame to split.
    column: name of the column compared against the cutoff.
    n_cutoff: rows with ``df[column] > n_cutoff`` go to the sample group, rows
      with ``df[column] <= n_cutoff`` go to the control group. Rows for which
      neither comparison is true (e.g. missing values) are in neither group.

  Returns:
    A tuple ``(df_sample, df_control)``. Both are new DataFrames with a fresh
    0..n-1 index.
  '''
  # reset_index without inplace returns a new frame instead of mutating the
  # boolean-indexed slice, so adding columns to the result later does not
  # raise pandas' SettingWithCopyWarning.
  df_sample = df.loc[df[column] > n_cutoff].reset_index(drop=True)
  df_control = df.loc[df[column] <= n_cutoff].reset_index(drop=True)

  return df_sample, df_control

### Stop Words

#### Create

In [None]:
def create_stopwords(df, column, contextual_stopwords):
  '''Build the stop-word list used to clean the texts.

  The list is the union of three sources, in this order:
    1. NLTK's Portuguese stop words;
    2. every token of the texts in ``df[column]`` that the spaCy model
       ``pt_core_news_sm`` does not tag as NOUN or ADJ (each word is tagged
       on its own, outside of its sentence);
    3. the caller-supplied ``contextual_stopwords``.

  Parameters:
    df: DataFrame holding the tokenised texts.
    column: name of the column with the token lists.
    contextual_stopwords: iterable of extra stop words.

  Returns:
    A list of stop words without repetitions, in first-seen order.
  '''
  spacy_pt = spacy.load("pt_core_news_sm")

  words_by_pos = []
  seen_words = set()
  for text_list in df[column]:
    for word in text_list:
      # The tag of an isolated word does not depend on where it appears, so
      # run the pipeline only once per distinct word.
      if word in seen_words:
        continue
      seen_words.add(word)
      for token in spacy_pt(word):
        if token.pos_ not in ('NOUN', 'ADJ'):
          words_by_pos.append(token.text)

  # Reach the corpus through the nltk package: the notebook later rebinds the
  # bare name `stopwords` to the list returned here, which would break a
  # second call that relied on that name.
  stop_words = list(nltk.corpus.stopwords.words('portuguese'))
  stop_words.extend(words_by_pos)
  stop_words.extend(contextual_stopwords)
  return list(dict.fromkeys(stop_words))

#### Remove

In [None]:
def remove_stopwords(df, column, stop_words):
  '''Drop stop words from tokenised texts and lemmatise what remains.

  Every word of every token list in ``df[column]`` is passed on its own
  through the spaCy model ``pt_core_news_sm``. Each resulting token whose text
  is not in ``stop_words`` contributes its lemma to the output.

  Parameters:
    df: DataFrame holding the tokenised texts.
    column: name of the column with the token lists.
    stop_words: iterable of words to discard.

  Returns:
    A list with one list of lemmas per row of ``df``, in row order.
  '''
  spacy_pt = spacy.load("pt_core_news_sm")
  # Constant-time membership tests; the incoming list can be very long.
  stop_set = set(stop_words)
  # Words are processed in isolation, so the lemmas of a word are the same
  # wherever it appears and can be computed once.
  lemmas_by_word = {}

  clean_text = []
  for text_list in df[column]:
    line_list = []
    for word in text_list:
      if word not in lemmas_by_word:
        lemmas_by_word[word] = [token.lemma_ for token in spacy_pt(word)
                                if token.text not in stop_set]
      line_list.extend(lemmas_by_word[word])
    clean_text.append(line_list)

  return clean_text

# Definição de constantes

In [None]:
# Mount Google Drive in the Colab runtime; the input and output paths used
# by this notebook live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Raw inputs on the mounted Drive: the submissions CSV and the lexicon text file.
submissions_file = "/content/drive/MyDrive/tcc/bases_de_dados/bases_cruas/submissions_total_2023_04_06.csv"
lexicon_file = "/content/drive/MyDrive/tcc/bases_de_dados/bases_cruas/lexico_traduzido.txt"
# Every match of this pattern is replaced by a space by the preprocessing functions.
# The first alternative matches any character that is not an ASCII letter or
# digit, a character in the range À-ÿ, or a space.
# NOTE(review): inside the character class, `\x200b` is parsed by `re` as `\x20`
# (a space) followed by the literal characters `0` and `b`, not as the
# zero-width space U+200B -- confirm that keeping spaces is the intent.
# NOTE(review): the `_+` and `\n` alternatives can never fire, since the first
# alternative already matches `_` and line breaks. The literal `x200b`
# alternative presumably strips what is left of the `&#x200B;` entity -- confirm.
regex = r'[^a-zA-Z0-9À-ÿ\x200b]|_+|\n|x200b'

# Utilização de funções

In [None]:
# Lexicon: load the lexicon file and turn each term into a list of words.
clean_lexicon = lexicon_preprocess(lexicon_file, regex)

In [None]:
# Text standardization: merge 'title' and 'selftext' into the tokenised 'full_text' column.
df_submissions = submissions_standardization(submissions_file, 'title', 'selftext', regex)

In [None]:
# Translate internet slang; the translated tokens overwrite the 'full_text' column.
translate_column = internet_to_portuguese(df_submissions, 'full_text')
df_submissions['full_text'] = translate_column

In [None]:
# Count the lexicon terms found in each text and store the counts in 'qtd_term'.
qtd_list = check_for_term(clean_lexicon, 'terms', df_submissions, 'full_text')
df_submissions['qtd_term'] = qtd_list

In [None]:
# Split sample from control group: more than 5 lexicon terms -> sample, 5 or fewer -> control.
df_sample, df_control = split_groups(df_submissions, 'qtd_term', 5)

In [None]:
# Build the stop-word list from the sample texts plus the hand-picked words below.
contextual_stopwords = ['pra', 'nao', 'ate', 'ja', 'etc', 'porem', 'ta', 'estao', 'pro', 'alguma', 'vao', 'voce', 'apos', 'ne', 'muita', 'mim', 'dela',
                 'literalmente', 'gabriel', 'entao', 'que', 'la', 'vez', 'coisa', 'carar', 'ai', 'tipo', 'mesmo', 'ha', 'atra', 'voce', 'fico', 'parte',
                  'ideiar', 'sao', 'mesmo', 'atra', 'faco', 'vejo',  'claro', 'ideiar']
# NOTE(review): this assignment rebinds `stopwords`, the name imported from
# nltk.corpus in the imports cell; any code that later reaches the NLTK corpus
# through that name will find this list instead.
stopwords = create_stopwords (df_sample, 'full_text', contextual_stopwords)

In [None]:
# Remove stop words from the sample texts; the remaining lemmas go into 'clean_text'.
df_sample['clean_text'] = remove_stopwords(df_sample, 'full_text', stopwords)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sample['clean_text'] = remove_stopwords(df_sample, 'full_text', stopwords)


# Salvar em csv

In [None]:
# Write the preprocessed sample group to the mounted Drive as a UTF-8 CSV without the index.
df_sample.to_csv('/content/drive/MyDrive/tcc/bases_de_dados/bases_tratadas/base_preprocessada.csv', sep = ',', encoding = 'utf-8', index = False)