# Projeto Eleições 2022

## Índice

## Configurando o Ambiente


### Dependências

In [1]:
# Dependências 
try:
    from google.colab import drive
except:
    pass
from os import path, listdir, getcwd, mkdir
from os.path import isfile, join
import io
import copy
from collections import namedtuple
import json
import time

### Variáveis Globais

In [2]:
# Module-level state. Everything starts as None; init_environment() assigns
# _env_mode, _workspace, _env and _nome (in that order).
_nome = None  # value of the "nome" environment variable
_workspace = None  # workspace root path returned by init_workspace()
_env_mode = None  # mode string given to init_environment()
_env = None  # Environment instance for "Eleicoes_2022_Pesquisa"
_query = None  # not assigned anywhere in the visible part of the file

### Funções

In [3]:
def build_environment(workspace):
    """Create the project's directory tree under *workspace*.

    The call is idempotent: directories that already exist are left alone.
    Missing parents (including *workspace* itself) are created as needed.

    Args:
        workspace: Root directory, as a string path without trailing slash.
    """
    # Local import: the file-level import list does not bring in makedirs.
    from os import makedirs

    # Only the leaves are listed; makedirs creates every intermediate level.
    leaf_directories = (
        'configuracao',
        'Eleicoes_2022_Pesquisa/global',
        'Eleicoes_2022_Pesquisa/coleta/hashtags/configuracao',
        'Eleicoes_2022_Pesquisa/coleta/ego',
    )

    for leaf in leaf_directories:
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        makedirs(workspace + '/' + leaf, exist_ok=True)

In [4]:
def init_workspace():
    """Resolve the workspace root for the current ``_env_mode`` and prepare it.

    Supported modes:
      * ``"colab"``: mounts Google Drive at ``/content/drive`` and returns the
        "Colab Notebooks" folder inside that mount. No directories are created
        in this mode.
      * ``"root"``: uses ``/content``, creating it when missing.
      * ``"relative"``: uses the current working directory (``"."``).

    For "root" and "relative" the project directory tree is created through
    build_environment().

    Returns:
        The workspace root path as a string.

    Raises:
        Exception: If Drive cannot be mounted or ``_env_mode`` is not one of
            the supported modes.
    """
    if _env_mode == "colab":
        try:
            # Required to read from / write to the shared Drive folder.
            drive.mount('/content/drive')
        except Exception as err:
            print("Não foi possível montar o Drive.")
            raise Exception("Não foi possível montar o Drive.") from err
        # The mount point is /content/drive, so "My Drive" lives below it.
        # The previous return value ('/drive/MyDrive/...') pointed outside
        # the mount.
        return '/content/drive/MyDrive/Colab Notebooks'

    if _env_mode == "root":
        if not path.exists('/content'):
            mkdir('/content')
        workspace = '/content'
    elif _env_mode == "relative":
        workspace = "."
    else:
        print("Diretórios não criados")
        raise Exception("Diretórios não criados")

    build_environment(workspace)

    return workspace


In [5]:
# Bruno
# Wrapper for loading files stored under the workspace (local or Google Drive).
def try_loadfrom_drive(loading_func):
  """Wrap *loading_func* so that it loads paths relative to ``_workspace``.

  The returned callable has the signature
  ``wrapped(path, *args, error_ignore=False, **kwargs)``:

  * ``path`` is made absolute-looking (a leading ``/`` is added when missing)
    and appended to ``_workspace`` before being handed to *loading_func*.
  * ``error_ignore=True`` makes a failed load return ``None`` instead of
    raising, in every environment mode.
  * In "colab" mode, a failed load without ``error_ignore`` asks the user to
    upload the file and loads it from the uploaded bytes.
  * Outside "colab" mode, a failed load without ``error_ignore`` propagates
    the original exception.

  Args:
    loading_func: Callable whose first argument is a path or file-like object
      (e.g. ``open``).

  Returns:
    The wrapping callable described above.
  """
  from functools import wraps

  @wraps(loading_func)
  def wrapped(path, *args, **kwargs):
    error_ignore = kwargs.pop('error_ignore', False)

    # startswith() also copes with an empty string, unlike path[0].
    if not path.startswith('/'):
      path = '/' + path

    try:
      return loading_func(_workspace + path, *args, **kwargs)
    except Exception:
      if error_ignore:
        return None
      if _env_mode != "colab":
        raise

    # Colab only: fall back to a manual upload of the missing file.
    print("Recomendo criar um atalho no seu drive da pasta shareada")
    from google.colab import files  # only importable inside Colab
    uploaded = files.upload()
    upload_path = list(uploaded.keys())[0]
    # The uploaded dict is keyed by the uploaded file name, not by `path`.
    return loading_func(io.BytesIO(uploaded[upload_path]), *args, **kwargs)
  return wrapped

In [6]:
# Class that manages variables persisted outside the notebook.
# Each user keeps their variables in their own workspace (Google Drive or local).
# A user is identified by the variable "nome".
# The user's variables are saved to / loaded from a file named after the
# selected "env_name".
# Every variable that is not private is also copied to a per-user global file.
# Those copies can be reached through a reference to the owner's name
# (e.g. "$Bruno") or through get_globals().
# The variable files are JSON / plain text.
class Environment():
  """Persistent key/value store backed by JSON files under ``_workspace``.

  ``self.vars`` maps a variable name to ``[value, is_private]``.
  ``self.path`` is the environment name; it names the per-user file
  ``/configuracao/<env_name>.json`` and the folder ``/<env_name>/global``.
  """

  def __init__(self, env_name):
    """Load ``/configuracao/<env_name>.json`` if it exists, else start empty."""
    self.path = env_name
    try:
      env_file = try_loadfrom_drive(open)('/configuracao/'+env_name+'.json', 'r', error_ignore = True)
    except FileNotFoundError:
      # First run: no variables file has been written yet.
      env_file = None

    if env_file:
      with env_file:
        self.vars = json.load(env_file)
    else:
      self.vars = {}

  def update(self, g_id):
    """Write the variables to disk.

    The full set goes to ``/configuracao/<env_name>.json``. The non-private
    subset overwrites ``/<env_name>/global/<g_id>.json``, so variables that
    were deleted or turned private disappear from the global copy.

    Args:
      g_id: Owner name used as the global file name (without extension).
    """
    from os import makedirs  # local import: not in the file-level import list

    config_dir = _workspace + '/configuracao'
    makedirs(config_dir, exist_ok=True)
    with open(config_dir + '/' + self.path + '.json', 'w') as env_file:
      json.dump(self.vars, env_file)

    global_vars = {name: entry for name, entry in self.vars.items() if not entry[1]}

    global_dir = _workspace + '/' + self.path + '/global'
    makedirs(global_dir, exist_ok=True)
    with open(global_dir + '/' + g_id + '.json', 'w') as global_env_file:
      json.dump(global_vars, global_env_file)

  def set(self, var, value, is_private=None):
    """Store *value* under *var* and persist the change.

    Args:
      var: Variable name.
      value: JSON-serializable value.
      is_private: True/False to set the privacy flag explicitly. None keeps
        the flag of an existing variable, or uses False for a new one.
    """
    # `is None` (not falsiness) so an explicit False can make a variable public.
    if is_private is None:
      existing = self.vars.get(var)
      is_private = existing[1] if existing else False

    self.vars.update({var:[value, is_private]})
    self.update(value if var == 'nome' else _nome)

  def get(self, var, generate_if_missing = True):
    """Return the value of *var*.

    When the variable does not exist and *generate_if_missing* is true, the
    user is prompted for an initial value and privacy flag, and the variable
    is created. When the value is a reference string (e.g. ``"$Bruno"``), the
    value is looked up in the referenced user's global file; if that file or
    the variable is not available, the raw reference is returned.

    Args:
      var: Variable name.
      generate_if_missing: Prompt for a value instead of returning None.

    Returns:
      The stored (or resolved) value, or None when the variable is missing
      and *generate_if_missing* is false.
    """
    try:
      value = self.vars[var][0]
    except KeyError:
      if not generate_if_missing:
        return None
      value = input(f'Digite o valor inicial da variável de ambiente "{var}": ')
      private = input('Variavel privada(True/False)?: ')
      private = private == "True" or private == "true"
      self.set(var, value, private)

    # Only strings can be references; dicts, numbers and "" are returned as is.
    if isinstance(value, str) and value.startswith('$'):
      try:
        linked_file = try_loadfrom_drive(open)('/'+self.path+'/global/'+value[1:]+'.json', 'r', error_ignore = True)
      except FileNotFoundError:
        linked_file = None

      if linked_file:
        with linked_file:
          linked_vars = json.load(linked_file)
        try:
          value = linked_vars[var][0]
        except KeyError:
          # The referenced user has no public variable with this name.
          pass

    return value

  def delete(self, var):
    """Remove *var* (if present) and persist the change."""
    self.vars.pop(var, None)
    self.update(_nome)

  def get_globals(self, var):
    """Return the public values of *var* declared by every other user.

    The current user's own global file (``<_nome>.json``) is skipped by exact
    file name. Private variables are never present in the global files, so
    they are not returned.

    Args:
      var: Variable name.

    Returns:
      List of values, one per global file that defines *var*.
    """
    g_path = _workspace+'/'+self.path+'/global'
    own_file = f'{_nome}.json'

    values = []
    for global_env in listdir(g_path):
      if global_env == own_file or not isfile(join(g_path, global_env)):
        continue

      try:
        global_env_file = try_loadfrom_drive(open)('/'+self.path+'/global/'+global_env, 'r', error_ignore = True)
      except FileNotFoundError:
        global_env_file = None
      if not global_env_file:
        continue

      with global_env_file:
        global_env_vars = json.load(global_env_file)

      try:
        values.append(global_env_vars[var][0])
      except KeyError:
        pass

    return values

In [7]:
def init_environment(env_mode: str):
    """Initialise the module state for the given environment mode.

    Stores *env_mode* in ``_env_mode``, resolves ``_workspace`` through
    init_workspace(), creates the "Eleicoes_2022_Pesquisa" Environment and
    reads the "nome" variable from it into ``_nome``.

    Raises:
        AttributeError: If *env_mode* is empty or None.
    """
    global _nome, _workspace, _env_mode, _env

    _env_mode = env_mode
    if _env_mode:
        _workspace = init_workspace()
        _env = Environment("Eleicoes_2022_Pesquisa")
        _nome = _env.get("nome")
        return

    raise AttributeError

## Configurando Cliente

### Dependências

In [8]:
import tweepy
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from http.client import RemoteDisconnected, HTTPException
from time import sleep
from urllib3 import exceptions

### Variáveis Globais

In [9]:
# When truthy, _init_client() builds a tokenFarmClient from the other users'
# public "chave" values; otherwise a plain tweepy.Client with the own "chave".
_token_farm = None
# Maps a thread name to its API client; init_client() fills it in.
_client = {
    "coleta": None,
    "superior": None
}
# Request field lists / CSV dtypes; the actual values are assigned further down.
_user_fields = None
_tweet_fields = None
_media_fields = None
_expansions = None
_dtype = None

### Funções

In [10]:
class tokenFarmClient(tweepy.Client):
  """tweepy.Client that rotates through a pool of bearer tokens.

  When a request fails with TooManyRequests or Unauthorized, the other tokens
  are tried one after another without waiting. Only when every token fails is
  the request repeated with the caller's ``wait_on_rate_limit`` setting.
  """

  def __init__(
        self, tokens, consumer_key=None, consumer_secret=None,
        access_token=None, access_token_secret=None, *, return_type=tweepy.Response,
        wait_on_rate_limit=False
    ):
    """Create the client using ``tokens[0]`` as the initial bearer token.

    Args:
      tokens: Sequence of bearer tokens to rotate through.
      The remaining arguments are forwarded unchanged to tweepy.Client.
    """
    self.tokens = tokens
    self.current_token = 0
    super().__init__(bearer_token=tokens[0], consumer_key=consumer_key,
                     consumer_secret=consumer_secret, access_token=access_token,
                     access_token_secret=access_token_secret, return_type=return_type,
                     wait_on_rate_limit=wait_on_rate_limit)

  def request(
          self, method, route, params=None, json=None, user_auth=False
      ):
    """Perform the request, switching tokens on rate-limit/auth failures.

    Returns:
      Whatever tweepy.Client.request returns for the first token that works.

    Raises:
      Any exception raised by tweepy.Client.request other than
      TooManyRequests/Unauthorized during rotation, or any exception raised
      by the final attempt.
    """
    retryable = (tweepy.TooManyRequests, tweepy.Unauthorized)
    configured_wait = self.wait_on_rate_limit

    # While probing tokens the client must fail fast instead of sleeping.
    self.wait_on_rate_limit = False
    try:
      try:
        return super().request(method, route, params, json, user_auth)
      except retryable:
        pass

      total = len(self.tokens)
      # Start at offset 1: the current token has just failed.
      for offset in range(1, total):
        candidate = (self.current_token + offset) % total
        self.bearer_token = self.tokens[candidate]
        try:
          response = super().request(method, route, params, json, user_auth)
        except retryable:
          continue
        self.current_token = candidate
        return response
    finally:
      # Restored even when an unexpected exception escapes the rotation.
      self.wait_on_rate_limit = configured_wait

    # Every token failed: move to the next one and retry with the caller's
    # original wait_on_rate_limit setting.
    self.current_token = (self.current_token + 1) % len(self.tokens)
    self.bearer_token = self.tokens[self.current_token]
    return super().request(method, route, params, json, user_auth)


In [11]:
def _init_client():
    """Build one API client according to ``_token_farm``.

    Returns a tokenFarmClient fed with the other users' public "chave" values
    when ``_token_farm`` is truthy, otherwise a tweepy.Client using this
    user's own "chave". Rate-limit waiting is disabled in both cases.
    """
    if not _token_farm:
        own_key = _env.get("chave")
        return tweepy.Client(own_key, wait_on_rate_limit= False)

    shared_keys = _env.get_globals("chave")
    return tokenFarmClient(shared_keys, wait_on_rate_limit = False)
    

In [12]:
def init_client(thread = "all"):
    """(Re)create the API client(s) stored in ``_client``.

    Args:
        thread: ``"all"`` rebuilds the clients of every known thread
            ("coleta", "superior", "quotes", "replies", "likes", "retweets");
            any other value rebuilds only the client stored under that key.
    """
    if thread == "all":
        thread_names = ("coleta", "superior", "quotes", "replies", "likes", "retweets")
        # Build every client first so a failure leaves _client untouched.
        _client.update({name: _init_client() for name in thread_names})
    else:
        # Must call the factory: init_client() itself returns None and would
        # rebuild every client instead of just this one.
        _client[thread] = _init_client()

### Execução


In [13]:
# Tweet fields requested from the API; passed as `tweet_fields` to
# search_all_tweets() and get_tweets().
_tweet_fields = [
    'created_at',
    'public_metrics',
    'text',
    'id',
    'conversation_id',
    'entities',
    'referenced_tweets',
    'author_id',
    'lang',
    'source',
    'in_reply_to_user_id',
    'attachments'
]


In [14]:
# User fields requested from the API; passed as `user_fields` to
# search_all_tweets() and get_tweets(), and read back in get_user_dict().
_user_fields = [
    'id',
    'username',
    'created_at',
    'location',
    'protected',
    'public_metrics',
    'verified',
    'profile_image_url'
]


In [15]:
# Media fields requested from the API; passed as `media_fields` to
# search_all_tweets() and get_tweets(), and read back in get_media_dict().
_media_fields = [
    'type',
    'url',
    'public_metrics',
    'preview_image_url'
]


In [16]:
# Expansions requested from the API; process_response() reads the resulting
# "users" and "media" entries of response.includes.
_expansions = [
    'author_id',
    'attachments.media_keys'
]

In [17]:
# Column dtypes handed to pd.read_csv() when the collected *_tweets.csv files
# are read back. Identifier columns are kept as str.
_dtype = {
    'id': str,
    'text': str,
    'created_at': str,
    'source': str,
    'lang': str,
    'conversation_id': str,
    'like_count': int,
    'retweet_count': int,
    'quote_count': int,
    'reply_count': int,
    'type': str,
    'referenced_tweet_id': str,
    'mentions': str,
    'hashtags': str,
    # NOTE(review): 'hastags' looks like a misspelling of 'hashtags'; confirm
    # that no existing CSV uses this header before removing the entry.
    'hastags': str,
    'urls': str,
    'author_id': str,
    'media_keys': str
}

## Coleta Hashtags

### Dependências

In [18]:
from datetime import datetime, timedelta
from math import nan
from threading import Thread, Lock



### Variáveis Globais

In [19]:
# Per-thread in-memory buffers. Each maps a thread name to the DataFrame that
# accumulates rows between two saves (created by __reset_data_frames()).
_tweets_data = {
    'coleta': None,
    'superior': None,
    'quotes': None,
    'replies': None,
    'likes': None,
    'retweets': None
}

_users_data = {
    'coleta': None,
    'superior': None,
    'quotes': None,
    'replies': None,
    'likes': None,
    'retweets': None
}

_media_data = {
    'coleta': None,
    'superior': None,
    'quotes': None,
    'replies': None,
    'likes': None,
    'retweets': None
}

# Compared against elapsed seconds in collect_day_tweets_from_date(); it has to
# be assigned a number before collecting (comparing with None raises TypeError).
_backup_interval = None
# Locks guarding _env.set() calls and each of the three buffer dictionaries.
_env_mutex = Lock()
_tweets_mutex = Lock()
_users_mutex = Lock()
_media_mutex = Lock()

### Funções

In [20]:
def __simplify_query(query: str) -> str:
    """Shorten *query* into a fragment used inside CSV file names.

    Opening parentheses are dropped, the result is cut to 30 characters, and
    then double quotes are removed, ':' becomes '_', 'ç' becomes 'c' and 'ã'
    becomes 'a'.
    """
    head = query.replace('(', '')[:30]
    # Single-character substitutions are independent, so one translate() pass
    # gives the same result as the chained replace() calls.
    substitutions = str.maketrans({'"': None, ':': '_', 'ç': 'c', 'ã': 'a'})
    return head.translate(substitutions)

In [21]:
def __reset_data_frames(thread: str):
    """Replace the three buffers of *thread* with empty DataFrames.

    Each buffer is swapped while holding its own lock. The locks are taken
    with ``with`` so they are released even if the thread is interrupted.

    Args:
        thread: Key of the buffers in _tweets_data/_users_data/_media_data.
    """
    tweet_columns = ['id','text','created_at','source','lang','conversation_id','like_count','retweet_count','quote_count','reply_count','type','referenced_tweet_id','mentions','hashtags','urls','author_id','media_keys']
    user_columns = ['account_id','account_username','account_created_at','account_verified','account_protected','account_location','account_have_profile_image','account_followers_count','account_following_count','account_tweets_count']
    media_columns = ['media_key','media_type','media_url','media_view_count']

    with _tweets_mutex:
        _tweets_data[thread] = pd.DataFrame(columns= tweet_columns)

    with _users_mutex:
        _users_data[thread] = pd.DataFrame(columns= user_columns)

    with _media_mutex:
        _media_data[thread] = pd.DataFrame(columns= media_columns)

In [22]:
def __save_data_frames(csv_path: str, thread: str, *, sep=';', escapechar='\\', index = False, header= False, mode = 'a'):
    """Write the three buffers of *thread* to ``<csv_path>_{tweets,users,media}.csv``.

    Each buffer is written while holding its lock; ``with`` guarantees the
    lock is released when to_csv() raises (e.g. on an I/O error), so other
    threads are not blocked forever. After saving, every API client is
    recreated through init_client().

    Args:
        csv_path: Path prefix of the three CSV files.
        thread: Key of the buffers to save.
        sep, escapechar, index, header, mode: Forwarded to DataFrame.to_csv().
            The defaults append rows without a header line.
    """
    csv_options = dict(sep= sep, escapechar= escapechar, index= index, header= header, mode= mode)

    with _tweets_mutex:
        _tweets_data[thread].to_csv(csv_path + '_tweets.csv', **csv_options)

    with _users_mutex:
        _users_data[thread].to_csv(csv_path + '_users.csv', **csv_options)

    with _media_mutex:
        _media_data[thread].to_csv(csv_path + '_media.csv', **csv_options)

    init_client()


In [23]:
def init_variables_coleta(query: str, query_name: str, date: datetime, thread: str):
    """Prepare (or resume) the collection of *query_name* for *date*.

    The collection state is stored in the environment under the key
    ``query_name + date as ddmmYYYY``. The output directory
    ``<workspace>/Eleicoes_2022_Pesquisa/coleta/hashtags/<query_name>/YYYY/mm/dd/``
    is created when missing.

    Args:
        query: Search query text; a simplified form becomes part of the CSV name.
        query_name: Name used for the directory and for the state key.
        date: Day being collected.
        thread: Key of the buffers used by the calling thread.

    Returns:
        ``False`` when the stored state says the collection already finished
        (``next_page_token == 'fim'``); otherwise the list
        ``[state_key, status_info]``. For a new collection the CSV files are
        created with their header lines and the state is saved as private.
    """
    # Local import: the file-level import list does not bring in makedirs.
    from os import makedirs

    coleta_em_andamento = query_name + date.strftime("%d%m%Y")

    status_info = _env.get(coleta_em_andamento, generate_if_missing = False)

    coleta_day = (
        _workspace + '/Eleicoes_2022_Pesquisa/coleta/hashtags/' + query_name + '/'
        + date.strftime("%Y") + '/' + date.strftime("%m") + '/' + date.strftime("%d") + '/'
    )
    # Several threads may prepare the same day concurrently; exist_ok avoids
    # the exists()/mkdir() race.
    makedirs(coleta_day, exist_ok=True)

    if status_info:
        if status_info['next_page_token'] == 'fim':
            return False
        # Resuming: start with empty buffers and keep the stored page token.
        __reset_data_frames(thread)
        return [coleta_em_andamento, status_info]

    coleta_data_id = datetime.now().strftime("%H%M%S%d%m%Y")

    status_info = {
        "csv_path": coleta_day + _nome + "_" + date.strftime("%H%M%S%d%m%Y") + "_" + __simplify_query(query) + "_" + coleta_data_id,
        "next_page_token": None,
    }

    with _env_mutex:
        _env.set(coleta_em_andamento, status_info, True)

    __reset_data_frames(thread)
    # Writes the header line of each CSV (the buffers are empty here).
    __save_data_frames(status_info['csv_path'], thread, header= True)

    print(f"thread {(thread.upper() + ':' + query_name):<24}{'START '+ str(date)}")

    return [coleta_em_andamento, status_info]

In [24]:
# Backs up the buffered data and saves the collection state.
# input: state key, state dictionary and the thread's buffer key
# output: nothing
def backup_state(coleta_em_andamento: str, status_info: dict, thread: str):
    """Flush the buffers of *thread* to CSV and persist *status_info*.

    The buffers are appended to the CSV files at ``status_info['csv_path']``,
    emptied, and then the state is stored (as a private variable) under
    *coleta_em_andamento*.
    """
    print(f"thread {(thread.upper()+':'):<12}{'BACKUP INICIADO'}")

    # Save to CSV, then start over with empty buffers.
    __save_data_frames(status_info['csv_path'], thread)
    __reset_data_frames(thread)

    # `with` releases the lock even when writing the environment file fails.
    with _env_mutex:
        _env.set(coleta_em_andamento, status_info, True)

    print(f"thread {(thread.upper()+':'):<12}{'BACKUP FINALIZADO'}")

In [25]:
# Selects the data of a tweet object and puts it into a dictionary.
# input: tweet object
# output: tweet dictionary
def _join_values(values):
    """Render *values* as one ', '-separated string (the CSV cell format)."""
    return ", ".join(str(value) for value in values)


def _entity_field(entities, key, extract):
    """Join ``extract(item)`` for every item of ``entities[key]``.

    Returns nan when the key (or a key read by *extract*) is missing.
    """
    try:
        return _join_values(extract(item) for item in entities[key])
    except KeyError:
        return nan


def get_tweet_dict(tweet: tweepy.Tweet):
    """Flatten *tweet* into the row format of the ``*_tweets.csv`` files.

    List-like information (referenced tweets, mentions, hashtags, urls and
    media keys) is stored as a ', '-separated string; absent information is
    stored as nan. The keys "mentions", "hashtags" and "urls" are always
    present, also when the tweet has no entities at all.

    Args:
        tweet: Object exposing the attributes requested in ``_tweet_fields``.

    Returns:
        Dictionary with one entry per CSV column.
    """
    metrics = tweet.public_metrics

    tweet_dict = {
        "id": tweet.id,
        "text": tweet.text,
        # Drops the trailing UTC offset ("+00:00") of the timestamp text.
        "created_at": str(tweet.created_at)[:-6],
        "source": tweet.source,
        "lang": tweet.lang,
        "conversation_id": tweet.conversation_id,
        "like_count": metrics["like_count"],
        "retweet_count": metrics["retweet_count"],
        "quote_count": metrics["quote_count"],        # quotes are retweets with a comment
        "reply_count": metrics["reply_count"],        # replies are answers
    }

    referenced = tweet.referenced_tweets
    if referenced is not None:
        tweet_dict["type"] = _join_values(ref.type for ref in referenced)
        tweet_dict["referenced_tweet_id"] = _join_values(int(ref.id) for ref in referenced)
    else:
        tweet_dict["type"] = "tweeted"
        tweet_dict["referenced_tweet_id"] = nan

    entities = tweet.entities
    if entities is not None:
        tweet_dict["mentions"] = _entity_field(entities, "mentions", lambda mention: mention["id"])
        tweet_dict["hashtags"] = _entity_field(entities, "hashtags", lambda hashtag: '#' + hashtag["tag"])
        tweet_dict["urls"] = _entity_field(entities, "urls", lambda url: url["url"])
    else:
        # Keep the row shape stable for tweets without any entity.
        tweet_dict["mentions"] = nan
        tweet_dict["hashtags"] = nan
        tweet_dict["urls"] = nan

    # Author data
    tweet_dict["author_id"] = tweet.author_id

    # Media data: attachments may be None (TypeError) or lack 'media_keys'.
    try:
        tweet_dict["media_keys"] = _join_values(tweet.attachments['media_keys'])
    except (TypeError, KeyError):
        tweet_dict["media_keys"] = nan

    return tweet_dict


In [26]:
# Selects the data of a user object and puts it into a dictionary.
# input: user object
# output: user dictionary
def get_user_dict(user: tweepy.User):
    """Flatten *user* into the row format of the ``*_users.csv`` files.

    An empty location is stored as nan, and the profile-image flag is True
    whenever ``profile_image_url`` differs from the empty string.
    """
    metrics = user.public_metrics
    location = user.location
    if location == '':
        location = nan

    row = {}
    row['account_id'] = user.id
    row['account_username'] = user.username
    # Drops the trailing UTC offset of the timestamp text.
    row['account_created_at'] = str(user.created_at)[:-6]
    row['account_verified'] = user.verified
    row['account_protected'] = user.protected
    row['account_location'] = location
    row['account_have_profile_image'] = bool(user.profile_image_url != '')
    row['account_followers_count'] = metrics['followers_count']
    row['account_following_count'] = metrics['following_count']
    row['account_tweets_count'] = metrics['tweet_count']  # tweets + retweets of the account
    return row


In [27]:
# Selects the data of a media object and puts it into a dictionary.
# input: media object
# output: media dictionary
def get_media_dict(media: tweepy.Media):
    """Flatten *media* into the row format of the ``*_media.csv`` files.

    Photos store their own url and no view count. Every other media type
    stores the preview image url and the "view_count" public metric, or nan
    when the metrics are not subscriptable (e.g. None).
    """
    if media.type == "photo":
        media_url = media.url
        view_count = nan
    else:
        media_url = media.preview_image_url
        try:
            view_count = media.public_metrics["view_count"]
        except TypeError:
            view_count = nan

    return {
        "media_key": media.media_key,
        "media_type": media.type,
        "media_url": media_url,
        "media_view_count": view_count,
    }

    

In [28]:
def process_response(response, thread: str):
    """Append the tweets, users and media of *response* to the buffers of *thread*.

    Args:
        response: API response whose ``data`` is iterable and whose
            ``includes`` holds the expanded "users" and, optionally, "media".
        thread: Key of the buffers to extend.

    Raises:
        KeyError: If ``response.includes`` has no "users" entry.
    """
    # Collect the row dictionaries.
    tweet_dict_list = [get_tweet_dict(tweet) for tweet in response.data]
    users_dict_list = [get_user_dict(user) for user in response.includes["users"]]

    media_dict_list = []
    try:
        for media in response.includes["media"]:
            media_dict_list.append(get_media_dict(media))
    except KeyError:
        # No media in this response (a KeyError raised while converting a
        # media item also ends the loop, keeping the rows gathered so far).
        pass

    # Concatenate the new rows with the ones collected before. `with` keeps
    # the locks from staying held if concat raises.
    with _tweets_mutex:
        _tweets_data[thread] = pd.concat([_tweets_data[thread], pd.DataFrame(tweet_dict_list)], ignore_index = True)

    with _users_mutex:
        _users_data[thread] = pd.concat([_users_data[thread], pd.DataFrame(users_dict_list)], ignore_index = True)

    with _media_mutex:
        _media_data[thread] = pd.concat([_media_data[thread], pd.DataFrame(media_dict_list)], ignore_index = True)


In [29]:
def collect_day_tweets_from_date(query: str, query_name: str, date: datetime, thread: str):
    """Collect every tweet matching *query* in the 24 hours that end at *date*.

    Pages are requested with search_all_tweets() until the response carries no
    "next_token"; the state then stores "fim" as its page token. Rows are
    buffered through process_response() and flushed by backup_state():
    once on the first failure of a failure streak, whenever more than
    ``_backup_interval`` seconds have passed, and once more after the loop.

    Nothing is done when init_variables_coleta() returns False (collection
    already finished).

    Args:
        query: Search query text.
        query_name: Name of the query (directory / state key prefix).
        date: End of the collected window; the window starts one day earlier.
        thread: Key of the client and buffers used by the calling thread.

    Raises:
        KeyboardInterrupt: Re-raised after backing up the current state.
    """
    global _backup_interval
    
    # `backup` is True while the buffers are already flushed, so a streak of
    # failures triggers a single backup.
    backup = False
    backup_time = time.time()

    retorno = init_variables_coleta(query, query_name, date, thread)
        
    if retorno != False:

        coleta_em_andamento = retorno[0]
        status_info = retorno[1]
        
        del retorno
        
        while status_info["next_page_token"] != "fim":
            try:
                response = _client[thread].search_all_tweets(
                    query= query,
                    user_fields= _user_fields,
                    tweet_fields= _tweet_fields,
                    media_fields= _media_fields,
                    expansions= _expansions,
                    max_results= 500,
                    next_token= status_info["next_page_token"],
                    end_time= date,
                    start_time= date - timedelta(days= 1)
                )

            except tweepy.TweepyException as e:
                # API-level error: report, back up once, wait 5 s and retry
                # the same page (the page token was not advanced).
                print(f"thread {(thread.upper()+':'):<12}{e}")
            
                if not backup:
                    backup_state(coleta_em_andamento, status_info, thread)
                    backup = True   
                    
                sleep(5)
                
            except KeyboardInterrupt as e: 
                # Manual stop: save what was collected before propagating.
                print(f"thread {(thread.upper()+':'):<12}{e}")

                if not backup:
                    backup_state(coleta_em_andamento, status_info, thread)
                    backup = True   
                
                raise KeyboardInterrupt

            except (ConnectionError, RemoteDisconnected, exceptions.ProtocolError, HTTPException, ConnectionAbortedError, ConnectionRefusedError, ConnectionResetError, TimeoutError) as e:
                # Network-level error: report, back up once, wait 15 s, retry.
                print(f"thread {(thread.upper()+':'):<12}{e}")

                if not backup:
                    backup_state(coleta_em_andamento, status_info, thread)
                    backup = True  

                sleep(15)
                continue
            else:

                # status_info["total_collected"] += response.meta["result_count"]
                # NOTE(review): the printed number is this page's result_count,
                # not a running total, despite the wording of the message.
                print(f"thread {(thread.upper()+':'):<12}Coletados no total {response.meta['result_count']} tweets")

                # The previous iteration ended with a backup: restart the timer.
                if backup:
                    backup_time = time.time()

                # A response without "next_token" is the last page.
                try:
                    status_info["next_page_token"] = response.meta["next_token"]
                except KeyError:
                    status_info["next_page_token"] = "fim"

                if(response.data != None):
                    process_response(response, thread)

                # New rows are buffered, so the buffers are no longer flushed.
                backup = False

                if time.time() - backup_time > _backup_interval:
                    backup_state(coleta_em_andamento, status_info, thread)
                    backup = True
        
        # Final flush; also persists the "fim" page token.
        backup_state(coleta_em_andamento, status_info, thread)

In [30]:
def coleta(thread: str):
    """Run the tweet collection loop of *thread* forever.

    For every name in ``ordem`` the query text is read from the environment
    and the day before the current ``date`` is collected through
    collect_day_tweets_from_date(), with the environment variable name used
    as the query name.

    NOTE(review): after each pass ``date`` is moved one day back and then
    reset to today whenever today is later, which appears to always be the
    case; the ``sleep(15)`` branch therefore looks unreachable and the loop
    keeps re-checking the previous day. Confirm the intended schedule.

    Args:
        thread: Key of the client and buffers used by this loop.
    """
    ordem = ("query_bolsonaro", "query_lula", "query_ciro", "query_simone", "tweets_bolsonaro", "tweets_lula", "tweets_ciro", "tweets_simone")

    def midnight_today():
        # A single now() call avoids mixing fields from two different days.
        now = datetime.now()
        return datetime(now.year, now.month, now.day)

    date = midnight_today()

    while True:
        while date > datetime(year= 2022, month= 9, day= 1):
            for query in ordem:
                # collect_day_tweets_from_date takes (query, query_name, date,
                # thread); the query name was missing from this call.
                collect_day_tweets_from_date(_env.get(query), query, date - timedelta(days= 1), thread)

            date -= timedelta(days= 1)

            today = midnight_today()
            if today > date:
                date = today
            else:
                sleep(15)



## Coleta Rede Ativa Superior

### Funções

In [31]:
def init_variables_superior(query:str, query_name: str, date: datetime, thread: str):
    """Decide whether the referenced-tweet collection can start for this day.

    The step only runs when the tweet collection of *query_name* on *date*
    has finished (page token "fim") and the state does not yet mark
    "rede_ativa_superior" as done. When the flag is missing it is stored as
    False before continuing.

    Args:
        query: Unused here; kept for a signature parallel to the other
            init_variables_* functions.
        query_name: Name of the query (state key prefix).
        date: Day of the collection.
        thread: Key of the buffers to reset.

    Returns:
        ``False`` when there is nothing to do, otherwise the list
        ``[state_key, status_info]``.
    """
    coleta_em_andamento = query_name + date.strftime("%d%m%Y")

    status_info = _env.get(coleta_em_andamento, generate_if_missing = False)

    # Collection not started, or not finished yet.
    if status_info is None or status_info["next_page_token"] != "fim":
        return False

    if "rede_ativa_superior" not in status_info:
        status_info.update({"rede_ativa_superior": False})

        # `with` releases the lock even if saving the environment fails.
        with _env_mutex:
            _env.set(coleta_em_andamento, status_info, True)

        status_info = _env.get(coleta_em_andamento, generate_if_missing = False)
    elif status_info["rede_ativa_superior"]:
        # Referenced tweets were already collected for this day.
        return False

    __reset_data_frames(thread)

    print(f"thread {(thread.upper() + ':' + query_name):<24}{'START '+ str(date)}")

    return [coleta_em_andamento, status_info]

In [32]:
def get_referenced_tweets(tweets_df: pd.DataFrame):
    """List the referenced tweet ids that are not themselves rows of *tweets_df*.

    The "referenced_tweet_id" cells hold ', '-separated ids. Missing values
    ("nan") and repeated header rows ("referenced_tweet_id") are ignored.
    Rows are paired by position, so the frame's index labels do not matter.

    Args:
        tweets_df: Frame with the columns "id" and "referenced_tweet_id".

    Returns:
        List of id strings, without duplicates, in order of first appearance.
    """
    collected_ids = set()
    # A dict is used as an insertion-ordered set.
    referenced_ids = {}

    for tweet_id, references in zip(tweets_df["id"], tweets_df["referenced_tweet_id"]):
        collected_ids.add(str(tweet_id))
        referenced_ids.update(dict.fromkeys(str(references).split(", ")))

    ignored = collected_ids | {"nan", "referenced_tweet_id"}
    return [reference for reference in referenced_ids if reference not in ignored]


In [33]:
def get_tweets(tweets_ids: list, coleta_em_andamento: str, status_info: dict, thread: str):
    """Fetch *tweets_ids* in slices of at most 100 ids and buffer the results.

    Every slice is fetched, including the last (possibly shorter) one and the
    only slice of a list with 100 ids or fewer. A failed request is retried
    after 15 seconds with the same slice. When all slices are done, the state
    is marked with ``rede_ativa_superior = True`` and backed up.

    Args:
        tweets_ids: Ids to look up.
        coleta_em_andamento: State key handed to backup_state().
        status_info: State dictionary; updated in place.
        thread: Key of the client and buffers used by the calling thread.

    Raises:
        KeyboardInterrupt: Re-raised when the user interrupts the collection.
    """
    chunk_size = 100
    total = len(tweets_ids)
    begin = 0

    # Walk the id list in slices of chunk_size; `begin` only advances after a
    # successful request, so a failed slice is retried.
    while begin < total:
        end = min(begin + chunk_size, total)

        try:
            response = _client[thread].get_tweets(
                ids = tweets_ids[begin : end],
                tweet_fields = _tweet_fields,
                user_fields = _user_fields,
                media_fields = _media_fields,
                expansions = _expansions
            )

        except tweepy.TweepyException as e:
            print(f"thread {(thread.upper()+':'):<12}{e}")
            # Show the ids of the request that failed.
            for i in tweets_ids[begin : end]:
                print(i)
            sleep(15)

        except KeyboardInterrupt as e:
            print(f"thread {(thread.upper()+':'):<12}{e}")
            raise

        except (ConnectionError, RemoteDisconnected, exceptions.ProtocolError, HTTPException) as e:
            print(f"thread {(thread.upper()+':'):<12}{e}")
            sleep(15)

        else:
            print(f"thread {(thread.upper()+':'):<12}{f'Coletados {(end/len(tweets_ids)*100):.2f}% tweets relacionados'}")

            # A slice may come back without data; process_response() iterates
            # response.data, so skip it in that case.
            if response.data is not None:
                process_response(response, thread)

            begin = end

    status_info["rede_ativa_superior"] = True
    backup_state(coleta_em_andamento, status_info, thread)
        

In [34]:
def get_rede_ativa_superior(query: str, query_name: str, date: datetime, thread: str):
    """Collect the tweets referenced by one finished day of collection.

    Does nothing when init_variables_superior() returns False. Otherwise the
    day's ``*_tweets.csv`` is read, the referenced ids that were not collected
    are extracted, and get_tweets() fetches them.

    Returns:
        ``False`` when the stored ``csv_path`` equals False, otherwise None.
    """
    retorno = init_variables_superior(query, query_name, date, thread)
    if retorno == False:
        return None

    coleta_em_andamento, status_info = retorno[0], retorno[1]

    if status_info['csv_path'] == False:
        return False

    tweets_csv = status_info['csv_path'] + '_tweets.csv'
    collected = pd.read_csv(tweets_csv, sep=';', escapechar= '\\', dtype= _dtype, on_bad_lines= "warn")
    referenced_tweets = get_referenced_tweets(collected)

    get_tweets(referenced_tweets, coleta_em_andamento, status_info, thread)

    
    

In [35]:
def superior(thread: str):
    """Run the referenced-tweet collection loop of *thread* forever.

    For every name in ``ordem`` the query text is read from the environment
    and get_rede_ativa_superior() is called for the day before the current
    ``date``, with the environment variable name used as the query name.

    NOTE(review): after each pass ``date`` is moved one day back and then
    reset to today whenever today is later, which appears to always be the
    case; the ``sleep(15)`` branch therefore looks unreachable. Confirm the
    intended schedule.

    Args:
        thread: Key of the client and buffers used by this loop.
    """
    ordem = ("query_bolsonaro", "query_lula", "query_ciro", "query_simone", "tweets_bolsonaro", "tweets_lula", "tweets_ciro", "tweets_simone")

    def midnight_today():
        # A single now() call avoids mixing fields from two different days.
        now = datetime.now()
        return datetime(now.year, now.month, now.day)

    date = midnight_today()

    while True:
        while date > datetime(year= 2022, month= 9, day= 1):
            for query in ordem:
                print(f"thread {(thread.upper()+':'):<12}{query}")
                # get_rede_ativa_superior takes (query, query_name, date,
                # thread); the query name was missing from this call.
                get_rede_ativa_superior(_env.get(query), query, date - timedelta(days= 1), thread)

            date -= timedelta(days= 1)

            today = midnight_today()
            if today > date:
                date = today
            else:
                sleep(15)


## Coleta Quotes

In [36]:
def init_variables_interactions(query: str, query_name: str, date: datetime, thread: str):
    """Prepare (or resume) the interaction collection of *thread* for one day.

    The step only runs when the day's tweet collection finished (page token
    "fim"), the referenced-tweet step is marked done
    ("rede_ativa_superior" truthy) and this thread's own progress is not
    "fim" yet. The thread's output directory is created only once those
    conditions hold, so days that were never collected are left untouched.

    Args:
        query: Unused here; kept for a signature parallel to the other
            init_variables_* functions.
        query_name: Name of the query (directory / state key prefix).
        date: Day of the collection.
        thread: Interaction name; also the key of its progress inside the
            state and of its buffers.

    Returns:
        ``False`` when there is nothing to do, otherwise the list
        ``[state_key, status_info]``.
    """
    # Local import: the file-level import list does not bring in makedirs.
    from os import makedirs

    coleta_em_andamento = query_name + date.strftime("%d%m%Y")

    status_info = _env.get(coleta_em_andamento, generate_if_missing = False)

    # Collection not started.
    if status_info is None:
        return False

    # Collection not finished.
    if status_info["next_page_token"] != "fim":
        return False

    # Referenced-tweet step not started or not finished.
    if not status_info.get("rede_ativa_superior"):
        return False

    thread_state = status_info.get(thread)
    first_run = thread_state is None or "index" not in thread_state

    # This interaction was already collected completely.
    if not first_run and thread_state["index"] == "fim":
        return False

    directory = _workspace + '/Eleicoes_2022_Pesquisa/coleta/hashtags/'+ query_name + '/'
    directory += date.strftime("%Y/%m/%d/") + thread
    makedirs(directory, exist_ok=True)

    if first_run:
        status_info.update({
            thread: {
                "index": 0,
                "next_page_token": None,
            }
        })

    __reset_data_frames(thread)

    if first_run:
        # Writes the header line of each CSV (the buffers are empty here).
        csv_path = status_info['csv_path']
        __save_data_frames((csv_path[:csv_path.rfind('/')+1] + '/'+ thread +'/' + thread), thread, header= True)

    with _env_mutex:
        _env.set(coleta_em_andamento, status_info, True)

    print(f"thread {(thread.upper() + ':' + query_name):<24}{'START '+ str(date)}")

    return [coleta_em_andamento, status_info]

In [37]:
# Backs up the buffered interaction data and saves the collection state.
# input: state key, state dictionary and the interaction thread name
# output: nothing
def backup_interactions(coleta_em_andamento: str, status_info: dict, thread: str):
    """Flush the buffers of *thread* to its own CSV files and persist the state.

    The files live in the ``<thread>`` sub-directory next to the day's main
    CSV files and are prefixed with the thread name. Afterwards the buffers
    are emptied and *status_info* is stored (as a private variable) under
    *coleta_em_andamento*.
    """
    # Save to CSV, then start over with empty buffers.
    csv_path = status_info['csv_path']
    __save_data_frames((csv_path[:csv_path.rfind('/')+1] + '/'+ thread +'/' + thread), thread)
    __reset_data_frames(thread)

    # `with` releases the lock even when writing the environment file fails.
    with _env_mutex:
        _env.set(coleta_em_andamento, status_info, True)

In [38]:
def get_quotes(query: str, query_name: str, date: datetime, thread: str):
    """Collect the quote tweets of every tweet stored for one query/day.

    Tweets are read one at a time from ``<csv_path>_tweets.csv``; progress is
    kept in ``status_info[thread]["index"]`` and saved after every row.

    Args:
        query: Search query text, forwarded to ``init_variables_interactions``.
        query_name: Key of the query, used to locate the day's state.
        date: Day whose collected tweets are scanned.
        thread: Name of this collection thread.
    """
    retorno = init_variables_interactions(query, query_name, date, thread)
    if retorno is False:
        return

    coleta_em_andamento, status_info = retorno

    # Header of the per-tweet summary file (appended on every call, as before).
    pd.DataFrame(columns=["account_id", "tweet_id", "interaction_authors", "interaction_ids"]).to_csv(status_info["csv_path"] + "_"+thread+".csv", sep=';', index = False, header= True, mode = 'a')

    # skiprows=range(1, k) makes read_csv return file line k, and k=0 behaves
    # exactly like k=1. Start a fresh scan at 1 so the first tweet is not
    # processed twice; indexes saved by earlier runs keep their meaning.
    if status_info[thread]["index"] == 0:
        status_info[thread]["index"] = 1

    while True:
        tweet = pd.read_csv(status_info["csv_path"] + "_tweets.csv", sep=";", escapechar= '\\', dtype= _dtype, skiprows= range(1,status_info[thread]["index"]), nrows=1)

        # No row left: every tweet of the day was visited.
        if len(tweet.index) == 0:
            break

        tweet_type = str(tweet.at[0,"type"])

        # Skip retweets and header lines repeated inside the file
        # (their "type" cell holds the literal column name).
        if tweet_type != "retweeted" and tweet_type != "type" and int(tweet.at[0,"quote_count"]) > 0:
            print(f"thread {(thread.upper()+':'):<12}linha {status_info[thread]['index']}")

            local_query = "quotes_of_tweet_id:" + str(tweet.at[0,"id"]) + " lang:pt"
            retorno = get_interactions_tweets(local_query, coleta_em_andamento, status_info, thread)

            if retorno is not False:
                pd.DataFrame(
                    [{
                        "account_id": str(tweet.at[0,"author_id"]),
                        "tweet_id": str(tweet.at[0,"id"]),
                        "interaction_authors": retorno[1],
                        "interaction_ids": retorno[0]
                    }]
                ).to_csv(status_info["csv_path"] + "_quotes.csv", sep=';', index = False, header= False, mode = 'a')
                status_info = retorno[2]

        status_info[thread]["index"] += 1
        backup_interactions(coleta_em_andamento, status_info, thread)

    # Mark the thread as finished for this query/day.
    status_info[thread]["index"] = "fim"
    backup_interactions(coleta_em_andamento, status_info, thread)

In [39]:
def get_interactions_tweets(query: str, coleta_em_andamento: str, status_info: dict, thread: str):
    """Page through ``search_all_tweets`` for ``query`` and accumulate the results.

    The pagination token lives in ``status_info[thread]["next_page_token"]``;
    it is reset to ``None`` before returning. API and connection errors are
    printed and retried after 15 seconds.

    Returns:
        ``False`` if a page comes back without data; otherwise a list
        ``[ids, author_ids, status_info]`` where the first two items are the
        comma-separated contents of ``_tweets_data[thread]``.
    """
    paging = status_info[thread]
    prefix = f"thread {(thread.upper()+':'):<12}"

    while paging["next_page_token"] != "fim":
        try:
            response = _client[thread].search_all_tweets(
                query= query,
                user_fields= _user_fields,
                tweet_fields= _tweet_fields,
                media_fields= _media_fields,
                expansions= _expansions,
                max_results= 500,
                next_token= paging["next_page_token"],
                start_time= datetime(year= 2022, month= 7, day= 21)
            )
        except KeyboardInterrupt as e:
            print(f"{prefix}{e}")
            raise KeyboardInterrupt
        except (tweepy.TweepyException, ConnectionError, RemoteDisconnected, exceptions.ProtocolError, HTTPException) as e:
            # Transient failure: report it, wait and try the same page again.
            print(f"{prefix}{e}")
            sleep(15)
            continue

        print(f"{prefix}Coletados no total {response.meta['result_count']} tweets")

        # A missing "next_token" means this was the last page.
        if "next_token" in response.meta:
            paging["next_page_token"] = response.meta["next_token"]
        else:
            paging["next_page_token"] = "fim"

        if response.data is None:
            paging["next_page_token"] = None
            return False

        process_response(response, thread)

    paging["next_page_token"] = None

    collected = _tweets_data[thread]
    tweet_ids = str(list(collected["id"]))[1:-1]
    author_ids = str(list(collected["author_id"]))[1:-1]
    return [tweet_ids, author_ids, status_info]

## Coleta Replies

In [40]:
def get_replies(query: str, query_name:str, date: datetime, thread: str):
    """Collect the replies of every conversation found in one query/day.

    Tweets are read one at a time from ``<csv_path>_tweets.csv``; each
    ``conversation_id`` is queried at most once per call. Progress is kept in
    ``status_info[thread]["index"]`` and saved after every row.

    Args:
        query: Search query text, forwarded to ``init_variables_interactions``.
        query_name: Key of the query, used to locate the day's state.
        date: Day whose collected tweets are scanned.
        thread: Name of this collection thread.
    """
    retorno = init_variables_interactions(query, query_name, date, thread)
    if retorno is False:
        return

    coleta_em_andamento, status_info = retorno

    # Header of the per-tweet summary file (appended on every call, as before).
    pd.DataFrame(columns=["account_id", "tweet_id", "interaction_authors", "interaction_ids"]).to_csv(status_info["csv_path"] + "_"+thread+".csv", sep=';', index = False, header= True, mode = 'a')

    # Conversations already queried during this call (not persisted).
    seen_conversations = set()

    # skiprows=range(1, k) makes read_csv return file line k, and k=0 behaves
    # exactly like k=1. Start a fresh scan at 1 so the first tweet is not
    # processed twice; indexes saved by earlier runs keep their meaning.
    if status_info[thread]["index"] == 0:
        status_info[thread]["index"] = 1

    while True:
        tweet = pd.read_csv(status_info["csv_path"] + "_tweets.csv", sep=";", escapechar= '\\', dtype= _dtype, skiprows= range(1,status_info[thread]["index"]), nrows=1)

        # No row left: every tweet of the day was visited.
        if len(tweet.index) == 0:
            break

        tweet_type = str(tweet.at[0,"type"])

        # Skip retweets and header lines repeated inside the file
        # (their "type" cell holds the literal column name).
        if tweet_type != "retweeted" and tweet_type != "type" and int(tweet.at[0,"reply_count"]) > 0:
            conversation_id = str(tweet.at[0,"conversation_id"])

            if conversation_id not in seen_conversations:
                seen_conversations.add(conversation_id)

                print(f"thread {(thread.upper()+':'):<12}linha {status_info[thread]['index']}")
                local_query = "conversation_id:" + conversation_id + " lang:pt"
                retorno = get_interactions_tweets(local_query, coleta_em_andamento, status_info, thread)

                if retorno is not False:
                    pd.DataFrame(
                        [{
                            "account_id": str(tweet.at[0,"author_id"]),
                            "tweet_id": str(tweet.at[0,"id"]),
                            "interaction_authors": retorno[1],
                            "interaction_ids": retorno[0]
                        }]
                    ).to_csv(status_info["csv_path"] + "_replies.csv", sep=';', index = False, header= False, mode = 'a')
                    status_info = retorno[2]

        status_info[thread]["index"] += 1
        backup_interactions(coleta_em_andamento, status_info, thread)

    # Mark the thread as finished for this query/day.
    status_info[thread]["index"] = "fim"
    backup_interactions(coleta_em_andamento, status_info, thread)

## Coleta Likes

In [41]:
def process_response_intercations(response, thread: str):
    """Append the users contained in ``response`` to ``_users_data[thread]``.

    Args:
        response: API response whose ``data`` attribute is iterated; each
            item is converted with ``get_user_dict``.
        thread: Key of the data frame in ``_users_data`` that receives the rows.
    """
    global _users_data

    # One dictionary per user returned by the API.
    users_dict_list = [get_user_dict(user) for user in response.data]

    # Concatenate under the lock; release it even if the concat fails.
    _users_mutex.acquire()
    try:
        _users_data[thread] = pd.concat([_users_data[thread], pd.DataFrame(users_dict_list)], ignore_index = True)
    finally:
        _users_mutex.release()

    


In [42]:
def get_interactions(function, id: str, coleta_em_andamento: str, status_info: dict, thread: str):
    """Page through a user-listing endpoint for one tweet and accumulate the users.

    ``function`` is called with ``id``, ``user_fields``, ``max_results`` and
    ``pagination_token``. The token lives in
    ``status_info[thread]["next_page_token"]`` and is reset to ``None`` before
    returning. API and connection errors are printed and retried after
    15 seconds.

    Returns:
        ``False`` if a page comes back without data; otherwise a list
        ``[account_ids, status_info]`` where ``account_ids`` is the
        comma-separated ``account_id`` column of ``_users_data[thread]``.
    """
    paging = status_info[thread]
    prefix = f"thread {(thread.upper()+':'):<12}"

    while paging["next_page_token"] != "fim":
        try:
            response = function(
                id= id,
                user_fields= _user_fields,
                max_results= 100,
                pagination_token= paging["next_page_token"],
            )
        except KeyboardInterrupt as e:
            print(f"{prefix}{e}")
            raise KeyboardInterrupt
        except (tweepy.TweepyException, ConnectionError, RemoteDisconnected, exceptions.ProtocolError, HTTPException) as e:
            # Transient failure: report it, wait and try the same page again.
            print(f"{prefix}{e}")
            sleep(15)
            continue

        if response.data is None:
            paging["next_page_token"] = None
            return False

        print(f"{prefix}Coletados no total {response.meta['result_count']} tweets")

        # A missing "next_token" means this was the last page.
        if "next_token" in response.meta:
            paging["next_page_token"] = response.meta["next_token"]
        else:
            paging["next_page_token"] = "fim"

        process_response_intercations(response, thread)

    paging["next_page_token"] = None

    account_ids = str(list(_users_data[thread]["account_id"]))[1:-1]
    return [account_ids, status_info]

In [43]:
def get_likes(query: str, date: datetime, thread: str, query_name: str = None):
    """Collect the users who liked each tweet stored for one query/day.

    Args:
        query: Search query text, forwarded to ``init_variables_interactions``.
        date: Day whose collected tweets are scanned.
        thread: Name of this collection thread; also selects ``_client[thread]``.
        query_name: Key of the query, needed by
            ``init_variables_interactions`` to locate the day's state. It is
            optional only so the older three-argument call shape still parses.

    Raises:
        TypeError: if ``query_name`` is not supplied.
    """
    if query_name is None:
        # The previous body called init_variables_interactions with three
        # arguments, which always failed with TypeError. The name cannot be
        # derived from the query text, so keep failing, but say why.
        raise TypeError("get_likes() missing required argument: 'query_name'")

    retorno = init_variables_interactions(query, query_name, date, thread)
    if retorno is False:
        return

    coleta_em_andamento, status_info = retorno

    # Header of the per-tweet summary file (appended on every call, as before).
    pd.DataFrame(columns=["account_id", "tweet_id", "interaction_authors"]).to_csv(status_info["csv_path"] + "_"+thread+".csv", sep=';', index = False, header= True, mode = 'a')

    # skiprows=range(1, k) makes read_csv return file line k, and k=0 behaves
    # exactly like k=1. Start a fresh scan at 1 so the first tweet is not
    # processed twice; indexes saved by earlier runs keep their meaning.
    if status_info[thread]["index"] == 0:
        status_info[thread]["index"] = 1

    while True:
        tweet = pd.read_csv(status_info["csv_path"] + "_tweets.csv", sep=";", escapechar= '\\', dtype= _dtype, skiprows= range(1, status_info[thread]["index"]), nrows=1)

        # No row left: every tweet of the day was visited.
        if len(tweet.index) == 0:
            break

        tweet_type = str(tweet.at[0,"type"])

        # Skip retweets and header lines repeated inside the file.
        if tweet_type != "retweeted" and tweet_type != "type" and int(tweet.at[0,'like_count']) > 0:
            print(f"thread {(thread.upper()+':'):<12}linha {status_info[thread]['index']}")
            retorno = get_interactions(_client[thread].get_liking_users, str(tweet.at[0,"id"]), coleta_em_andamento, status_info, thread)

            if retorno is not False:
                pd.DataFrame(
                    [{
                        "account_id": str(tweet.at[0,"author_id"]),
                        "tweet_id": str(tweet.at[0,"id"]),
                        "interaction_authors": retorno[0],
                    }]
                ).to_csv(status_info["csv_path"] + "_likes.csv", sep=';', index = False, header= False, mode = 'a')
                status_info = retorno[1]

        status_info[thread]["index"] += 1
        backup_interactions(coleta_em_andamento, status_info, thread)

    # Mark the thread as finished for this query/day.
    status_info[thread]["index"] = "fim"
    backup_interactions(coleta_em_andamento, status_info, thread)

## Retweets

In [44]:
def get_retweets(query: str, date: datetime, thread: str, query_name: str = None):
    """Collect the users who retweeted each tweet stored for one query/day.

    Uses ``_client[thread].get_retweeters`` through ``get_interactions``.

    Args:
        query: Search query text, forwarded to ``init_variables_interactions``.
        date: Day whose collected tweets are scanned.
        thread: Name of this collection thread; also selects ``_client[thread]``.
        query_name: Key of the query, needed by
            ``init_variables_interactions`` to locate the day's state. It is
            optional only so the older three-argument call shape still parses.

    Raises:
        TypeError: if ``query_name`` is not supplied.
    """
    if query_name is None:
        # The previous body called init_variables_interactions with three
        # arguments, which always failed with TypeError. The name cannot be
        # derived from the query text, so keep failing, but say why.
        raise TypeError("get_retweets() missing required argument: 'query_name'")

    retorno = init_variables_interactions(query, query_name, date, thread)
    if retorno is False:
        return

    coleta_em_andamento, status_info = retorno

    # Header of the per-tweet summary file (appended on every call, as before).
    pd.DataFrame(columns=["account_id", "tweet_id", "interaction_authors"]).to_csv(status_info["csv_path"] + "_"+thread+".csv", sep=';', index = False, header= True, mode = 'a')

    # skiprows=range(1, k) makes read_csv return file line k, and k=0 behaves
    # exactly like k=1. Start a fresh scan at 1 so the first tweet is not
    # processed twice; indexes saved by earlier runs keep their meaning.
    if status_info[thread]["index"] == 0:
        status_info[thread]["index"] = 1

    while True:
        tweet = pd.read_csv(status_info["csv_path"] + "_tweets.csv", sep=";", escapechar= '\\', dtype= _dtype, skiprows= range(1, status_info[thread]["index"]), nrows=1)

        # No row left: every tweet of the day was visited.
        if len(tweet.index) == 0:
            break

        tweet_type = str(tweet.at[0,"type"])

        # Skip retweets and header lines repeated inside the file.
        if tweet_type != "retweeted" and tweet_type != "type" and int(tweet.at[0,"retweet_count"]) > 0:
            print(f"thread {(thread.upper()+':'):<12}linha {status_info[thread]['index']}")
            retorno = get_interactions(_client[thread].get_retweeters, str(tweet.at[0,"id"]), coleta_em_andamento, status_info, thread)

            if retorno is not False:
                pd.DataFrame(
                    [{
                        "account_id": str(tweet.at[0,"author_id"]),
                        "tweet_id": str(tweet.at[0,"id"]),
                        "interaction_authors": retorno[0],
                    }]
                ).to_csv(status_info["csv_path"] + "_retweets.csv", sep=';', index = False, header= False, mode = 'a')
                status_info = retorno[1]

        status_info[thread]["index"] += 1
        backup_interactions(coleta_em_andamento, status_info, thread)

    # Mark the thread as finished for this query/day.
    status_info[thread]["index"] = "fim"
    backup_interactions(coleta_em_andamento, status_info, thread)

In [45]:
def get_retweets2(query: str, date: datetime, thread: str, query_name: str = None):
    """Collect retweets of each stored tweet through the full-archive search.

    Variant of the retweet collection that queries
    ``retweets_of_tweet_id:<id> lang:pt`` with ``get_interactions_tweets``.

    Args:
        query: Search query text, forwarded to ``init_variables_interactions``.
        date: Day whose collected tweets are scanned.
        thread: Name of this collection thread.
        query_name: Key of the query, needed by
            ``init_variables_interactions`` to locate the day's state. It is
            optional only so the older three-argument call shape still parses.

    Raises:
        TypeError: if ``query_name`` is not supplied.
    """
    if query_name is None:
        # The previous body called init_variables_interactions with three
        # arguments, which always failed with TypeError. The name cannot be
        # derived from the query text, so keep failing, but say why.
        raise TypeError("get_retweets2() missing required argument: 'query_name'")

    retorno = init_variables_interactions(query, query_name, date, thread)
    if retorno is False:
        return

    coleta_em_andamento, status_info = retorno

    # Header of the per-tweet summary file (appended on every call, as before).
    pd.DataFrame(columns=["account_id", "tweet_id", "interaction_authors", "interaction_ids"]).to_csv(status_info["csv_path"] + "_"+thread+".csv", sep=';', index = False, header= True, mode = 'a')

    # skiprows=range(1, k) makes read_csv return file line k, and k=0 behaves
    # exactly like k=1. Start a fresh scan at 1 so the first tweet is not
    # processed twice; indexes saved by earlier runs keep their meaning.
    if status_info[thread]["index"] == 0:
        status_info[thread]["index"] = 1

    while True:
        tweet = pd.read_csv(status_info["csv_path"] + "_tweets.csv", sep=";", escapechar= '\\', dtype= _dtype, skiprows= range(1,status_info[thread]["index"]), nrows=1)

        # No row left: every tweet of the day was visited.
        if len(tweet.index) == 0:
            break

        tweet_type = str(tweet.at[0,"type"])

        # Skip retweets and header lines repeated inside the file.
        if tweet_type != "retweeted" and tweet_type != "type" and int(tweet.at[0,"retweet_count"]) > 0:
            print(f"thread {(thread.upper()+':'):<12}linha {status_info[thread]['index']}")

            local_query = "retweets_of_tweet_id:" + str(tweet.at[0,"id"]) + " lang:pt"
            retorno = get_interactions_tweets(local_query, coleta_em_andamento, status_info, thread)

            if retorno is not False:
                pd.DataFrame(
                    [{
                        "account_id": str(tweet.at[0,"author_id"]),
                        "tweet_id": str(tweet.at[0,"id"]),
                        "interaction_authors": retorno[1],
                        "interaction_ids": retorno[0]
                    }]
                ).to_csv(status_info["csv_path"] + "_retweets.csv", sep=';', index = False, header= False, mode = 'a')
                status_info = retorno[2]

        status_info[thread]["index"] += 1
        backup_interactions(coleta_em_andamento, status_info, thread)

    # Mark the thread as finished for this query/day.
    status_info[thread]["index"] = "fim"
    backup_interactions(coleta_em_andamento, status_info, thread)

In [46]:
def get_retweets3(query: str, date: datetime, thread: str, query_name: str = None):
    """Collect retweets of each stored tweet, loading the day's CSV once.

    Variant that reads ``<csv_path>_tweets.csv`` entirely into memory and
    walks it by position; ``status_info[thread]["index"]`` is the position of
    the next row.

    Args:
        query: Search query text, forwarded to ``init_variables_interactions``.
        date: Day whose collected tweets are scanned.
        thread: Name of this collection thread.
        query_name: Key of the query, needed by
            ``init_variables_interactions`` to locate the day's state. It is
            optional only so the older three-argument call shape still parses.

    Raises:
        TypeError: if ``query_name`` is not supplied.
    """
    if query_name is None:
        # The previous body called init_variables_interactions with three
        # arguments, which always failed with TypeError. The name cannot be
        # derived from the query text, so keep failing, but say why.
        raise TypeError("get_retweets3() missing required argument: 'query_name'")

    retorno = init_variables_interactions(query, query_name, date, thread)
    if retorno is False:
        return

    coleta_em_andamento, status_info = retorno

    # Header of the per-tweet summary file (appended on every call, as before).
    pd.DataFrame(columns=["account_id", "tweet_id", "interaction_authors", "interaction_ids"]).to_csv(status_info["csv_path"] + "_"+thread+".csv", sep=';', index = False, header= True, mode = 'a')

    tweets = pd.read_csv(status_info["csv_path"] + "_tweets.csv", sep=";", escapechar= '\\', dtype= _dtype)
    total_rows = len(tweets.index)

    # Check the bound before indexing: the previous version called iloc first
    # and compared with "<", so it raised IndexError once the last row was done.
    while status_info[thread]["index"] < total_rows:
        tweet = tweets.iloc[status_info[thread]["index"]]
        tweet_type = str(tweet["type"])

        # Skip retweets and header lines repeated inside the file.
        if tweet_type != "retweeted" and tweet_type != "type" and int(tweet["retweet_count"]) > 0:
            print(f"thread {(thread.upper()+':'):<12}linha {status_info[thread]['index']}")

            local_query = "retweets_of_tweet_id:" + str(tweet["id"]) + " lang:pt"
            retorno = get_interactions_tweets(local_query, coleta_em_andamento, status_info, thread)

            if retorno is not False:
                pd.DataFrame(
                    [{
                        "account_id": str(tweet["author_id"]),
                        "tweet_id": str(tweet["id"]),
                        "interaction_authors": retorno[1],
                        "interaction_ids": retorno[0]
                    }]
                ).to_csv(status_info["csv_path"] + "_retweets.csv", sep=';', index = False, header= False, mode = 'a')
                status_info = retorno[2]

        status_info[thread]["index"] += 1
        backup_interactions(coleta_em_andamento, status_info, thread)

    # Mark the thread as finished for this query/day.
    status_info[thread]["index"] = "fim"
    backup_interactions(coleta_em_andamento, status_info, thread)

## Execução


In [47]:
# Set up the environment in "relative" mode (the mode names are handled by
# init_environment, defined earlier in the notebook).
init_environment("relative")

In [48]:
# Flag set before init_client() is called.
# NOTE(review): presumably makes init_client build one API client per
# thread/token - confirm in init_client.
_token_farm = True
init_client()

# NOTE(review): backup interval used elsewhere in the notebook; the unit is
# not visible in this section - confirm.
_backup_interval = 450

In [49]:
# "jair OR bolsonaro OR bozo OR biroliro OR bonoro OR tchutchuca do centrao OR capitao OR genocida OR mito OR bolsomito OR bolsolixo OR bolsotrump OR messias OR patriota OR b22 OR b17 OR forabolsonaro OR #elesim OR #elenao OR mentiroso da republica OR #bolsonaronoprimeiroturno lang: pt"

# --- Candidate keyword queries (retweets excluded) ---
_env.set("query_bolsonaro", '(#bolsonaro OR jair OR bolsonaro OR bozo OR biroliro OR "tchutchuca do centrao" OR bonoro OR capitao OR genocida OR mito OR bolsomito OR bolsolixo OR bolsotrump OR messias OR patriota OR b22 OR b17 OR brocha OR imbrochavel OR maçonaro) lang:pt -is:retweet', False)

_env.set("query_lula",'(#lula OR lula OR "ex presidiario" OR lulalivre OR "9 dedos" OR luladrao OR lulaladrao OR lulinha OR nine OR luis inacio OR cachaceiro OR "sapo barbudo" OR lulao OR l13 OR "faz o L" OR lulindo OR metalurgico OR lulalkimin) -loud lang:pt -is:retweet', False)

_env.set("query_ciro", '(#ciro OR ciro OR c12 OR cirogomes OR "ciro gomes" OR "correu pra paris" OR bolsolula OR ciranha) lang:pt -is:retweet', False)

_env.set("query_simone", '(#simone OR "simone tebet" OR simonetebet OR tebet OR "simone tablet" OR estepe OR s15) lang:pt -is:retweet', False)

# --- Tweets posted by the candidates' own profiles ---
_env.set("perfil_bolsonaro", 'from:jairbolsonaro lang:pt', False)

_env.set("perfil_lula",'from:LulaOficial lang:pt', False)

_env.set("perfil_ciro", 'from:cirogomes lang:pt', False)

_env.set("perfil_simone", 'from:simonetebetbr lang:pt', False)

# --- Ballot-number queries ---
_env.set("numero_bolsonaro", '22 lang:pt -is:retweet', False)

_env.set("numero_lula",'13 lang:pt -is:retweet', False)

# --- Post-election and coup-attempt topics ---
_env.set("pos_eleicao", '("intervenção militar" OR "intervenção federal" OR "alexandre de morais" OR xandão OR fraude OR Venezuela OR Cuba OR urna OR urnas OR comunismo) lang:pt -is:retweet', False)

# NOTE(review): unlike the other definitions, this call omits the third
# positional argument - confirm the default matches the False used elsewhere.
_env.set("atos_golpistas", '("Festa da Selma" OR ato OR bolsonarista OR golpe OR golpista OR baderna OR extremista OR Brasília OR "três poderes" OR invasão OR "ocupar congresso" OR "atos terroristas" OR manifestção OR atentado OR patriotas OR "tomada do poder" OR guerra OR "esplanada dos ministérios" OR "congresso nacional" OR "manifestantes" OR "retomada do poder") -ucrania lang:pt -is:retweet')

# --- Retweet-only counterparts of the queries above (is:retweet) ---
_env.set("retweets_query_bolsonaro", 'is:retweet (#bolsonaro OR jair OR bolsonaro OR bozo OR biroliro OR "tchutchuca do centrao" OR bonoro OR capitao OR genocida OR mito OR bolsomito OR bolsolixo OR bolsotrump OR messias OR patriota OR b22 OR b17 OR brocha OR imbrochavel OR maçonaro) lang:pt ', False)

_env.set("retweets_query_lula",'is:retweet (#lula OR lula OR "ex presidiario" OR lulalivre OR "9 dedos" OR luladrao OR lulaladrao OR lulinha OR nine OR luis inacio OR cachaceiro OR "sapo barbudo" OR lulao OR l13 OR "faz o L" OR lulindo OR metalurgico OR lulalkimin) -loud lang:pt ', False)

_env.set("retweets_query_ciro", 'is:retweet (#ciro OR ciro OR c12 OR cirogomes OR "ciro gomes" OR "correu pra paris" OR bolsolula OR ciranha) lang:pt', False)

_env.set("retweets_query_simone", 'is:retweet (#simone OR "simone tebet" OR simonetebet OR tebet OR "simone tablet" OR estepe OR s15) lang:pt', False)

# NOTE(review): this call also omits the third positional argument - confirm.
_env.set("retweets_atos_golpistas", 'is:retweet ("Festa da Selma" OR ato OR bolsonarista OR golpe OR golpista OR baderna OR extremista OR Brasília OR "três poderes" OR invasão OR "ocupar congresso" OR "atos terroristas" OR manifestção OR atentado OR patriotas OR "tomada do poder" OR guerra OR "esplanada dos ministérios" OR "congresso nacional" OR "manifestantes" OR "retomada do poder") -ucrania lang:pt ')

_env.set("retweets_numero_bolsonaro", '22 lang:pt is:retweet', False)

_env.set("retweets_numero_lula",'13 lang:pt is:retweet', False)

_env.set("retweets_pos_eleicao", 'is:retweet ("intervenção militar" OR "intervenção federal" OR "alexandre de morais" OR xandão OR fraude OR Venezuela OR Cuba OR urna OR urnas OR comunismo) lang:pt', False)

In [50]:
# Query keys in collection order. query_law relies on the positions: the
# first eight entries (ordem[:8]) are the candidate keyword and profile queries.
ordem = ("query_bolsonaro", "query_lula", "query_ciro", "query_simone", "perfil_bolsonaro", "perfil_lula", "perfil_ciro", "perfil_simone", "numero_bolsonaro", "numero_lula", 'pos_eleicao','atos_golpistas')
# Keys iterated by quotesReplies.
# NOTE(review): no "tweets_*" key is defined by the _env.set calls of this
# section (they use "perfil_*") - confirm these keys exist.
perfis = ("tweets_bolsonaro", "tweets_lula", "tweets_ciro", "tweets_simone")
# Retweet-only query keys. query_law relies on the positions: the first four
# entries (retweets_queries[:4]) are the candidate keyword queries.
retweets_queries = ("retweets_query_bolsonaro", "retweets_query_lula", "retweets_query_ciro", "retweets_query_simone", 'retweets_atos_golpistas', 'retweets_numero_bolsonaro', 'retweets_numero_lula', 'retweets_pos_eleicao')

In [51]:
def query_law(query: str, date: datetime) -> bool:
    """Tell whether the query key ``query`` is in scope on ``date``.

    Rules, checked in order:
      * ballot-number queries: strictly between 2022-10-01 and 2022-10-31;
      * the first eight keys of ``ordem`` and the first four of
        ``retweets_queries``: before 2023-02-01;
      * post-election queries: after 2022-10-30, up to and including 2023-02-24;
      * coup-attempt queries: from 2022-12-31 to 2023-02-12, both included.

    Any other key yields ``False``.
    """
    ballot_number_keys = ("numero_bolsonaro","numero_lula","retweets_numero_bolsonaro","retweets_numero_lula")
    post_election_keys = ("pos_eleicao", 'retweets_pos_eleicao')
    coup_keys = ('atos_golpistas', 'retweets_atos_golpistas')

    if query in ballot_number_keys:
        if datetime(year=2022, month=10, day=1) < date < datetime(year=2022, month=10, day=31):
            return True

    if query in ordem[:8] or query in retweets_queries[:4]:
        if date < datetime(year=2023, month=2, day=1):
            return True

    if query in post_election_keys:
        if datetime(year=2022, month=10, day=30) < date <= datetime(year=2023, month=2, day=24):
            return True

    if query in coup_keys:
        return datetime(year=2022, month=12, day=31) <= date <= datetime(year=2023, month=2, day=12)

    return False

In [52]:
# Quick check: 'retweets_lula' is not one of the keys query_law looks for,
# so the call evaluates to False.
query_law('retweets_lula', datetime(year= 2023, month= 1, day= 1))

False

In [53]:
def contagem(date: datetime):
    """Count the collected tweets and retweets and write a report.

    Walks backwards one day at a time from ``date`` while the day is after
    2022-07-19, adds up the rows of every ``<csv_path>_tweets.csv`` that can be
    read and writes the totals per query to ``contagem.txt``.

    Args:
        date: Most recent day to include in the count.
    """
    global ordem, retweets_queries

    # One slot per query plus a trailing slot holding the grand total.
    tweets_count = [0] * (len(ordem) + 1)
    retweets_count = [0] * (len(retweets_queries) + 1)

    def _type_counts(query_key, day):
        """Return the value counts of the "type" column, or None if the day cannot be read."""
        try:
            coleta_em_andamento = __simplify_query(_env.get(query_key, generate_if_missing = False)) + day.strftime("%d%m%Y")
            status_info = _env.get(coleta_em_andamento, generate_if_missing = False)
            df = pd.read_csv(status_info['csv_path'] + '_tweets.csv', sep=';', escapechar= '\\', dtype= _dtype)
        except Exception:
            # Day not collected or unreadable: skip it (best effort), but let
            # KeyboardInterrupt/SystemExit through.
            return None
        return df["type"].value_counts()

    while date > datetime(year= 2022, month= 7, day= 19):
        print(f'Dia: {date}')

        for index, query in enumerate(ordem):
            counts = _type_counts(query, date)
            if counts is None:
                continue

            # Retweets and header lines repeated inside the file are not tweets.
            counts = counts.drop(["retweeted", "type"], errors="ignore")

            day_total = int(counts.sum())
            tweets_count[index] += day_total
            tweets_count[-1] += day_total

        for index, query in enumerate(retweets_queries):
            counts = _type_counts(query, date)
            if counts is None:
                continue

            day_total = int(counts.sum())
            retweets_count[index] += day_total
            retweets_count[-1] += day_total

        date -= timedelta(days= 1)

    report = try_loadfrom_drive(open)('contagem.txt', 'w')
    try:
        for i, query in enumerate(ordem):
            report.write(f"{query}: {tweets_count[i]:,} tweets coletados\n")

        report.write(f"total tweets: {tweets_count[-1]:,} tweets coletados\n")
        for i, query in enumerate(retweets_queries):
            report.write(f"{query}: {retweets_count[i]:,} retweets coletados\n")

        report.write(f"total retweets: {retweets_count[-1]:,} retweets coletados\n")
        report.write(f"total: {tweets_count[-1] + retweets_count[-1]:,} tweets and retweets coletados\n")
    finally:
        # Close the report even if a write fails.
        report.close()

In [54]:
def coleta_diaria(date: datetime):
    """Run the tweet and upper-network collection backwards from ``date``.

    For every day after 2022-07-20, going back one day at a time, the previous
    day is collected for each key of ``ordem`` allowed by ``query_law``, then
    ``get_rede_ativa_superior`` runs for every key of ``ordem``. ``contagem``
    is called with the starting date at the end.
    """
    first_day = date
    oldest_day = datetime(year= 2022, month= 7, day= 20)

    while date > oldest_day:
        previous_day = date - timedelta(days= 1)

        thread = 'coleta'
        for query in (key for key in ordem if query_law(key, date)):
            print(f"thread {(thread.upper()+':'):<12}{query}")
            collect_day_tweets_from_date(_env.get(query), query, previous_day, thread)

        thread = 'superior'
        for query in ordem:
            print(f"thread {(thread.upper()+':'):<12}{query}")
            get_rede_ativa_superior(_env.get(query), query, previous_day, thread)

        date = previous_day

    contagem(first_day)

    

In [55]:
def retweets(query: list):
    """Collect the tweets of the given retweet query keys, day by day.

    Starts on 2022-07-20 and stops before yesterday (midnight). On each day
    only the keys allowed by ``query_law`` are collected.

    Args:
        query: Iterable of query keys stored in ``_env``.
    """
    thread = 'retweets'
    day = datetime(year= 2022, month= 7, day= 20)

    # The limit is recomputed on every pass so a long run follows the calendar.
    while day < datetime.now().replace(hour=0, minute=0, second=0, microsecond=0) - timedelta(days= 1):
        for key in query:
            if not query_law(key, day):
                continue
            print(f"thread {(thread.upper()+':'):<12}{key}")
            collect_day_tweets_from_date(_env.get(key), key, day, thread)

        day += timedelta(days= 1)
    

In [56]:
def likes():
    """Run ``get_likes`` for every key of ``ordem``, day by day.

    Starts on 2022-07-20 and stops before today (midnight). On each day only
    the keys allowed by ``query_law`` are processed.
    """
    thread = 'likes'
    day = datetime(year= 2022, month= 7, day= 20)

    # The limit is recomputed on every pass so a long run follows the calendar.
    while day < datetime.now().replace(hour=0, minute=0, second=0, microsecond=0):
        for key in ordem:
            if not query_law(key, day):
                continue
            print(f"thread {(thread.upper()+':'):<12}{key}")
            get_likes(_env.get(key), day, thread)

        day += timedelta(days= 1)

In [57]:
def quotesReplies():
    """Run the quotes and replies collections for every key of ``perfis``.

    Starts on 2022-07-20 and stops before today (midnight); for each day all
    quotes are collected first, then all replies.
    """
    global perfis

    date = datetime(year= 2022, month= 7, day= 20)

    while date < datetime(datetime.now().year, datetime.now().month, datetime.now().day):

        # get_quotes/get_replies take (query, query_name, date, thread); the
        # key itself is the query name, as in the other collection loops.
        thread = 'quotes'
        for query in perfis:
            print(f"thread {(thread.upper()+':'):<12}{query}")
            get_quotes(_env.get(query), query, date, thread)

        thread = 'replies'
        for query in perfis:
            print(f"thread {(thread.upper()+':'):<12}{query}")
            get_replies(_env.get(query), query, date, thread)

        date += timedelta(days= 1)

In [58]:
def main():
    """Start the background collection threads and keep the main thread alive.

    The retweets thread receives every key of ``retweets_queries``
    (``query_law`` still filters them per day). The likes thread and the daily
    collection are currently disabled.
    """
    threads = []

    # retweets() requires the list of query keys; without args the thread
    # died immediately with a TypeError.
    threads.append(Thread(target= retweets, args= (retweets_queries,)))
    #threads.append(Thread(target= likes))
    threads.append(Thread(target= quotesReplies))

    for t in threads:
        t.start()

    today = datetime(datetime.now().year, datetime.now().month, datetime.now().day) - timedelta(days= 1)

    # Wake up once a minute and notice when the calendar day changes.
    while True:
        if today < datetime(datetime.now().year, datetime.now().month, datetime.now().day):
            today = datetime(datetime.now().year, datetime.now().month, datetime.now().day)
            #coleta_diaria(today)
        sleep(60)
    

In [59]:
def rebuildJSON():
    """Rebuild the per-day state entries of ``_env`` from the files on disk.

    For every day from 2022-07-20 up to (not including) today and every query
    key allowed by ``query_law``, looks at the day's directory; when it holds
    a ``*_tweets.csv`` file, a state entry marked as finished is stored under
    ``<key><ddmmYYYY>``. The 'quotes' and 'replies' threads are marked as
    finished when an entry with that name exists in the directory.
    """
    global ordem, _env, _workspace

    suffix = '_tweets.csv'

    date = datetime(year= 2022, month= 7, day= 20)
    while date < datetime(datetime.now().year, datetime.now().month, datetime.now().day):

        for q in (ordem + retweets_queries):
            if not query_law(q, date):
                continue

            directory = _workspace + '/Eleicoes_2022_Pesquisa/coleta/hashtags/'+ q +'/'
            directory += date.strftime("%Y/%m/%d")

            try:
                tweets_file_dir = listdir(directory)
            except OSError:
                # Day not collected for this query.
                continue

            # listdir returns entries in arbitrary order, so locate the tweets
            # file by name instead of by position.
            tweets_files = sorted(name for name in tweets_file_dir if name.endswith(suffix))
            if not tweets_files:
                continue

            # NOTE(review): if a day ever holds several *_tweets.csv files the
            # last one in name order is used - confirm that is the wanted one.
            csv_path = directory + '/' + tweets_files[-1][:-len(suffix)]

            status_info = {
                "csv_path": csv_path,
                "next_page_token": 'fim',
                "rede_ativa_superior": True,
            }

            for thread in ('quotes', 'replies'):
                if thread in tweets_file_dir:
                    status_info.update({
                        thread: {
                            "index": 'fim',
                            "next_page_token": None,
                        }
                    })

            coleta_em_andamento = q + date.strftime("%d%m%Y")
            _env.set(coleta_em_andamento, status_info, True)

        date += timedelta(days= 1)

In [60]:
def testeJSON():
    """Print, for every query/day allowed by ``query_law``, whether a state entry exists.

    Covers every day from 2022-07-20 up to (not including) today and every key
    of ``ordem`` and ``retweets_queries``.
    """
    date = datetime(year= 2022, month= 7, day= 20)

    while date < datetime(datetime.now().year, datetime.now().month, datetime.now().day):

        for q in (ordem + retweets_queries):
            if query_law(q, date):
                coleta_em_andamento = q + date.strftime("%d%m%Y")

                status_info = _env.get(coleta_em_andamento, generate_if_missing = False)

                # Elsewhere in this file a missing entry is detected by
                # comparing this lookup with None, so "!= False" alone always
                # printed True. Accept both markers of "missing".
                found = status_info is not None and status_info is not False

                print(f'{found}: {q:>10}_{date.strftime("%d/%m/%Y"):>10}')

        date += timedelta(days= 1)

In [61]:
def get_dataframe(date: datetime, query_key: str):
    """Load the tweets CSV of one query/day as a DataFrame.

    The state entry is looked up under the simplified query text followed by
    the date formatted as ddmmYYYY; its ``csv_path`` gives the file prefix.
    """
    state_key = __simplify_query(_env.get(query_key, generate_if_missing = False)) + date.strftime("%d%m%Y")
    tweets_csv = _env.get(state_key, generate_if_missing = False)['csv_path'] + '_tweets.csv'

    return pd.read_csv(tweets_csv, sep=';', escapechar= '\\', dtype= _dtype)

In [62]:
def set_dataframe(df: pd.DataFrame, date: datetime, query_key: str):
    """Overwrite the tweets CSV of one query/day with ``df``.

    The state entry is looked up under the simplified query text followed by
    the date formatted as ddmmYYYY; its ``csv_path`` gives the file prefix.
    """
    state_key = __simplify_query(_env.get(query_key, generate_if_missing = False)) + date.strftime("%d%m%Y")
    tweets_csv = _env.get(state_key, generate_if_missing = False)['csv_path'] + '_tweets.csv'

    df.to_csv(tweets_csv, sep= ';', escapechar= '\\', index = False)

In [63]:
import sys

# Silence warnings unless the interpreter was started with explicit -W options.
if not sys.warnoptions:
    import os, warnings
    warnings.simplefilter("ignore") # Change the filter in this process
    # NOTE(review): the value set here is "default", not "ignore" - confirm
    # this is the filter wanted for subprocesses.
    os.environ["PYTHONWARNINGS"] = "default" # Also affect subprocesses

# Maintenance helpers, disabled for this run.
#rebuildJSON()
#testeJSON()

# Daily collection, disabled for this run.
#coleta_diaria(datetime(datetime.now().year, datetime.now().month, datetime.now().day) - timedelta(days= 1))
# Collect the retweet queries listed below, day by day.
retweets(['retweets_query_bolsonaro', 'retweets_query_lula', 'retweets_pos_eleicao'])

thread RETWEETS:   retweets_query_bolsonaro
thread RETWEETS:   retweets_query_lula
thread RETWEETS:   retweets_query_bolsonaro
thread RETWEETS:   retweets_query_lula
thread RETWEETS:   retweets_query_bolsonaro
thread RETWEETS:   retweets_query_lula
thread RETWEETS:   retweets_query_bolsonaro
thread RETWEETS:   retweets_query_lula
thread RETWEETS:   retweets_query_bolsonaro
thread RETWEETS:   retweets_query_lula
thread RETWEETS:   retweets_query_bolsonaro
thread RETWEETS:   retweets_query_lula
thread RETWEETS:   retweets_query_bolsonaro
thread RETWEETS:   retweets_query_lula
thread RETWEETS:   retweets_query_bolsonaro
thread RETWEETS:   retweets_query_lula
thread RETWEETS:   retweets_query_bolsonaro
thread RETWEETS:   retweets_query_lula
thread RETWEETS:   retweets_query_bolsonaro
thread RETWEETS:   retweets_query_lula
thread RETWEETS:   retweets_query_bolsonaro
thread RETWEETS:   retweets_query_lula
thread RETWEETS:   retweets_query_bolsonaro
thread RETWEETS:   retweets_query_lula
thre

KeyboardInterrupt: 