# Moviendo Inferencia TW a Producción

# Formular la pregunta

> "¿Cuál es la probabilidad de que un tweet que se origina dentro de las coordenadas geográficas (-78.31, 0.44, -70.71, 11.39) contenga al menos una ocurrencia de la palabra 'yo', con cualquier combinación de acentos y de letras minúsculas y mayúsculas?"

# Recopilar datos

In [1]:
import os

import tweepy
from tweepy import Stream

# Sanity check: this line only runs if every import above succeeded.
print('The libraries was imported!')


The libraries was imported!


### Cargar credenciales

In [2]:
# Twitter API credentials are looked up in the process environment so that no
# secret is stored in the notebook; a variable that is not set yields None.
CONSUMER_KEY = os.environ.get('TWITTER_CONSUMER_KEY_DATA_SCIENCE')
CONSUMER_SECRET = os.environ.get('TWITTER_CONSUMER_SECRET_DATA_SCIENCE')
ACCESS_TOKEN = os.environ.get('TWITTER_ACCESS_TOKEN_DATA_SCIENCE')
ACCESS_TOKEN_SECRET = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET_DATA_SCIENCE')

In [3]:
# Build the OAuth handler from the consumer key/secret and attach the user
# access token to it.
auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
# NOTE(review): the streaming cell further down is given `auth` directly; `api`
# is not referenced in the visible cells — confirm whether it is still needed.
api = tweepy.API(auth)

### Establecer ubicación (COL)

In [4]:
# Bounding box handed to the streaming filter as `locations`; the same four
# numbers appear in the question stated at the top of the notebook.
# NOTE(review): presumably [west_lon, south_lat, east_lon, north_lat] — confirm
# against the Twitter streaming API documentation.
COLOMBIA_GEO_LOCATION_BOUNDING_BOX = [-78.31, 0.44, -70.71, 11.39]

### Definir funciones de limpieza

In [5]:
from unidecode import unidecode

In [6]:
def make_lowercase(tweet):
    """Return ``tweet`` with its cased characters converted to lowercase."""
    lowered_tweet = tweet.lower()
    return lowered_tweet


def remove_diacritics(tweet):
    """Return ``tweet`` after passing it through ``unidecode``.

    ``unidecode`` transliterates the text to plain ASCII, which is how accented
    letters lose their diacritics here.
    """
    ascii_tweet = unidecode(tweet)
    return ascii_tweet


def remove_non_alpha_characters(tweet):
    """Return ``tweet`` keeping only alphabetic characters and plain spaces.

    Digits, punctuation and any whitespace other than ``' '`` (tabs, newlines)
    are dropped, so words separated only by those characters end up joined.
    """
    kept_characters = []
    for symbol in tweet:
        if symbol.isalpha() or symbol == ' ':
            kept_characters.append(symbol)
    return ''.join(kept_characters)

### Establecer una conexión al API de Twitter

### Definición de la base de datos

jbeltranleon=# create database twitter_inference;

jbeltranleon=# \c twitter_inference

twitter_inference=# CREATE TABLE tweets (id_str VARCHAR(50), text VARCHAR(200));


In [7]:
# Connection settings for the PostgreSQL database and table created with the
# psql commands shown above; the cells below pass them to psycopg2.connect and
# format TABLE_NAME into their SQL statements.
DATA_BASE_NAME = 'twitter_inference'
TABLE_NAME = 'tweets'
USER = 'jbeltranleon'
HOST = 'localhost'
# NOTE(review): empty password — presumably the local server trusts this user;
# confirm, and read it from the environment if a real password is ever needed.
PASSWORD = ''

In [8]:
import psycopg2
from tweepy import StreamListener

In [9]:
class PersistedStreamListener(StreamListener):
    """Stream listener that cleans every incoming status and stores it in PostgreSQL.

    One database connection is opened when the listener is created and is
    reused for every insert.
    """

    def __init__(self):
        # Open the connection first: if the database is unreachable the
        # listener is never constructed.
        self._database_connection = psycopg2.connect(
            dbname=DATA_BASE_NAME,
            user=USER,
            host=HOST,
            password=PASSWORD
        )

        super().__init__()

    def on_status(self, status):
        """Clean the text of ``status`` and persist it together with its ``id_str``."""
        cleaned_status_text = self._clean_status_text(status.text)
        self._insert_status(id_str=status.id_str, text=cleaned_status_text)

    def _clean_status_text(self, status_text):
        """Return ``status_text`` after applying every cleaning function in order."""
        cleaned_status_text = status_text
        for cleaning_function in self._cleaning_functions:
            cleaned_status_text = cleaning_function(cleaned_status_text)
        return cleaned_status_text

    def _insert_status(self, id_str, text):
        """Insert one ``(id_str, text)`` row into the tweets table and commit it.

        Raises:
            Whatever the database driver raises; the transaction is rolled back
            first so the shared connection stays usable for later inserts.
        """
        # Only the table name (a notebook constant, not user input) is formatted
        # into the statement. The values are sent as bound parameters so the
        # driver quotes them; formatting them into the SQL text was
        # injection-prone and broke on any value containing a quote.
        insert_statement = """INSERT INTO {table_name}(id_str, text) VALUES (%s, %s)""".format(
            table_name=TABLE_NAME
        )

        cursor = self._database_connection.cursor()
        try:
            cursor.execute(insert_statement, (id_str, text))
            self._database_connection.commit()
        except Exception:
            # Without a rollback a failed statement leaves the connection in an
            # aborted transaction and every following insert would fail too.
            self._database_connection.rollback()
            raise
        finally:
            cursor.close()

    @property
    def _cleaning_functions(self):
        """Cleaning steps, in the order they are applied to a status text."""
        return [make_lowercase, remove_diacritics, remove_non_alpha_characters]

### Obtener Tweets del firehose

In [10]:
# Bind the authenticated handler to a fresh listener instance; constructing the
# listener is what opens its database connection.
streaming_api = Stream(auth=auth, listener=PersistedStreamListener()) 

In [11]:
# `async` is a reserved keyword from Python 3.7 on, so `async=True` is a
# SyntaxError there; tweepy 3.7.0 renamed the argument to `is_async`.
# With it set, the stream is consumed in a background thread and this cell
# returns immediately, leaving the kernel free for the analysis cells below.
streaming_api.filter(locations=COLOMBIA_GEO_LOCATION_BOUNDING_BOX, is_async=True)

In [12]:
import numpy as np
from scipy.stats import beta as beta_distribution

In [13]:
# 1000 evenly spaced points strictly inside (0, 1): linspace yields 1002 points
# including both endpoints, and the slice drops the 0 and the 1.
# NOTE(review): presumably the grid on which the beta distribution is
# evaluated — confirm in the later cells.
X_VALUES = np.linspace(0, 1, 1002)[1:-1]
# A second connection used for reading, separate from the one the stream
# listener opens for its inserts.
DATABASE_CONNECTION = psycopg2.connect(dbname=DATA_BASE_NAME, user=USER, host=HOST, password=PASSWORD)
# NOTE(review): the question at the top of the notebook asks about the word
# 'yo', but the keyword configured here is 'cambio' — confirm which is intended.
KEYWORD = 'cambio'

In [14]:
def fetch_tweets(database_connection=DATABASE_CONNECTION):
    """Return the ``text`` column of every row currently in the tweets table.

    Args:
        database_connection: open DB-API connection to query. The default is
            the module-level ``DATABASE_CONNECTION`` as it existed when this
            function was defined.

    Returns:
        list: one text value per row, in the order the database returned them.
    """
    # TABLE_NAME is a notebook constant, not user input, so formatting it into
    # the statement is safe (identifiers cannot be sent as bound parameters).
    select_statement = """SELECT text FROM {table}""".format(table=TABLE_NAME)

    cursor = database_connection.cursor()
    try:
        cursor.execute(select_statement)
        rows = cursor.fetchall()
    finally:
        # Release the cursor even when the query fails; previously it was
        # never closed at all.
        cursor.close()

    return [row[0] for row in rows]
