## Create bulls/bears datasets

In [131]:
import json
import datetime
import pandas as pd
import numpy as np
import gc
import glob
import json
import os
import re
import html
import preprocessor as p
import mysql.connector as sql
from sshtunnel import SSHTunnelForwarder
from sqlalchemy import create_engine
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [132]:
import traceback
import logging
# Configure logging at INFO level with a timestamped record format, then
# create the logger used by this notebook.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

In [133]:
# Notebook display settings: six-decimal floats and text columns up to 400
# characters wide.
pd.set_option('display.float_format', '{:.6f}'.format)
pd.set_option('display.max_colwidth', 400)

# Restrict the tweet preprocessor options to URL and MENTION.
p.set_options(p.OPT.URL, p.OPT.MENTION)

# Single VADER sentiment analyzer instance for the notebook.
analyzer = SentimentIntensityAnalyzer()

In [134]:
# Credentials come from the environment; an unset variable yields None.
ssh_username = os.environ.get('AUTOMLPREDICTOR_DB_SSH_USER')
ssh_password = os.environ.get('AUTOMLPREDICTOR_DB_SSH_PASSWORD')
db_password = os.environ.get('AUTOMLPREDICTOR_DB_SQL_PASSWORD')

# Database location; the host falls back to localhost when not configured.
db_host = os.environ.get('AUTOMLPREDICTOR_DB_SERVER_IP', '127.0.0.1')
db_port = 3306
db_user = 'root'
db = 'automlpredictor_db_dashboard'

# Flag passed to the query helper to choose between an SSH tunnel and a
# direct connection.
use_ssh = True

In [135]:
# Directory that the JSON inputs and outputs of this notebook are read from
# and written to.
data_path = './data/twitter'

In [136]:
def get_friends_followers(leaders_path, friends_path, followers_path, intersect=True):
    """Build one account table from leader, friend and follower JSON-lines files.

    Args:
        leaders_path: Path to a JSON-lines file of leader accounts.
        friends_path: Path to a JSON-lines file of friend accounts.
        followers_path: Path to a JSON-lines file of follower accounts.
        intersect: If True, keep only accounts whose ``id`` appears in both
            the friends file and the followers file. If False, keep every
            account found in either file.

    Returns:
        A DataFrame with a fresh 0..n-1 index holding the selected
        friend/follower rows followed by the leader rows, with fully
        identical rows removed.
    """
    # One JSON object per line is the 'records' layout. The builtin type
    # ``str`` used to be passed as ``orient``; that is not a documented value
    # and only worked because pandas fell through to its default branch.
    df_leaders = pd.read_json(leaders_path, lines=True, orient='records')
    df_leaders['id'] = df_leaders['id'].astype(np.int64)

    df_friends = pd.read_json(friends_path, lines=True, orient='records')
    df_friends['id'] = df_friends['id'].astype(np.int64)

    df_followers = pd.read_json(followers_path, lines=True, orient='records')

    if intersect:
        # Index followers by id so the friend ids can be matched against it;
        # the inner join keeps only accounts present on both sides.
        df_followers = df_followers.reset_index(drop=True).set_index(['id'])

        df_merge = pd.merge(
            df_friends[['id']], df_followers, left_on=['id'],
            right_index=True, how='inner', sort=False
        )
    else:
        df_merge = pd.concat([df_friends, df_followers]).drop_duplicates()

    # Leaders are appended last; rows that are identical in every column are
    # collapsed to one.
    df_union = pd.concat([df_merge, df_leaders]).drop_duplicates().reset_index(drop=True)
    return df_union

In [137]:
# Bearish accounts: the leaders plus every account found in either their
# friends file or their followers file (intersect disabled).
df_bears = get_friends_followers(
    leaders_path=f'{data_path}/bears_20180918.json',
    friends_path=f'{data_path}/bears_friends_20180916.json',
    followers_path=f'{data_path}/bears_followers_20180914.json',
    intersect=False,
)

In [138]:
# Bullish accounts: the leaders plus every account found in either their
# friends file or their followers file (intersect disabled).
df_bulls = get_friends_followers(
    leaders_path=f'{data_path}/bulls_20180918.json',
    friends_path=f'{data_path}/bulls_friends_20180915.json',
    followers_path=f'{data_path}/bulls_followers_20180916.json',
    intersect=False,
)

In [139]:
# Collect the account ids of each side so overlap can be detected.
# `bulls_ids` used to be referenced without being defined anywhere in this
# notebook, which raised NameError on a fresh top-to-bottom run.
bears_ids = set(df_bears['id'].values)
bulls_ids = set(df_bulls['id'].values)

# Flag bearish accounts that also appear in the bullish table.
df_bears['mixed'] = df_bears['id'].isin(bulls_ids)

In [140]:
# Keep only the bearish accounts that were not flagged as mixed.
df_bears = df_bears.loc[~df_bears['mixed']]

In [141]:
# Reduce the bearish table to its identifying columns.
df_bears = df_bears.loc[:, ['id', 'name', 'screen_name']]

In [144]:
# Write the bullish account table as JSON lines, one record per line.
df_bulls.to_json(
    path_or_buf=f'{data_path}/extended_bulls_20180918.json',
    lines=True,
    orient='records',
)

In [145]:
# Write the bearish account table as JSON lines, one record per line.
df_bears.to_json(
    path_or_buf=f'{data_path}/extended_bears_20180918.json',
    lines=True,
    orient='records',
)

In [146]:
# Stack both account tables, turning the concat key ('Bullish' / 'Bearish')
# into a `sentiment` column, and index the result by screen name.
df_bulls_bears = (
    pd.concat([df_bulls, df_bears], keys=['Bullish', 'Bearish'])
    .reset_index()
    .drop(columns=['level_1'])
    .rename(columns={'level_0': 'sentiment'})
    .set_index(['screen_name'])
)

In [147]:
# df_bulls_bears

## Retrieve the posts to enrich

In [148]:
def _run_query(q, **connect_kwargs):
    """Open a MySQL connection, run ``q`` and close the connection even on error."""
    conn = sql.connect(**connect_kwargs)
    try:
        return pd.read_sql_query(q, conn)
    finally:
        conn.close()


def query(use_ssh, q, db_host, db_user, db_password, db_port, db, ssh_username, ssh_password, charset='utf8mb4'):
    """Run a SQL query against MySQL and return the result as a DataFrame.

    Args:
        use_ssh: If True, connect through an SSH tunnel opened to ``db_host``
            on port 22; otherwise connect to ``db_host:db_port`` directly.
        q: SQL text to execute.
        db_host: Host of the database server (also the SSH host when tunnelling).
        db_user: Database user name.
        db_password: Database password.
        db_port: Database port (on the remote side when tunnelling).
        db: Name of the database to select.
        ssh_username: SSH user name, used only when ``use_ssh`` is True.
        ssh_password: SSH password, used only when ``use_ssh`` is True.
        charset: Connection character set.

    Returns:
        The DataFrame produced by ``pd.read_sql_query``.
    """
    credentials = dict(user=db_user, passwd=db_password, db=db, charset=charset)

    if not use_ssh:
        return _run_query(q, host=db_host, port=db_port, **credentials)

    # The tunnel forwards a local port to the database port on the remote
    # host, so the connection targets localhost on that forwarded port.
    with SSHTunnelForwarder(
            ssh_address_or_host=(db_host, 22),
            ssh_password=ssh_password,
            ssh_username=ssh_username,
            remote_bind_address=('127.0.0.1', db_port)
    ) as server:
        return _run_query(
            q, host='127.0.0.1', port=server.local_bind_port, **credentials
        )

In [149]:
# Selects posts of type 'twitter-topic' or 'twitter-user' created within the
# last 14 days (14*24*3600000 ms), together with their stored sentiment
# columns.
sql_posts_to_enrich = """
SELECT post_id, post_type, body, impact, link, user_name, 
created_at_epoch_ms, client_received_epoch_ms, 
sentiment_ml_model as original_ml_sentiment, 
sentiment_vader_normalized, 
sentiment_mixed as original_sentiment_mixed
FROM automlpredictor_db_dashboard.analysis_posts_sentiment
 WHERE created_at_epoch_ms >=(SELECT UNIX_TIMESTAMP(NOW())*1000-(14*24*3600000)) 
 AND post_type in ('twitter-topic', 'twitter-user');
"""

# Fetch the posts to enrich using the connection settings defined earlier.
df_posts_to_enrich = query(
    use_ssh=use_ssh,
    q=sql_posts_to_enrich,
    db_host=db_host,
    db_user=db_user,
    db_password=db_password,
    db_port=db_port,
    db=db,
    ssh_username=ssh_username,
    ssh_password=ssh_password,
)


2018-09-18 20:15:07,506 - paramiko.transport - INFO - Connected (version 2.0, client OpenSSH_7.2p2)
2018-09-18 20:15:07,715 - paramiko.transport - INFO - Authentication (publickey) failed.
2018-09-18 20:15:07,756 - paramiko.transport - INFO - Connected (version 2.0, client OpenSSH_7.2p2)
2018-09-18 20:15:07,932 - paramiko.transport - INFO - Authentication (publickey) successful!


In [150]:
# Snapshot the fetched posts to a gzip-compressed JSON-lines file.
df_posts_to_enrich.to_json(
    path_or_buf=f'{data_path}/twitter_posts_pre_enrichment_20180918.json.gz',
    compression='gzip',
    lines=True,
    orient='records',
)

In [151]:
# Reload the posts from the gzip-compressed JSON-lines snapshot.
# The file holds one record per line, so 'records' is the matching layout;
# the builtin type `str` used to be passed as `orient`, which is not a
# documented value.
df_posts_to_enrich = pd.read_json(
    f'{data_path}/twitter_posts_pre_enrichment_20180918.json.gz',
    lines=True, orient='records', compression='gzip'
)

In [184]:
# df_posts_to_enrich

In [172]:
# Attach the account columns to each post by matching the post's user_name
# against the screen-name index; posts from unlisted users are dropped
# (inner join).
df_enriched = df_posts_to_enrich.merge(
    df_bulls_bears,
    how='inner',
    left_on=['user_name'],
    right_index=True,
    sort=False,
)

In [183]:
# df_enriched

In [174]:
# Matches runs of two or more spaces.
re1 = re.compile(r'  +')

# Literal substitutions applied strictly in this order: later patterns can
# match text produced by earlier ones, so the sequence must not be reordered.
_FIXUP_SUBSTITUTIONS = (
    ('&#', "#"),
    ('#39;', "'"),
    ('amp;', '&'),
    ('#146;', "'"),
    ('nbsp;', ' '),
    ('#36;', '$'),
    ('\\n', "\n"),
    ('quot;', "'"),
    ('<br />', "\n"),
    ('\\"', '"'),
    ('<unk>', 'u_n'),
    (' @.@ ', '.'),
    (' @-@ ', '-'),
    ('\\', ' \\ '),
)


def fixup(x):
    """Return a cleaned version of the post text ``x``.

    The text is first passed through the tweet preprocessor, then the literal
    substitutions are applied in order, remaining HTML entities are unescaped
    and every run of two or more spaces is collapsed to a single space.
    """
    text = p.clean(x)
    for old, new in _FIXUP_SUBSTITUTIONS:
        text = text.replace(old, new)
    return re1.sub(' ', html.unescape(text))

In [175]:
# Store the cleaned text of every post body in a new column.
df_enriched['clean_body'] = df_enriched['body'].apply(fixup)

In [176]:
# Move the current index into a regular column, name that column 'id', and
# convert the new row labels to strings.
# NOTE(review): the merged frame appears to already carry an `id` column from
# the account table, so this rename may leave two columns called `id` --
# confirm whether that is intended.
df_enriched = df_enriched.reset_index()
df_enriched = df_enriched.rename(index=str, columns={'index': 'id'})

In [177]:
# Repeat count per post: impact divided by 100, plus one, truncated to a
# 32-bit integer.
df_enriched['repeat'] = df_enriched['impact'].div(100).add(1).astype(np.int32)

In [178]:
# Display the computed repeat counts for inspection.
df_enriched['repeat']

0         1
1         1
2         1
3         1
4         1
5         1
6         1
7         1
8         1
9         1
10        1
11        1
12        1
13        1
14        1
15       35
16        1
17        1
18        1
19        1
20        1
21        1
22        1
23        1
24        1
25        1
26        1
27        1
28        1
29        1
         ..
23254     1
23255     1
23256     1
23257     1
23258     1
23259     2
23260     1
23261     1
23262     1
23263     1
23264     1
23265     1
23266     1
23267     1
23268     1
23269     1
23270     1
23271     1
23272     1
23273     1
23274     1
23275     1
23276     1
23277     1
23278     1
23279     1
23280     1
23281     1
23282     1
23283     1
Name: repeat, Length: 23284, dtype: int32

In [181]:
# Duplicate every row as many times as its `repeat` value says.
# The frame is rebuilt from the raw value array, so the original column
# labels are not carried over (columns get default integer labels).
df_enriched_repeated = pd.DataFrame(
    df_enriched.values.repeat(df_enriched['repeat'].tolist(), axis=0)
)

In [182]:
# Write the repeated, enriched posts to a gzip-compressed JSON-lines file.
df_enriched_repeated.to_json(
    path_or_buf=f'{data_path}/twitter_posts_post_enriched_repeated_20180901_20180918.json.gz',
    compression='gzip',
    lines=True,
    orient='records',
)

In [78]:
# Same join as for the enriched posts, but as a left join so that posts from
# users missing in the account table are kept (with empty account columns).
df_enriched_unknown_user = df_posts_to_enrich.merge(
    df_bulls_bears,
    how='left',
    left_on=['user_name'],
    right_index=True,
    sort=False,
)

In [95]:
# Count posts per user for the rows that found no account match (null `id`)
# and write the counts as JSON lines with `screen_name` / `counts` fields.
(
    df_enriched_unknown_user
    .loc[df_enriched_unknown_user['id'].isnull(), 'user_name']
    .value_counts()
    .rename_axis('screen_name')
    .reset_index(name='counts')
    .to_json(
        f'{data_path}/twitter_unknown_users_20180901_20180918.json',
        orient='records',
        lines=True,
    )
)