## Create bulls/bears datasets

In [1]:
import json
import datetime
import pandas as pd
import numpy as np
import gc
import glob
import json
import os
import re
import html
import preprocessor as p
import mysql.connector as sql
from sshtunnel import SSHTunnelForwarder
from sqlalchemy import create_engine
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [2]:
import traceback
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

In [3]:
pd.options.display.float_format = '{:.6f}'.format
pd.options.display.max_colwidth = 400
p.set_options(p.OPT.URL, p.OPT.MENTION)
analyzer = SentimentIntensityAnalyzer()

In [4]:
ssh_username = os.getenv('AUTOMLPREDICTOR_DB_SSH_USER')
ssh_password = os.getenv('AUTOMLPREDICTOR_DB_SSH_PASSWORD')

db_host = os.getenv('AUTOMLPREDICTOR_DB_SERVER_IP', '127.0.0.1')
db_user = 'root'
db_password = os.getenv('AUTOMLPREDICTOR_DB_SQL_PASSWORD')
db_port = 3306
db = 'automlpredictor_db_dashboard'
use_ssh = True

In [5]:
data_path='./data/twitter'

In [6]:
def get_friends_followers(leaders_path, friends_path, followers_path, intersect=True):
    df_leaders = pd.read_json(leaders_path, lines=True, orient=str)
    df_leaders['id'] = df_leaders[['id']].astype(np.int64)
    
    df_friends = pd.read_json(friends_path, lines=True, orient=str) # .reset_index(drop=True).set_index(['id'])
    df_friends['id'] = df_friends[['id']].astype(np.int64)
    
    df_followers = pd.read_json(followers_path, lines=True, orient=str)# .reset_index(drop=True).set_index(['id'])
    
    if intersect:
        df_followers = df_followers.reset_index(drop=True).set_index(['id'])
        
        df_merge = pd.merge(
            df_friends[['id']], df_followers, left_on=['id'], 
            right_index=True, how='inner', sort=False
        )
    else:
        df_merge = pd.concat([df_friends, df_followers]).drop_duplicates()
    
    df_union = pd.concat([df_merge, df_leaders]).drop_duplicates().reset_index(drop=True)#.set_index(['id'])
    return df_union

In [137]:
df_bears = get_friends_followers(f'{data_path}/bears_20180918.json',
                                 f'{data_path}/bears_friends_20180916.json', 
                                 f'{data_path}/bears_followers_20180914.json', False)

In [138]:
df_bulls = get_friends_followers(f'{data_path}/bulls_20180918.json',
                                 f'{data_path}/bulls_friends_20180915.json', 
                                 f'{data_path}/bulls_followers_20180916.json', False)    

In [139]:
bears_ids = set(df_bears['id'].values)
df_bears['mixed'] = df_bears['id'].apply(lambda x: x in bulls_ids)

In [140]:
df_bears = df_bears[~df_bears['mixed']]

In [141]:
df_bears = df_bears[['id', 'name', 'screen_name']]

In [144]:
df_bulls.to_json(
    f'{data_path}/extended_bulls_20180918.json', orient='records', lines=True
)

df_bears.to_json(
    f'{data_path}/extended_bears_20180918.json', orient='records', lines=True
)

In [8]:
df_bulls = pd.read_json(
    f'{data_path}/extended_bulls_20180918.json', 
    lines=True, orient=str
)

df_bears = pd.read_json(
    f'{data_path}/extended_bears_20180918.json', 
    lines=True, orient=str
)

In [9]:
df_bulls_bears = pd.concat(
    [df_bulls, df_bears], keys=['Bullish', 'Bearish',]
).reset_index().drop(['level_1'], axis=1).rename(
    {'level_0': 'sentiment'}, axis=1
).set_index(['screen_name'])

In [10]:
# df_bulls_bears

## Retrieve the posts to enrich

In [11]:
def query(use_ssh, q, db_host, db_user, db_password, db_port, db, ssh_username, ssh_password, charset='utf8mb4'):

    if use_ssh:
        with SSHTunnelForwarder(
                ssh_address_or_host=(db_host, 22),
                ssh_password=ssh_password,
                ssh_username=ssh_username,
                remote_bind_address=('127.0.0.1', db_port)
        ) as server:
            conn = sql.connect(host='127.0.0.1',
                               port=server.local_bind_port,
                               user=db_user,
                               passwd=db_password,
                               db=db,
                               charset=charset)
            response = pd.read_sql_query(q, conn)
            conn.close()
            return response
    else:
        conn = sql.connect(host=db_host,
                           port=db_port,
                           user=db_user,
                           passwd=db_password,
                           db=db,
                           charset=charset)
        response = pd.read_sql_query(q, conn)
        conn.close()
        return response

In [12]:
sql_posts_to_enrich = """
SELECT post_id, post_type, body, impact, link, user_name, 
created_at_epoch_ms, client_received_epoch_ms, 
sentiment_ml_model as original_ml_sentiment, 
sentiment_vader_normalized, 
sentiment_mixed as original_sentiment_mixed
FROM automlpredictor_db_dashboard.analysis_posts_sentiment
 WHERE created_at_epoch_ms >=(SELECT UNIX_TIMESTAMP(NOW())*1000-(14*24*3600000)) 
 AND post_type in ('twitter-topic', 'twitter-user');
"""

df_posts_to_enrich = query(
    use_ssh, sql_posts_to_enrich, db_host, db_user, db_password, db_port, db, 
    ssh_username, ssh_password
)


2018-09-18 15:37:37,460| ERROR   | Could not resolve IP address for paperspace_streaming-1, aborting!
2018-09-18 15:37:37,460 - sshtunnel.SSHTunnelForwarder - ERROR - Could not resolve IP address for paperspace_streaming-1, aborting!


BaseSSHTunnelForwarderError: Could not establish session to SSH gateway

In [150]:
df_posts_to_enrich.to_json(
    f'{data_path}/twitter_posts_pre_enrichment_20180918.json.gz', 
    orient='records', lines=True, compression='gzip'
)

In [14]:
df_posts_to_enrich = pd.read_json(
    f'{data_path}/twitter_posts_pre_enrichment_20180918.json.gz', 
    lines=True, orient=str, compression='gzip'
)

In [15]:
# df_posts_to_enrich

In [60]:
df_enriched = pd.merge(
    df_posts_to_enrich, df_bulls_bears, left_on=['user_name'], 
    right_index=True, how='inner', sort=False
)

In [61]:
# df_enriched

In [62]:
re1 = re.compile(r'  +')

def fixup(x):
    x = p.clean(x)
    
    x = x.replace('&#', "#").replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace(
        'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace(
        '<br />', "\n").replace('\\"', '"').replace('<unk>','u_n').replace(' @.@ ','.').replace(
        ' @-@ ','-').replace('\\', ' \\ ')
    return re1.sub(' ', html.unescape(x))

In [63]:
df_enriched['clean_body'] = df_enriched['body'].apply(lambda body: fixup(body))

In [54]:
df_enriched=df_enriched.reset_index().rename(index=str, columns={'index':'id'})

In [55]:
df_enriched['repeat']=(df_enriched['impact']/100 + 1).astype(np.int32)

In [67]:
df_enriched.loc[df_enriched['user_name']=='TeslaCharts', 'sentiment'] = 'Bearish'

In [68]:
df_enriched[df_enriched['user_name']=='TeslaCharts']

Unnamed: 0,body,client_received_epoch_ms,created_at_epoch_ms,impact,link,original_ml_sentiment,original_sentiment_mixed,post_id,post_type,sentiment_vader_normalized,user_name,sentiment,id,name,clean_body
24,"RT @xenomorpher1: $TSLA stock price of $280 solidly in Muskian Freak-out zone. So, what do we get? Premarket buying coordinated with inside…",1536951468800,1536282272000,1,https://twitter.com/TeslaCharts/status/1037869190282190849,0.000000,0.100000,1037869190282190849,twitter-topic,0.500000,TeslaCharts,Bearish,962805945020702722,TeslaCharts,"RT : $TSLA stock price of $280 solidly in Muskian Freak-out zone. So, what do we get? Premarket buying coordinated with inside…"
256,"RT @lazygetter: ""only two car companies that didnt go BK "" --10 DRINKS #muskCider $TSLA $TSLAQ",1536951415860,1536301701000,1,https://twitter.com/TeslaCharts/status/1037950680273760256,0.000000,0.100000,1037950680273760256,twitter-topic,0.500000,TeslaCharts,Bearish,962805945020702722,TeslaCharts,"RT : ""only two car companies that didnt go BK "" --10 DRINKS #muskCider $TSLA $TSLAQ"
632,"RT @markbspiegel: Hey, I know one that's about to be available!!!\n\n$TSLAQ https://t.co/wdXSQBBfk5",1536951331566,1536326491000,1,https://twitter.com/TeslaCharts/status/1038054660089765888,1.000000,0.900000,1038054660089765888,twitter-topic,0.500000,TeslaCharts,Bearish,962805945020702722,TeslaCharts,"RT : Hey, I know one that's about to be available!!! $TSLAQ"
770,RT @PlugInFUD: $tsla CAO Morton likely got hired as a fixer as by then they knew things were pretty bad. (you don't leave CFO post at other…,1536951303505,1536329923000,1,https://twitter.com/TeslaCharts/status/1038069054362320896,0.000000,0.100000,1038069054362320896,twitter-topic,0.480000,TeslaCharts,Bearish,962805945020702722,TeslaCharts,RT : $tsla CAO Morton likely got hired as a fixer as by then they knew things were pretty bad. (you don't leave CFO post at other…
854,RT @markbspiegel: Important point of information:\n\nI've just been informed that although the $TSLA CAO may have set a record for shortest t…,1536951283490,1536332220000,1,https://twitter.com/TeslaCharts/status/1038078686900219904,0.010000,0.130000,1038078686900219904,twitter-topic,0.600000,TeslaCharts,Bearish,962805945020702722,TeslaCharts,RT : Important point of information: I've just been informed that although the $TSLA CAO may have set a record for shortest t…
855,RT @cppinvest: It has been said that a picture is worth a thousand words. I agree. \n\n$TSLAQ https://t.co/EC4P6TuW61,1536951283490,1536332229000,1,https://twitter.com/TeslaCharts/status/1038078726397943809,1.000000,0.950000,1038078726397943809,twitter-topic,0.760000,TeslaCharts,Bearish,962805945020702722,TeslaCharts,RT : It has been said that a picture is worth a thousand words. I agree. $TSLAQ
892,"RT @lazygetter: Now that the CAO is gone, we know whos gonna the do the Q3 accounting while on 420 $TSLA $TSLAQ",1536951272430,1536333416000,1,https://twitter.com/TeslaCharts/status/1038083705171050505,0.150000,0.220000,1038083705171050505,twitter-topic,0.500000,TeslaCharts,Bearish,962805945020702722,TeslaCharts,"RT : Now that the CAO is gone, we know whos gonna the do the Q3 accounting while on 420 $TSLA $TSLAQ"
1275,This Week in Enron - Up in Smoke Edition\n\n$TSLA\n$TSLAQ https://t.co/KR4cTRRH6b,1537143528454,1536350570000,1,https://twitter.com/TeslaCharts/status/1038155654568005634,0.970000,0.870000,1038155654568005634,twitter-topic,0.500000,TeslaCharts,Bearish,962805945020702722,TeslaCharts,This Week in Enron - Up in Smoke Edition $TSLA $TSLAQ
1327,Ok now you are just trolling $TSLAQ @SEC_Enforcement https://t.co/EA7Co5cwfP,1537143517354,1536352956000,1,https://twitter.com/TeslaCharts/status/1038165661099458560,0.930000,0.880000,1038165661099458560,twitter-topic,0.650000,TeslaCharts,Bearish,962805945020702722,TeslaCharts,Ok now you are just trolling $TSLAQ
1664,"1/ Dear $TSLAQ - if somebody tries to DM you and offer you information about $TSLA, unless you are 100% sure the ac… https://t.co/LUAy5cAREg",1537251316398,1536415389000,1,https://twitter.com/TeslaCharts/status/1038427523485507584,0.000000,0.160000,1038427523485507584,twitter-topic,0.800000,TeslaCharts,Bearish,962805945020702722,TeslaCharts,"1/ Dear $TSLAQ - if somebody tries to DM you and offer you information about $TSLA, unless you are 100% sure the ac…"


In [59]:
df_enriched

Unnamed: 0,id,body,client_received_epoch_ms,created_at_epoch_ms,impact,link,original_ml_sentiment,original_sentiment_mixed,post_id,post_type,sentiment_vader_normalized,user_name,sentiment,id.1,name,clean_body,repeat
0,0,"RT @iliketeslas: I hope @elonmusk sues the shorts for defamation. We have so many witneses and screenshots, I wouldnt mind seeing some of…",1536951474420,1536277191000,1,https://twitter.com/xandriteme/status/1037847880885628930,1.000000,0.940000,1037847880885628930,twitter-topic,0.720000,xandriteme,Bullish,2837533292,Keith White,"RT : I hope sues the shorts for defamation. We have so many witneses and screenshots, I wouldnt mind seeing some of…",1
1,1525,RT @BarkMSmeagol: @S_Padival @skabooshka You mean like when that dum dum bragged about Tesla only hitting their end of Q2 M3 production goa…,1536949609674,1536375771000,1,https://twitter.com/xandriteme/status/1038261354526720001,0.740000,0.730000,1038261354526720001,twitter-topic,0.680000,xandriteme,Bullish,2837533292,Keith White,RT : You mean like when that dum dum bragged about Tesla only hitting their end of Q2 M3 production goa…,1
2,1778,RT @Alpsoy66: All $tslaq short shorts tweets and interactions adamantly believe that company Q3 update is a lie ! Cars are not produced or…,1536949544736,1536430269000,1,https://twitter.com/xandriteme/status/1038489935550996480,0.950000,0.860000,1038489935550996480,twitter-topic,0.500000,xandriteme,Bullish,2837533292,Keith White,RT : All $tslaq short shorts tweets and interactions adamantly believe that company Q3 update is a lie ! Cars are not produced or…,1
3,1785,"RT @TeslaHab: After waiting 2.5 years, it’s finally happening! $TSLA $TSLAQ https://t.co/rQTT4mhcow",1536949544729,1536431309000,1,https://twitter.com/xandriteme/status/1038494296637243392,0.890000,0.810000,1038494296637243392,twitter-topic,0.500000,xandriteme,Bullish,2837533292,Keith White,"RT : After waiting 2.5 years, it’s finally happening! $TSLA $TSLAQ",1
4,2496,RT @ellec_uk: $TSLAQ going apeshit because @jack founder and CEO of twitter endorsed @elonmusk interview with @joerogan. Funniest thing I'v…,1536949383611,1536524905000,1,https://twitter.com/xandriteme/status/1038886866232045568,1.000000,0.960000,1038886866232045568,twitter-topic,0.790000,xandriteme,Bullish,2837533292,Keith White,RT : $TSLAQ going apeshit because founder and CEO of twitter endorsed interview with . Funniest thing I'v…,1
5,2733,"RT @ellec_uk: That's it, games over $TSLAQ $TSLA\nTesla Model 3 Becomes #1 Best Selling Car In The US https://t.co/MtRhukJYYn",1536949324232,1536545822000,1,https://twitter.com/xandriteme/status/1038974598266535936,0.910000,0.890000,1038974598266535936,twitter-topic,0.820000,xandriteme,Bullish,2837533292,Keith White,"RT : That's it, games over $TSLAQ $TSLA Tesla Model 3 Becomes #1 Best Selling Car In The US",1
6,2808,"RT @ellec_uk: No matter what $TSLAQ want you to believe, remember \nhttps://t.co/crt8QcyrhE",1536949304810,1536556610000,1,https://twitter.com/xandriteme/status/1039019846271299584,0.860000,0.760000,1039019846271299584,twitter-topic,0.400000,xandriteme,Bullish,2837533292,Keith White,"RT : No matter what $TSLAQ want you to believe, remember",1
7,3367,"RT @TArkesteijn: $TSLAQ in 2014: ""Tesla is aggressively trying to offload model S inventory, no demand!""\n\n$TSLAQ in 2016: ""Tesla is offload…",1536949180989,1536607146000,1,https://twitter.com/xandriteme/status/1039231811178573824,0.000000,0.040000,1039231811178573824,twitter-topic,0.210000,xandriteme,Bullish,2837533292,Keith White,"RT : $TSLAQ in 2014: ""Tesla is aggressively trying to offload model S inventory, no demand!"" $TSLAQ in 2016: ""Tesla is offload…",1
8,3446,RT @28delayslater: Had to update my analysis since this am. Moved from cup to zero gravity pool. @EricSteiman is this correct? \n\n$TSLA $TSL…,1536949164363,1536613467000,1,https://twitter.com/xandriteme/status/1039258324372340737,0.000000,0.100000,1039258324372340737,twitter-topic,0.500000,xandriteme,Bullish,2837533292,Keith White,RT : Had to update my analysis since this am. Moved from cup to zero gravity pool. is this correct? $TSLA $TSL…,1
9,3614,RT @BarkMSmeagol: “Teslas respond at the speed of thought”\n- Every Tesla owner ever\n\nhttps://t.co/bzOj6FEZK8\n$TSLA\n☠️💀 $TSLAQ 💀☠️ https://t…,1536949121836,1536625567000,1,https://twitter.com/xandriteme/status/1039309075236839424,0.990000,0.890000,1039309075236839424,twitter-topic,0.500000,xandriteme,Bullish,2837533292,Keith White,RT : “Teslas respond at the speed of thought” - Every Tesla owner ever $TSLA ☠️💀 $TSLAQ 💀☠️,1


In [47]:
# df_enriched_repeated.drop_duplicates()

Unnamed: 0,id,body,client_received_epoch_ms,created_at_epoch_ms,impact,link,original_ml_sentiment,original_sentiment_mixed,post_id,post_type,sentiment_vader_normalized,user_name,sentiment,id.1,name,clean_body,repeat
0,0,"RT @iliketeslas: I hope @elonmusk sues the shorts for defamation. We have so many witneses and screenshots, I wouldnt mind seeing some of…",1536951474420,1536277191000,1,https://twitter.com/xandriteme/status/1037847880885628930,1.000000,0.940000,1037847880885628930,twitter-topic,0.720000,xandriteme,Bullish,2837533292,Keith White,"RT : I hope sues the shorts for defamation. We have so many witneses and screenshots, I wouldnt mind seeing some of…",1
1,1525,RT @BarkMSmeagol: @S_Padival @skabooshka You mean like when that dum dum bragged about Tesla only hitting their end of Q2 M3 production goa…,1536949609674,1536375771000,1,https://twitter.com/xandriteme/status/1038261354526720001,0.740000,0.730000,1038261354526720001,twitter-topic,0.680000,xandriteme,Bullish,2837533292,Keith White,RT : You mean like when that dum dum bragged about Tesla only hitting their end of Q2 M3 production goa…,1
2,1778,RT @Alpsoy66: All $tslaq short shorts tweets and interactions adamantly believe that company Q3 update is a lie ! Cars are not produced or…,1536949544736,1536430269000,1,https://twitter.com/xandriteme/status/1038489935550996480,0.950000,0.860000,1038489935550996480,twitter-topic,0.500000,xandriteme,Bullish,2837533292,Keith White,RT : All $tslaq short shorts tweets and interactions adamantly believe that company Q3 update is a lie ! Cars are not produced or…,1
3,1785,"RT @TeslaHab: After waiting 2.5 years, it’s finally happening! $TSLA $TSLAQ https://t.co/rQTT4mhcow",1536949544729,1536431309000,1,https://twitter.com/xandriteme/status/1038494296637243392,0.890000,0.810000,1038494296637243392,twitter-topic,0.500000,xandriteme,Bullish,2837533292,Keith White,"RT : After waiting 2.5 years, it’s finally happening! $TSLA $TSLAQ",1
4,2496,RT @ellec_uk: $TSLAQ going apeshit because @jack founder and CEO of twitter endorsed @elonmusk interview with @joerogan. Funniest thing I'v…,1536949383611,1536524905000,1,https://twitter.com/xandriteme/status/1038886866232045568,1.000000,0.960000,1038886866232045568,twitter-topic,0.790000,xandriteme,Bullish,2837533292,Keith White,RT : $TSLAQ going apeshit because founder and CEO of twitter endorsed interview with . Funniest thing I'v…,1
5,2733,"RT @ellec_uk: That's it, games over $TSLAQ $TSLA\nTesla Model 3 Becomes #1 Best Selling Car In The US https://t.co/MtRhukJYYn",1536949324232,1536545822000,1,https://twitter.com/xandriteme/status/1038974598266535936,0.910000,0.890000,1038974598266535936,twitter-topic,0.820000,xandriteme,Bullish,2837533292,Keith White,"RT : That's it, games over $TSLAQ $TSLA Tesla Model 3 Becomes #1 Best Selling Car In The US",1
6,2808,"RT @ellec_uk: No matter what $TSLAQ want you to believe, remember \nhttps://t.co/crt8QcyrhE",1536949304810,1536556610000,1,https://twitter.com/xandriteme/status/1039019846271299584,0.860000,0.760000,1039019846271299584,twitter-topic,0.400000,xandriteme,Bullish,2837533292,Keith White,"RT : No matter what $TSLAQ want you to believe, remember",1
7,3367,"RT @TArkesteijn: $TSLAQ in 2014: ""Tesla is aggressively trying to offload model S inventory, no demand!""\n\n$TSLAQ in 2016: ""Tesla is offload…",1536949180989,1536607146000,1,https://twitter.com/xandriteme/status/1039231811178573824,0.000000,0.040000,1039231811178573824,twitter-topic,0.210000,xandriteme,Bullish,2837533292,Keith White,"RT : $TSLAQ in 2014: ""Tesla is aggressively trying to offload model S inventory, no demand!"" $TSLAQ in 2016: ""Tesla is offload…",1
8,3446,RT @28delayslater: Had to update my analysis since this am. Moved from cup to zero gravity pool. @EricSteiman is this correct? \n\n$TSLA $TSL…,1536949164363,1536613467000,1,https://twitter.com/xandriteme/status/1039258324372340737,0.000000,0.100000,1039258324372340737,twitter-topic,0.500000,xandriteme,Bullish,2837533292,Keith White,RT : Had to update my analysis since this am. Moved from cup to zero gravity pool. is this correct? $TSLA $TSL…,1
9,3614,RT @BarkMSmeagol: “Teslas respond at the speed of thought”\n- Every Tesla owner ever\n\nhttps://t.co/bzOj6FEZK8\n$TSLA\n☠️💀 $TSLAQ 💀☠️ https://t…,1536949121836,1536625567000,1,https://twitter.com/xandriteme/status/1039309075236839424,0.990000,0.890000,1039309075236839424,twitter-topic,0.500000,xandriteme,Bullish,2837533292,Keith White,RT : “Teslas respond at the speed of thought” - Every Tesla owner ever $TSLA ☠️💀 $TSLAQ 💀☠️,1


In [69]:
df_enriched.drop_duplicates().to_json(
    f'{data_path}/twitter_posts_post_enriched_20180901_20180918.json.gz', 
    orient='records', lines=True, compression='gzip'
)

In [78]:
df_enriched_unknown_user = pd.merge(
    df_posts_to_enrich, df_bulls_bears, left_on=['user_name'], 
    right_index=True, how='left', sort=False
)

In [95]:
df_enriched_unknown_user[
    df_enriched_unknown_user['id'].isnull()
]['user_name'].value_counts().rename_axis('screen_name').reset_index(name='counts').to_json(
    f'{data_path}/twitter_unknown_users_20180901_20180918.json', 
    orient='records', lines=True
)