In [1]:
%matplotlib inline
import cPickle as pickle
from datetime import datetime
import funcy
import itertools
import numpy as np
import pandas as pd
import plotly
from plotly import tools
import plotly.plotly as py
import plotly.graph_objs as go
import pytz
import os

# Need to update using new folder structure
from analysis.utilities.data_utils import clean_columns, strip_upcase
from analysis.utilities.dropbox_client import DropboxAPI

# Dropbox folder passed as the destination when uploading derived datasets.
DROPBOX_ANALYSIS_DATA = '/data/intermediate'
# Dropbox folder searched for the CRSP price data and the treating/treated dataset.
DROPBOX_CRSP = '/data/CRSP'
# Local data directory, resolved against the current working directory
# (so the notebook must be started from the repository root for this path to exist).
DATA = os.path.join(os.path.abspath(os.curdir), 'analysis/Data')
# NOTE(review): the raw CRSP file lists, FULL_CSP_FNAME and CRSP_FLAG_FNAME are not
# referenced by any cell visible in this notebook -- confirm they are still needed.
TAB_CRSP_FILES = ['output1-50.txt', 'output51-100.txt', 'output101-150.txt']
CSV_CRSP_FILES = ['output_200+.csv']
# Name used to look up the treating/treated event dataset on Dropbox.
TT_FNAME = 'treating_treated_ticker'
FULL_CSP_FNAME = 'full_CRSP'
# Name used to look up the CRSP price dataset on Dropbox.
CRSP_LONG_FNAME = 'PRC_RET_VOL_long'
CRSP_FLAG_FNAME = 'crsp_event_flags'

In [2]:
# Run-time switches for the notebook.
# OFFLINE_FLAG: True -> render figures with plotly.offline; False -> use the hosted plotly.plotly API.
OFFLINE_FLAG = False 
# UPLOAD_DATA: True -> upload the table of unknown ticker symbols to Dropbox.
UPLOAD_DATA = False
# Suffix of the price columns; price columns are addressed as '<ticker>_<PRICE_SYMBOL>'.
PRICE_SYMBOL = 'PRC'


In [3]:
# Role prefixes used to build event-table column names such as
# '<role>_TICKER_SYMBOL' and '<role>_PARENT_TICKER_SYMBOL'.
TREATING = 'TREATING'
TREATED = 'TREATED'
# Event-table columns holding the discontinuation dates for each role.
TREATING_DT = 'TREATING_DISCONTINUED_IKT_DATE'
TREATED_DT = 'DISCONTINUED_IKT_DATE'
# Event-flag columns; a row counts as an event for a flag when its value is > 0.
FLAGS = ['SAMEMKT_SAMETECH', 'SAMEMKT_DIFFTECH', 'DIFFMKT_SAMETECH']

In [4]:
def get_ticker_events(ticker, flag, ttype, event_df, date_cols=None):
    """Return the rows of ``event_df`` where ``ticker`` has a positive ``flag``.

    A row belongs to ``ticker`` when either ``<ttype>_PARENT_TICKER_SYMBOL`` or
    ``<ttype>_TICKER_SYMBOL`` equals it.

    Parameters
    ----------
    ticker : ticker symbol to look up.
    flag : name of the event-flag column; rows are kept when its value is > 0.
    ttype : role prefix used to build the ticker column names
        (the notebook passes ``TREATING`` or ``TREATED``).
    event_df : DataFrame holding the ticker, flag and date columns.
    date_cols : iterable of date column names to return alongside ``flag``.
        Defaults to ``[TREATED_DT, TREATING_DT]``.

    Returns
    -------
    DataFrame with columns ``date_cols + [flag]`` (possibly with zero rows when
    the ticker is present but never flagged), or ``None`` when the ticker does
    not appear in ``event_df`` at all.
    """
    # A None sentinel avoids sharing one mutable default list between calls.
    if date_cols is None:
        date_cols = [TREATED_DT, TREATING_DT]

    parent_ts = '{}_PARENT_TICKER_SYMBOL'.format(ttype)
    ts = '{}_TICKER_SYMBOL'.format(ttype)
    belongs_to_ticker = event_df[parent_ts].isin([ticker]) | event_df[ts].isin([ticker])
    events = event_df.loc[belongs_to_ticker, :]

    if events.empty:
        return None

    # Build a concrete list: a lazy iterator (what funcy.concat returns on
    # funcy >= 1.0) is not a valid column indexer for .loc.
    wanted_columns = list(date_cols) + [flag]
    return events.loc[events[flag] > 0, wanted_columns]

In [5]:
if OFFLINE_FLAG:
    # Offline rendering needs plotly.offline; the imports stay inside the guard
    # so the hosted-API path does not depend on them.
    from plotly import __version__
    from plotly import offline

    # Written as a call so the line parses on Python 2 and Python 3, matching
    # the print(...) calls used in the rest of the notebook.
    print(__version__)  # requires version >= 1.9.0

    # Run at the start of every IPython notebook that uses plotly.offline:
    # this injects the plotly.js source files into the notebook.
    offline.init_notebook_mode(connected=True)
                         # this injects the plotly.js source files into the notebook


In [6]:
def plotly_plot(fig, filename, OFFLINE_FLAG):
    """Display ``fig`` inline, either offline or through the hosted plotly API.

    Parameters
    ----------
    fig : plotly figure to render.
    filename : name passed to ``py.iplot`` for the hosted plot; unused offline.
    OFFLINE_FLAG : truthy -> ``plotly.offline.iplot``; falsy -> ``py.iplot``.
        (Shadows the module-level switch of the same name on purpose, so the
        caller decides per call.)
    """
    if OFFLINE_FLAG:
        # Import locally: the notebook-level `offline` import only runs when the
        # global flag was true in that earlier cell, so relying on it raises
        # NameError if the flag is flipped later.
        from plotly import offline
        offline.iplot(fig)
    else:
        py.iplot(fig, filename=filename)
    

In [None]:
# Project Dropbox client; used by the cells below to download the input datasets
# and (optionally) upload results.
d = DropboxAPI()

In [None]:
# Download Pivoted CRSP data set
# TODO: make this more robust - it does not currently fall back to the local copy
# when Dropbox is not available.
# NOTE(review): the trailing 'p' presumably selects the pickle format -- confirm
# against DropboxAPI.search_and_download.
full_crsp = d.search_and_download(CRSP_LONG_FNAME, DROPBOX_CRSP, DATA, 'p')

In [None]:
# Inspect the column labels; later cells expect them in the form '<ticker>_<symbol>'.
full_crsp.columns

In [None]:
# Download treated/treating dataset (the event table used by every plotting helper below)
# NOTE(review): the trailing 'p' presumably selects the pickle format -- confirm
# against DropboxAPI.search_and_download.
tt_ds = d.search_and_download(TT_FNAME, DROPBOX_CRSP, DATA, 'p')

In [None]:
def to_unix_time(dt):
    """Return the number of milliseconds between the Unix epoch and ``dt``.

    ``dt`` is compared against a naive epoch datetime, so it must itself be a
    naive datetime.
    """
    millis_per_second = 1000
    elapsed = dt - datetime.utcfromtimestamp(0)
    return elapsed.total_seconds() * millis_per_second

In [None]:
def create_layout(ticker, x_date_range=None):
    """Build the plotly layout for one ticker's price/event figure.

    Parameters
    ----------
    ticker : used as the figure title.
    x_date_range : optional pair ``(start, end)`` of naive ``datetime`` objects
        limiting the x axis. ``None`` lets plotly pick the range.

    Returns
    -------
    ``go.Layout`` with a date x axis and a left-hand 'Price' y axis.

    Raises
    ------
    AssertionError if ``x_date_range`` is given but is not two datetimes.
    """
    if x_date_range is not None:
        assert len(x_date_range) == 2, 'x_date_range must hold exactly (start, end)'
        # isinstance takes (object, type); the reversed order raised TypeError
        # for every non-None range.
        assert all(isinstance(bound, datetime) for bound in x_date_range), \
            'x_date_range bounds must be datetime objects'
        # The axis range is given as unix timestamps in milliseconds. Convert
        # the caller's range (not an unrelated global) and build a real list so
        # the value is not a lazy map object on Python 3.
        x_date_range = [to_unix_time(bound) for bound in x_date_range]
    layout = go.Layout(
        title = ticker,
        height = 850,
        width = 900,
        xaxis = dict(
            range=x_date_range,
            type='date',
            title = 'Date',
            showgrid = True
        ),
        yaxis = dict(
            title= 'Price',
            showline = True,
            side='left',
        ),
    )
    return layout

In [None]:
def create_event_traces(ticker, event_df, pivot_df, price_symbol, ttype='TREATING', dt_var=TREATING_DT):
    """Create one marker trace per event flag for ``ticker``.

    Parameters
    ----------
    ticker : ticker symbol.
    event_df : long table of treating/treated tickers with event-flag columns.
    pivot_df : DataFrame indexed by date whose columns are named
        '<ticker>_<price_symbol>' and hold the price series
        (assumes the date index has no duplicates -- required by ``reindex``).
    price_symbol : suffix selecting the ticker's column in ``pivot_df``.
    ttype : role prefix (treating/treated) used to find the ticker in ``event_df``.
    dt_var : name of the date column that positions the markers on the x axis.

    Returns
    -------
    list of ``Scatter`` traces, one per flag in ``FLAGS`` that has events.
    Markers sit at the ticker's price on the event date; dates with no price
    use the ticker's mean price instead.
    """
    # Vertical shift applied per flag so markers of different flags on the same
    # date do not sit exactly on top of each other.
    flag_offset = 0.25
    traces = []

    ticker_name = '{}_{}'.format(ticker, price_symbol)
    prices = pivot_df[ticker_name]
    # mean() on a one-column frame returns a Series, so unwrap the scalar.
    stock_avg = pivot_df.loc[:, [ticker_name]].mean(skipna=True).values[0]

    for num, f in enumerate(FLAGS):
        ticker_event_df = get_ticker_events(ticker, f, ttype, event_df)
        if ticker_event_df is None or ticker_event_df.empty:
            print('{0} {1} {2} has no events'.format(ticker, ttype, f))
            continue

        # DatetimeIndex guarantees pandas Timestamps rather than np.datetime64;
        # missing dates are dropped because they cannot be plotted.
        event_dates = pd.DatetimeIndex(ticker_event_df[dt_var].dropna().unique())
        # Count rows, not DataFrame.size (which is rows * columns).
        print('{0} {1} {2} has {3} events'.format(ticker, ttype, f, len(ticker_event_df)))

        # reindex yields NaN for dates without price data instead of raising
        # KeyError, so one missing date no longer discards the real prices of
        # all the others.
        stock_prices = prices.reindex(event_dates)
        if len(event_dates) and stock_prices.isnull().all():
            print('Ticker {} has no overlap with stock data and events'.format(ticker))
        stock_prices = stock_prices.fillna(stock_avg) + num * flag_offset

        t = plotly.graph_objs.Scatter(
                    x = event_dates,
                    y = stock_prices,
                    name = '{0}_{1}'.format(ttype, f),
                    # trace.xaxis is a subplot id ('x', 'x2', ...), not an axis title
                    xaxis = 'x',
                    showlegend = True,
                    mode='markers',
                    marker = {
                        'symbol': 'line-ew-open',
                        'size': 20}
        )
        traces.append(t)

    return traces
    
    

In [None]:
def create_discontinued_traces(ticker, event_df, pivot_df, price_symbol):
    """Create marker traces for the ticker's own drug discontinuations.

    These are the dates on which the company discontinued (failed) one of its
    own drugs as a TREATED ticker, excluding dates that already appear as the
    ticker's treating dates so that they are not plotted twice.

    Parameters
    ----------
    ticker : ticker symbol.
    event_df : long table of treating/treated tickers with event-flag columns.
    pivot_df : DataFrame indexed by date whose columns are named
        '<ticker>_<price_symbol>' and hold the price series
        (assumes the date index has no duplicates -- required by ``reindex``).
    price_symbol : suffix selecting the ticker's column in ``pivot_df``.

    Returns
    -------
    list of ``Scatter`` traces, one per flag in ``FLAGS`` for which the ticker
    has TREATED events. Markers sit at the ticker's price on the event date;
    dates with no price use the ticker's mean price instead.
    """
    # Vertical shift applied per flag so markers of different flags on the same
    # date do not sit exactly on top of each other.
    flag_offset = 0.25
    traces = []

    ticker_name = '{}_{}'.format(ticker, price_symbol)
    prices = pivot_df[ticker_name]
    # mean() on a one-column frame returns a Series, so unwrap the scalar.
    stock_avg = pivot_df.loc[:, [ticker_name]].mean(skipna=True).values[0]

    # Dates on which this ticker is the treating company; those are plotted by
    # the treating traces, so they are filtered out below.
    is_treating = ((event_df['TREATING_TICKER_SYMBOL'] == ticker) |
                   (event_df['TREATING_PARENT_TICKER_SYMBOL'] == ticker))
    treating_dates = event_df.loc[is_treating, TREATING_DT].dropna().unique()

    for num, f in enumerate(FLAGS):
        ticker_event_df = get_ticker_events(ticker, f, TREATED, event_df)
        if ticker_event_df is None or ticker_event_df.empty:
            print('{0} {1} has no additional drug failures events'.format(ticker, f))
            continue

        # Keep only discontinuation dates not already plotted as treating dates.
        # DatetimeIndex guarantees pandas Timestamps rather than np.datetime64.
        already_plotted = ticker_event_df.loc[:, TREATED_DT].isin(treating_dates)
        event_dates = pd.DatetimeIndex(
            ticker_event_df.loc[~already_plotted, TREATED_DT].dropna().unique())
        print('{0} {1} {2} has {3} additional drug fails'.format(ticker, TREATED, f, event_dates.size))

        # reindex yields NaN for dates without price data instead of raising
        # KeyError, so one missing date no longer discards the real prices of
        # all the others.
        stock_prices = prices.reindex(event_dates)
        if len(event_dates) and stock_prices.isnull().all():
            print('Ticker {} has no overlap with stock data and events'.format(ticker))
        stock_prices = stock_prices.fillna(stock_avg) + num * flag_offset

        t = plotly.graph_objs.Scatter(
                    x = event_dates,
                    y = stock_prices,
                    name = 'NEWS_{0}'.format(f),
                    # trace.xaxis is a subplot id ('x', 'x2', ...), not an axis title
                    xaxis = 'x',
                    showlegend = True,
                    mode='markers',
                    marker = {
                        'symbol': 'circle',
                        'size': 30}
        )
        traces.append(t)

    return traces

In [None]:
def create_price_traces(tickers, pivot_df, value_symbol):
    """Create one line trace per ticker from its '<ticker>_<value_symbol>' column.

    Parameters
    ----------
    tickers : a single ticker symbol or an iterable of ticker symbols.
    pivot_df : DataFrame whose index supplies the x values and whose
        '<ticker>_<value_symbol>' columns supply the y values.
    value_symbol : column suffix selecting which series to plot.

    Returns
    -------
    list of ``Scatter`` line traces, in the order the tickers were given.
    Missing values are dropped from each series before plotting.
    """
    # Wrap a lone ticker. funcy.is_iter only recognises iterators, so a plain
    # list of tickers used to be wrapped a second time and looked up as one
    # bogus column name. type(u'') is `unicode` on Python 2 and `str` on 3.
    is_single = isinstance(tickers, (str, type(u''))) or not hasattr(tickers, '__iter__')
    if is_single:
        tickers = [tickers]
    traces = []
    for ticker in tickers:
        ticker_name = '{}_{}'.format(ticker, value_symbol)
        price_series = pivot_df[ticker_name].dropna()
        t = plotly.graph_objs.Scatter(
            x = price_series.index,
            y = price_series.values,
            name = '{}'.format(ticker_name),
            showlegend = True,
            # scatter mode is a flaglist of 'lines' / 'markers' / 'text';
            # 'line' is not a valid value
            mode = 'lines',
            line = {
                'width': 1
            }
        )
        traces.append(t)
    return traces
    

In [None]:
def create_treating_treated_graph(ticker, df, pivot_df, price_symbol, offline_flag):
    """Plot price, treating/treated events and own-drug failures for one ticker.

    Returns the ticker when there was nothing to plot, otherwise ``None``
    after the figure has been displayed through ``plotly_plot``.
    """
    layout = create_layout(ticker, None)

    # Each helper returns a (possibly empty) list of traces; the tuple is
    # evaluated left to right.
    trace_groups = (
        create_event_traces(ticker, df, pivot_df, price_symbol, TREATING, TREATING_DT),
        # The treated series is also positioned on the treating date: that is
        # the date the treatment occurred, so it is the more informative one.
        # TODO - also plot dates when/if same drug as being treated was discontinued
        create_event_traces(ticker, df, pivot_df, price_symbol, TREATED, TREATING_DT),
        # additional discontinuations of the ticker's own drugs
        create_discontinued_traces(ticker, df, pivot_df, price_symbol),
        create_price_traces(ticker, pivot_df, price_symbol),
    )

    # Flatten; empty groups simply contribute nothing.
    graph_data = []
    for group in trace_groups:
        graph_data.extend(group)

    if not graph_data:
        print('Ticker {} has no data'.format(ticker))
        return ticker

    print('Plotting Ticker {}'.format(ticker))
    fig = go.Figure(data=graph_data, layout=layout)
    plotly_plot(fig, ticker, offline_flag)
    return None
    


In [None]:
# Smoke test: plot a single ticker before dispatching the full batch.
create_treating_treated_graph('ZIOP', tt_ds, full_crsp, PRICE_SYMBOL, OFFLINE_FLAG)

In [None]:
# Unique ticker per role: every non-null parent ticker, plus the company's own
# ticker for rows that have no parent company id.
treated_tickers = (
    set(tt_ds['TREATED_PARENT_TICKER_SYMBOL'].dropna().unique())
    | set(tt_ds.loc[tt_ds['TREATED_PARENT_COMPANY_ID'].isnull(), 'TREATED_TICKER_SYMBOL'].dropna().unique())
)

treating_tickers = (
    set(tt_ds['TREATING_PARENT_TICKER_SYMBOL'].dropna().unique())
    | set(tt_ds.loc[tt_ds['TREATING_PARENT_COMPANY_ID'].isnull(), 'TREATING_TICKER_SYMBOL'].dropna().unique())
)


In [None]:
# Check that both treating and treated tickers have price data.
# Columns are named '<ticker>_<symbol>'. A set gives constant-time membership
# tests and, unlike a map object on Python 3, is not exhausted after the first
# comprehension below.
col_tickers = {column.split('_')[0] for column in full_crsp.columns}
treating_exists = [x for x in treating_tickers if x in col_tickers]
treated_exists = [x for x in treated_tickers if x in col_tickers]
existing_unique_tickers = set(treated_exists) | set(treating_exists)
print(len(existing_unique_tickers))

In [None]:
def get_missing_tickers(missing_tickers, ttype=TREATED):
    """Return the distinct ticker/name rows of ``tt_ds`` for the given tickers.

    Rows are selected when the ``ttype`` ticker or parent ticker is in
    ``missing_tickers``; the role prefix is stripped from the four returned
    columns so frames for different roles can be concatenated.
    """
    generic_names = ('TICKER_SYMBOL', 'PARENT_TICKER_SYMBOL', 'CLEAN_NAME', 'PARENT_CLEAN_NAME')
    role_columns = ['{}_{}'.format(ttype, generic) for generic in generic_names]
    own_col, parent_col = role_columns[0], role_columns[1]

    is_missing = tt_ds[parent_col].isin(missing_tickers) | tt_ds[own_col].isin(missing_tickers)
    selected = tt_ds.loc[is_missing, role_columns].drop_duplicates()
    return selected.rename(columns=dict(zip(role_columns, generic_names)))
    

# Tickers present in the event data but absent from the CRSP columns
missing_treating = set(treating_tickers).difference(existing_unique_tickers)
missing_treated = set(treated_tickers).difference(existing_unique_tickers)
# Fetch the company names so the correct ticker symbols can be recovered
missing_treated_df = get_missing_tickers(missing_treated, TREATED)
missing_treating_df = get_missing_tickers(missing_treating, TREATING)

# One combined, de-duplicated table covering both roles
missing_tickers = pd.concat(
    [missing_treating_df, missing_treated_df],
    ignore_index=True,
).drop_duplicates()

In [None]:
# Write Out Data Set
# Only runs when UPLOAD_DATA is switched on; hands the unknown-ticker table to the
# project Dropbox helper together with the local data dir and the Dropbox target folder.
if UPLOAD_DATA:
    d.csv_upload_dataset(missing_tickers, 'unknown_ticker_symbols.csv', DATA, DROPBOX_ANALYSIS_DATA)


In [None]:
# Dispatch plotting requests
sorted_t = sorted(existing_unique_tickers)
# Split into batches of 30 tickers - max api use is 30 graphs at a time.
# Materialise the result: funcy.chunks is lazy on funcy >= 1.0, while the cells
# below iterate over `chunks` and also slice it.
chunks = list(funcy.chunks(30, sorted_t))

In [None]:
# Plot every ticker, batch by batch. An explicit loop is used because map() is
# lazy on Python 3 and would never invoke the plotting function.
for chunk in chunks:
    for ticker in chunk:
        create_treating_treated_graph(ticker, tt_ds, full_crsp, PRICE_SYMBOL, OFFLINE_FLAG)

In [None]:
# Re-dispatch every batch after the first through the hosted plotly API.
OFFLINE_FLAG = False
for chunk in chunks[1:]:
    for ticker in chunk:
        # The call previously used the undefined name `pivot_ds` and omitted the
        # price symbol, so it could not match the function's signature.
        create_treating_treated_graph(ticker, tt_ds, full_crsp, PRICE_SYMBOL, OFFLINE_FLAG)

In [None]:
# Call form parses on Python 2 and 3 and matches the print(...) calls used elsewhere.
print(chunks)

In [None]:
# Inspect the non-null values of the 'ACU' column.
# NOTE(review): `pivot_ds` is not defined in any cell visible here, so this fails on a
# fresh kernel; it presumably refers to an earlier name for `full_crsp`, whose
# columns are addressed elsewhere as '<ticker>_<symbol>' -- confirm and update.
pivot_ds.loc[:, 'ACU'].dropna()

In [None]:
# Call form parses on Python 2 and 3 and matches the print(...) calls used elsewhere.
print(treating_tickers)

In [None]:
# Preview the first rows of the treating/treated event table.
tt_ds.head()