In [None]:
# this notebook imports the final data set with CRSP data matched to events
# and calculates various summary stats

In [5]:
# Combine and Clean CRSP Data
from datetime import datetime
import funcy
import numpy as np
import pandas as pd
import os

from analysis.utilities.dropbox_client import DropboxAPI
from analysis.dev import LR_APP_ACCESS

DROPBOX_ANALYSIS_DATA = '/data'
DROPBOX_CRSP = '/data/CRSP'
DATA = os.path.join(os.path.abspath(os.curdir), 'analysis/Data')
CRSP_FLAG_FNAME = 'crsp_event_flags'
TT_FNAME = 'treating_treated_ticker'

In [6]:
PIVOTS = ['PRC', 'VOL', 'RET']
TTYPES = ['TREATING', 'TREATED']
TREATING = 'TREATING'
TREATED = 'TREATED'
TREATING_DT = 'TREATING_DISCONTINUED_IKT_DATE'
TREATED_DT = 'DISCONTINUED_IKT_DATE'
FLAGS = ['SAMEMKT_SAMETECH', 'SAMEMKT_DIFFTECH', 'DIFFMKT_SAMETECH']
CRSP_LONG_FNAME = 'PRC_RET_VOL_long'
SEARCH_ONLINE = False 

In [7]:
drop = DropboxAPI(LR_APP_ACCESS)

In [8]:
flag_ds1 = drop.search_and_download(CRSP_FLAG_FNAME, DROPBOX_CRSP, DATA, 'p', SEARCH_ONLINE)
# drop NAT from index
flag_ds = flag_ds1[~pd.isnull(flag_ds1.index)]
del flag_ds1

Searching local path
Full path /Users/lrraymond13/MIT/Kreiger_RA_2016/mkt_reaction_drug_failure/analysis/Data/crsp_event_flags.p


In [9]:
# fill all non price, volume columns with 0
to_fill = [c for c in flag_ds.columns if c.split('_')[1] not in PIVOTS]

flag_ds[to_fill] = flag_ds.loc[:, to_fill].fillna(value=0)

In [20]:
def total_competitor_fails_US(flag_ds, tickers, flag):
    treating_counts = []
    for ticker in tickers:
        flag_colname = '{0}_TREATING_{1}'.format(ticker, flag)
        # take sum over columns and then sum over series
        try:
            event_series = flag_ds.loc[:, flag_colname]
            treating_counts.append((ticker, event_series.dropna().sum()))
        except KeyError:
            pass
    return dict(treating_counts)


In [21]:
def count_events_in_window(series_to_count, window_plus, window_minus, counting_fnc=pd.rolling_sum):
    # WINDOW MINUS PERIOD ALWAYS INCLUDES DAY t=0, BUT WINDOW PLUS DOES NOT!!!!
    # NOTE TO AVOID DOUBLE COUNTING SAME DAY TWICE WE NEED TO SHIFT the series plus back one 
    # Count (using counting_fnc) the total events in rolling windows
    # get forward looking rolling sum by reversing series
    series_plus = counting_fnc(series_to_count[::-1], window=window_plus, min_periods=0, center=False)[::-1].shift(-1)
    # get backward looking sum
    series_minus = counting_fnc(series_to_count, window=window_minus, min_periods=0, center=False)
    # sum across series to determine if event clean
    total_events = series_minus.fillna(0).add(series_plus, fill_value=0)
#     print total_events[total_events.index > datetime(2001, 2, 5)]
    return total_events

In [22]:
def count_clean_events(event_col, is_clean_col, threshold):
    # counts the number of clean events in event_col if is_clean_col < threshold
    # if threshold is 1, that means clean events are those with 0 contaminating events
    # create dataframe merged on datetime index
    df_temp = pd.DataFrame({'events': event_col, 'is_clean': is_clean_col})
    total_events = df_temp.loc[(df_temp['is_clean'] < threshold) & (df_temp['events'] > 0), :].shape
    # clean up memory
    del df_temp
    return total_events[0]
    

In [23]:
def add_ser(s1, s2):
    return s1.add(s2, fill_value=0)

def get_flag_counts(event_ser, rolling_window_ser, window_plus, window_minus, threshold):
    is_clean_counts = map(lambda x: count_events_in_window(x, window_plus, window_minus), rolling_window_ser)
    is_clean_ser = reduce(add_ser, is_clean_counts)
    return count_clean_events(event_ser, is_clean_ser, threshold)    
# for each of tickers, calculate 

In [24]:
def no_own_fails(event_df, tickers, ttype, flag, window_plus, window_minus, threshold):
    # no other own company drug failures
    ticker_counts = []
    for ticker in tickers:
        event_name = '{0}_{1}_{2}'.format(ticker, ttype, flag)
        try:
            event_series = event_df.loc[:, event_name]
            fails = event_df.loc[:, '{}_DISC'.format(ticker)]
            count = get_flag_counts(event_series, [fails], window_minus, window_plus, threshold)
            ticker_counts.append((ticker, count))
        except KeyError:
#             print('{} has no events'.format(event_name))
            pass
    # Only zip together tickers that have data
    return dict(ticker_counts)

In [25]:
def no_same_flag_fails(event_df, tickers, ttype, flag, window_plus, window_minus, threshold):
    # if flag is SAMEMKT_DIFFTECH and type is treated,
    # this checks for events clean from confounding events that are TREATED_SAMEMKT_DIFFTECH and TREATING_SAMEMKT_DIFFTECH
    ticker_counts = []
    for ticker in tickers:
        event_name = '{0}_{1}_{2}'.format(ticker, ttype, flag)
        other_event_name = '{0}_{1}_{2}'.format(ticker, filter(lambda x: x != ttype, TTYPES)[0], flag)
        if event_name in event_df.columns:
            event_series = event_df.loc[:, event_name]
            if other_event_name in event_df.columns:
                fails = [event_series, event_df.loc[:, other_event_name]]
            else:
                fails = [event_series]
            count = get_flag_counts(
                event_series, fails, window_minus, window_plus, threshold)
            ticker_counts.append((ticker, count))
        else:
#             print('{} has no events'.format(event_name))
            pass
    # Only zip together tickers that have data
    return dict(ticker_counts)
    

In [30]:
def no_same_type_fails(event_df, tickers, ttype, flag, window_plus, window_minus, threshold):
    # if type is treated, no other news in any treated categories
    ticker_counts = []
    for ticker in tickers:
        event_name = '{0}_{1}_{2}'.format(ticker, ttype, flag)
        if event_name in event_df.columns:
            event_series = event_df.loc[:, event_name]
            fails = []
            for f in FLAGS:
                other_col = '{0}_{1}_{2}'.format(ticker, ttype, f)
                if other_col in event_df.columns:
                    fails.append(event_df.loc[:, other_col])
            count = get_flag_counts(event_series, fails, window_minus, window_plus, threshold)
            ticker_counts.append((ticker, count))
        else:
#             print('{} has no events'.format(event_name))
            pass
    # Only zip together tickers that have data
    return dict(ticker_counts)

In [27]:
def no_other_fails(event_df, tickers, ttype, flag, window_plus, window_minus, threshold):
    # no other news in any of other categories OR own company news
    ticker_counts = []
    for ticker in tickers:
        event_name = '{0}_{1}_{2}'.format(ticker, ttype, flag)
        # check to make sure series exists
        if event_name in event_df.columns:
            event_series = event_df.loc[:, event_name]
            # get other column names that contain the ticker
            other_flags = ['{0}_{1}_{2}'.format(ticker, tt_type, f) for f in FLAGS for tt_type in TTYPES]
            other_flags.append('{0}_DISC'.format(ticker))
            fail_series = []
            for col in other_flags:
                if col in event_df.columns:
                    fail_series.append(event_df.loc[:, col])
            count = get_flag_counts(event_series, fail_series, window_minus, window_plus, threshold)
            ticker_counts.append((ticker, count))
        else:
#             print('{} has no events'.format(event_name))
            pass
    # Only zip together tickers that have data
    return dict(ticker_counts)
    
    

In [40]:
def construct_fnc_dict_obj(flag_ds, window_plus, window_minus):
    set_tickers = set(map(lambda x: x.split('_')[0], flag_ds.columns))
    return {
        'TOTAL_COMPETITOR_FAILURE_EVENTS': {
            'SAMEMKT_SAMETECH': funcy.func_partial(
                total_competitor_fails_US, flag_ds, set_tickers, 'SAMEMKT_SAMETECH'),
            'SAMEMKT_DIFFTECH': funcy.func_partial(
                total_competitor_fails_US, flag_ds, set_tickers, 'SAMEMKT_DIFFTECH'),
            'DIFFMKT_SAMETECH': funcy.func_partial(
                total_competitor_fails_US, flag_ds, set_tickers, 'DIFFMKT_SAMETECH'),
        },
        'TOTAL_TREATING_EVENTS_NO_OWN_FAILS': {
            'SAMEMKT_SAMETECH': funcy.func_partial(
                no_own_fails, flag_ds, set_tickers, TREATING, 'SAMEMKT_SAMETECH', window_plus, window_minus, 2),
            'SAMEMKT_DIFFTECH': funcy.func_partial(
                no_own_fails, flag_ds, set_tickers, TREATING, 'SAMEMKT_DIFFTECH', window_plus, window_minus, 2),
            'DIFFMKT_SAMETECH': funcy.func_partial(
                no_own_fails, flag_ds, set_tickers, TREATING, 'DIFFMKT_SAMETECH', window_plus, window_minus, 2),
        },
        'TOTAL_TREATED_EVENTS_NO_OWN_FAILS': {
            # Threshold is 1x since cleaning series is 'own' news (ticker_DISC), which are counted as confounding events
            'SAMEMKT_SAMETECH': funcy.func_partial(
                no_own_fails, flag_ds, set_tickers, TREATED, 'SAMEMKT_SAMETECH', window_plus, window_minus, 1),
            'SAMEMKT_DIFFTECH': funcy.func_partial(
                no_own_fails, flag_ds, set_tickers, TREATED, 'SAMEMKT_DIFFTECH', window_plus, window_minus, 1),
            'DIFFMKT_SAMETECH': funcy.func_partial(
                no_own_fails, flag_ds, set_tickers, TREATED, 'DIFFMKT_SAMETECH', window_plus, window_minus, 1),
        },
        'TOTAL_TREATING_EVENTS_NO_OTHER_SAME_FLAG': {
            'SAMEMKT_SAMETECH': funcy.func_partial(
                no_same_flag_fails, flag_ds, set_tickers, TREATING, 'SAMEMKT_SAMETECH', window_plus, window_minus, 2),
            'SAMEMKT_DIFFTECH': funcy.func_partial(
                no_same_flag_fails, flag_ds, set_tickers, TREATING, 'SAMEMKT_DIFFTECH', window_plus, window_minus, 2),
            'DIFFMKT_SAMETECH': funcy.func_partial(
                no_same_flag_fails, flag_ds, set_tickers, TREATING, 'DIFFMKT_SAMETECH', window_plus, window_minus, 2),
        },
        'TOTAL_TREATED_EVENTS_NO_OTHER_SAME_FLAG': {
            'SAMEMKT_SAMETECH': funcy.func_partial(
                no_same_flag_fails, flag_ds, set_tickers, TREATED, 'SAMEMKT_SAMETECH', window_plus, window_minus, 2),
            'SAMEMKT_DIFFTECH': funcy.func_partial(
                no_same_flag_fails, flag_ds, set_tickers, TREATED, 'SAMEMKT_DIFFTECH', window_plus, window_minus, 2),
            'DIFFMKT_SAMETECH': funcy.func_partial(
                no_same_flag_fails, flag_ds, set_tickers, TREATED, 'DIFFMKT_SAMETECH', window_plus, window_minus, 2),
        },
        'TOTAL_TREATING_EVENTS_NO_OTHER_SAME_TYPE': {
            'SAMEMKT_SAMETECH': funcy.func_partial(
                no_same_type_fails, flag_ds, set_tickers, TREATING, 'SAMEMKT_SAMETECH', window_plus, window_minus, 2),
            'SAMEMKT_DIFFTECH': funcy.func_partial(
                no_same_type_fails, flag_ds, set_tickers, TREATING, 'SAMEMKT_DIFFTECH', window_plus, window_minus, 2),
            'DIFFMKT_SAMETECH': funcy.func_partial(
                no_same_type_fails, flag_ds, set_tickers, TREATING, 'DIFFMKT_SAMETECH', window_plus, window_minus, 2),
        },
        'TOTAL_TREATED_EVENTS_NO_OTHER_SAME_TYPE': {
            'SAMEMKT_SAMETECH': funcy.func_partial(
                no_same_type_fails, flag_ds, set_tickers, TREATED, 'SAMEMKT_SAMETECH', window_plus, window_minus, 2),
            'SAMEMKT_DIFFTECH': funcy.func_partial(
                no_same_type_fails, flag_ds, set_tickers, TREATED, 'SAMEMKT_DIFFTECH', window_plus, window_minus, 2),
            'DIFFMKT_SAMETECH': funcy.func_partial(
                no_same_type_fails, flag_ds, set_tickers, TREATED, 'DIFFMKT_SAMETECH', window_plus, window_minus, 2),
        },
        'TOTAL_TREATING_EVENTS_NO_OTHERS_NEWS': {
            # Note threshold has to be 3 because treating event count 1x in treating series and in ticker_DISC series
            'SAMEMKT_SAMETECH': funcy.func_partial(
                no_other_fails, flag_ds, set_tickers, TREATING, 'SAMEMKT_SAMETECH', window_plus, window_minus, 3),
            'SAMEMKT_DIFFTECH': funcy.func_partial(
                no_other_fails, flag_ds, set_tickers, TREATING, 'SAMEMKT_DIFFTECH', window_plus, window_minus, 3),
            'DIFFMKT_SAMETECH': funcy.func_partial(
                no_other_fails, flag_ds, set_tickers, TREATING, 'DIFFMKT_SAMETECH', window_plus, window_minus, 3),
        },
        'TOTAL_TREATED_EVENTS_NO_OTHERS_NEWS': {
            'SAMEMKT_SAMETECH': funcy.func_partial(
                no_other_fails, flag_ds, set_tickers, TREATED, 'SAMEMKT_SAMETECH', window_plus, window_minus, 2),
            'SAMEMKT_DIFFTECH': funcy.func_partial(
                no_other_fails, flag_ds, set_tickers, TREATED, 'SAMEMKT_DIFFTECH', window_plus, window_minus, 2),
            'DIFFMKT_SAMETECH': funcy.func_partial(
                no_other_fails, flag_ds, set_tickers, TREATED, 'DIFFMKT_SAMETECH', window_plus, window_minus, 2),
        },
    }


In [42]:
def calculate_counts(flag_df, window_plus, window_minus):
    counts_dict = construct_fnc_dict_obj(flag_df, window_plus, window_minus)
    counts_series = []
    for category, subsets in counts_dict.iteritems():
        for flag_type, fnc_obj in subsets.iteritems():
            counts_series.append(pd.Series(data=fnc_obj(), name='_'.join([category, flag_type])))
    counts_df = pd.concat(counts_series, axis=1)
    sum_ser = counts_df.sum()
    # upload to csv
    drop.csv_upload_dataset(
        sum_ser, 'event_counts_plus_{0}_minus_{1}.csv'.format(
            window_plus, window_minus), DATA, DROPBOX_ANALYSIS_DATA)
    return sum_ser
    

In [43]:
calculate_counts(flag_ds, 5, 5)

	Series.rolling(min_periods=0,window=5,center=False).sum()
	Series.rolling(min_periods=0,window=5,center=False).sum()


Serializing to local path /Users/lrraymond13/MIT/Kreiger_RA_2016/mkt_reaction_drug_failure/analysis/Data/event_counts_plus_5_minus_5.csv
Uploading /Users/lrraymond13/MIT/Kreiger_RA_2016/mkt_reaction_drug_failure/analysis/Data/event_counts_plus_5_minus_5.csv to Dropbox as /data/event_counts_plus_5_minus_5.csv
UploadError(u'path', UploadWriteFailed(reason=WriteError(u'insufficient_space', None), upload_session_id=u'AAAAAAAABG5rG6CYopLDaQ'))


TOTAL_TREATED_EVENTS_NO_OTHER_SAME_FLAG_SAMEMKT_DIFFTECH     10714.0
TOTAL_TREATED_EVENTS_NO_OTHER_SAME_FLAG_SAMEMKT_SAMETECH       581.0
TOTAL_TREATED_EVENTS_NO_OTHER_SAME_FLAG_DIFFMKT_SAMETECH      2198.0
TOTAL_TREATING_EVENTS_NO_OTHER_SAME_FLAG_SAMEMKT_DIFFTECH      179.0
TOTAL_TREATING_EVENTS_NO_OTHER_SAME_FLAG_SAMEMKT_SAMETECH      204.0
TOTAL_TREATING_EVENTS_NO_OTHER_SAME_FLAG_DIFFMKT_SAMETECH      275.0
TOTAL_TREATED_EVENTS_NO_OWN_FAILS_SAMEMKT_DIFFTECH           20499.0
TOTAL_TREATED_EVENTS_NO_OWN_FAILS_SAMEMKT_SAMETECH             587.0
TOTAL_TREATED_EVENTS_NO_OWN_FAILS_DIFFMKT_SAMETECH            2776.0
TOTAL_TREATED_EVENTS_NO_OTHERS_NEWS_SAMEMKT_DIFFTECH          9926.0
TOTAL_TREATED_EVENTS_NO_OTHERS_NEWS_SAMEMKT_SAMETECH             7.0
TOTAL_TREATED_EVENTS_NO_OTHERS_NEWS_DIFFMKT_SAMETECH           691.0
TOTAL_TREATED_EVENTS_NO_OTHER_SAME_TYPE_SAMEMKT_DIFFTECH     10114.0
TOTAL_TREATED_EVENTS_NO_OTHER_SAME_TYPE_SAMEMKT_SAMETECH         7.0
TOTAL_TREATED_EVENTS_NO_OTHER_SAME

In [44]:
calculate_counts(flag_ds, 3, 3)

	Series.rolling(min_periods=0,window=3,center=False).sum()
	Series.rolling(min_periods=0,window=3,center=False).sum()


Serializing to local path /Users/lrraymond13/MIT/Kreiger_RA_2016/mkt_reaction_drug_failure/analysis/Data/event_counts_plus_3_minus_3.csv
Uploading /Users/lrraymond13/MIT/Kreiger_RA_2016/mkt_reaction_drug_failure/analysis/Data/event_counts_plus_3_minus_3.csv to Dropbox as /data/event_counts_plus_3_minus_3.csv
UploadError(u'path', UploadWriteFailed(reason=WriteError(u'insufficient_space', None), upload_session_id=u'AAAAAAAABG8O4h8oJCZOCw'))


TOTAL_TREATED_EVENTS_NO_OTHER_SAME_FLAG_SAMEMKT_DIFFTECH     13995.0
TOTAL_TREATED_EVENTS_NO_OTHER_SAME_FLAG_SAMEMKT_SAMETECH       636.0
TOTAL_TREATED_EVENTS_NO_OTHER_SAME_FLAG_DIFFMKT_SAMETECH      2575.0
TOTAL_TREATING_EVENTS_NO_OTHER_SAME_FLAG_SAMEMKT_DIFFTECH      228.0
TOTAL_TREATING_EVENTS_NO_OTHER_SAME_FLAG_SAMEMKT_SAMETECH      216.0
TOTAL_TREATING_EVENTS_NO_OTHER_SAME_FLAG_DIFFMKT_SAMETECH      324.0
TOTAL_TREATED_EVENTS_NO_OWN_FAILS_SAMEMKT_DIFFTECH           21085.0
TOTAL_TREATED_EVENTS_NO_OWN_FAILS_SAMEMKT_SAMETECH             625.0
TOTAL_TREATED_EVENTS_NO_OWN_FAILS_DIFFMKT_SAMETECH            2911.0
TOTAL_TREATED_EVENTS_NO_OTHERS_NEWS_SAMEMKT_DIFFTECH         12897.0
TOTAL_TREATED_EVENTS_NO_OTHERS_NEWS_SAMEMKT_SAMETECH            13.0
TOTAL_TREATED_EVENTS_NO_OTHERS_NEWS_DIFFMKT_SAMETECH           875.0
TOTAL_TREATED_EVENTS_NO_OTHER_SAME_TYPE_SAMEMKT_DIFFTECH     13149.0
TOTAL_TREATED_EVENTS_NO_OTHER_SAME_TYPE_SAMEMKT_SAMETECH        14.0
TOTAL_TREATED_EVENTS_NO_OTHER_SAME

In [45]:
calculate_counts(flag_ds, 3, 5)

	Series.rolling(min_periods=0,window=5,center=False).sum()
	Series.rolling(min_periods=0,window=3,center=False).sum()


Serializing to local path /Users/lrraymond13/MIT/Kreiger_RA_2016/mkt_reaction_drug_failure/analysis/Data/event_counts_plus_3_minus_5.csv
Uploading /Users/lrraymond13/MIT/Kreiger_RA_2016/mkt_reaction_drug_failure/analysis/Data/event_counts_plus_3_minus_5.csv to Dropbox as /data/event_counts_plus_3_minus_5.csv
UploadError(u'path', UploadWriteFailed(reason=WriteError(u'insufficient_space', None), upload_session_id=u'AAAAAAAABHB0LsND9xYZpw'))


TOTAL_TREATED_EVENTS_NO_OTHER_SAME_FLAG_SAMEMKT_DIFFTECH     12092.0
TOTAL_TREATED_EVENTS_NO_OTHER_SAME_FLAG_SAMEMKT_SAMETECH       606.0
TOTAL_TREATED_EVENTS_NO_OTHER_SAME_FLAG_DIFFMKT_SAMETECH      2388.0
TOTAL_TREATING_EVENTS_NO_OTHER_SAME_FLAG_SAMEMKT_DIFFTECH      195.0
TOTAL_TREATING_EVENTS_NO_OTHER_SAME_FLAG_SAMEMKT_SAMETECH      209.0
TOTAL_TREATING_EVENTS_NO_OTHER_SAME_FLAG_DIFFMKT_SAMETECH      296.0
TOTAL_TREATED_EVENTS_NO_OWN_FAILS_SAMEMKT_DIFFTECH           20808.0
TOTAL_TREATED_EVENTS_NO_OWN_FAILS_SAMEMKT_SAMETECH             607.0
TOTAL_TREATED_EVENTS_NO_OWN_FAILS_DIFFMKT_SAMETECH            2848.0
TOTAL_TREATED_EVENTS_NO_OTHERS_NEWS_SAMEMKT_DIFFTECH         11191.0
TOTAL_TREATED_EVENTS_NO_OTHERS_NEWS_SAMEMKT_SAMETECH             9.0
TOTAL_TREATED_EVENTS_NO_OTHERS_NEWS_DIFFMKT_SAMETECH           770.0
TOTAL_TREATED_EVENTS_NO_OTHER_SAME_TYPE_SAMEMKT_DIFFTECH     11396.0
TOTAL_TREATED_EVENTS_NO_OTHER_SAME_TYPE_SAMEMKT_SAMETECH        10.0
TOTAL_TREATED_EVENTS_NO_OTHER_SAME