In [None]:
# Plan for this notebook:
# 1. Import the event dataset.
# 2. Add rolling-window columns for each of the treated columns and for own-company drug trials.
# 3. Create a function that subsets the dataset to events considered clean by the criteria.

In [14]:
# Combine and Clean CRSP Data
from datetime import datetime
import funcy
import numpy as np
import pandas as pd
import os

from analysis.utilities.dropbox_client import DropboxAPI

# Dropbox folder that intermediate analysis datasets are read from and uploaded to
DROPBOX_ANALYSIS_DATA = '/intermediate_data'
# Dropbox folder name for CRSP data (not referenced in the cells visible here)
DROPBOX_CRSP = '/data/CRSP'
# Local data directory, resolved relative to the current working directory
DATA = os.path.join(os.path.abspath(os.curdir), 'analysis/Data')
# Base name of the event-flag dataset fetched with drop.search_and_download
CRSP_FLAG_FNAME = 'crsp_event_flags'
# Not referenced in the cells visible here
TT_FNAME = 'treating_treated_ticker'
# Output file-name template; filled with (window_plus, window_minus)
REG_DATA_FMT = 'crsp_event_plus_{0}_minus_{1}.csv'

In [15]:
# Column-name templates for the per-ticker event flags; '{}' is replaced by a
# ticker symbol via str.format.
TREATED_SM_ST = '{}_TREATED_SAMEMKT_SAMETECH'
TREATED_DM_ST = '{}_TREATED_DIFFMKT_SAMETECH'
TREATED_SM_DT = '{}_TREATED_SAMEMKT_DIFFTECH'
OWN_FAILS_FMT = '{}_DISC'

# Column-name templates for the summed ("confounding event") columns that are
# built from the rolling sums of the flags above.
NO_TREATED_CE_FMT = '{}_OTHER_TREATED'
NO_OWN_NO_TREATED_CE_FMT = '{}_NO_OWN_NO_OTHER_TREATED'
OWN_FAILS_CE_FMT = '{}_NO_OWN'

# (window_plus, window_minus) pairs for which datasets are generated
PLUS_MINUS_PERIODS = [(3, 3), (3, 5), (5, 5)]

# Maps each summed-column template to the flag templates whose rolling sums
# are added together to produce it.
FLAG_RECIPES = {
    NO_TREATED_CE_FMT: [TREATED_SM_ST, TREATED_DM_ST, TREATED_SM_DT],
    NO_OWN_NO_TREATED_CE_FMT: [
        TREATED_SM_ST, TREATED_DM_ST, TREATED_SM_DT, OWN_FAILS_FMT],
    OWN_FAILS_CE_FMT: [OWN_FAILS_FMT],
}

In [16]:
# Second '_'-separated name token of the columns that are NOT zero-filled below
PIVOTS = ['PRC', 'VOL', 'RET']
# The next six names are not referenced in the cells visible here
TTYPES = ['TREATING', 'TREATED']
TREATING = 'TREATING'
TREATED = 'TREATED'
TREATING_DT = 'TREATING_DISCONTINUED_IKT_DATE'
TREATED_DT = 'DISCONTINUED_IKT_DATE'
FLAGS = ['SAMEMKT_SAMETECH', 'SAMEMKT_DIFFTECH', 'DIFFMKT_SAMETECH']
# Not referenced in the cells visible here
CRSP_LONG_FNAME = 'PRC_RET_VOL_long'
# Passed as the last argument of drop.search_and_download
SEARCH_ONLINE = False 
# When true, generated datasets are also written out via drop.csv_upload_dataset
UPLOAD_DATA = True 

In [17]:
# Client object used by later cells for search_and_download and csv_upload_dataset
drop = DropboxAPI()

In [5]:
# Fetch the event-flag dataset through the Dropbox client helper
flag_ds_raw = drop.search_and_download(
    CRSP_FLAG_FNAME, DROPBOX_ANALYSIS_DATA, DATA, 'p', SEARCH_ONLINE)
# keep only the rows whose index value is not null (NaT)
flag_ds = flag_ds_raw[pd.notnull(flag_ds_raw.index)]
del flag_ds_raw

Searching local path
Full path /Users/lrraymond13/MIT/Kreiger_RA_2016/mkt_reaction_drug_failure/analysis/Data/crsp_event_flags.p


In [6]:
# Zero-fill missing values in every column whose second '_'-separated name
# token is not listed in PIVOTS (i.e. everything except price, volume and
# return columns).
to_fill = []
for col in flag_ds.columns:
    if col.split('_')[1] not in PIVOTS:
        to_fill.append(col)

flag_ds[to_fill] = flag_ds[to_fill].fillna(0)

In [7]:
def count_events_in_window(series_to_count, window_plus, window_minus, counting_fnc=None):
    """Count the events falling in a window around every row of a series.

    The backward-looking window ends at (and includes) the current row, t=0,
    and spans ``window_minus`` rows. The forward-looking window spans the
    ``window_plus`` rows *after* the current row and excludes t=0, so the
    current row is never counted twice.

    Parameters
    ----------
    series_to_count : pandas.Series
        Values to aggregate, in the row order the windows should follow.
    window_plus : int
        Number of rows after the current one to include.
    window_minus : int
        Number of rows ending at the current one (inclusive) to include.
    counting_fnc : callable, optional
        Called as ``counting_fnc(series, window=..., min_periods=0,
        center=False)`` and must return a Series aligned with its input.
        Defaults to a rolling sum built on ``Series.rolling(...).sum()``.
        The previous default, ``pd.rolling_sum``, no longer exists in current
        pandas and made this ``def`` fail at definition time; it can still be
        passed explicitly on pandas versions that provide it.

    Returns
    -------
    pandas.Series
        Backward count plus forward count for every row, with missing values
        on either side treated as 0.
    """
    def _rolling_sum(series, window, min_periods, center):
        return series.rolling(
            window=window, min_periods=min_periods, center=center).sum()

    if counting_fnc is None:
        counting_fnc = _rolling_sum

    # Forward-looking aggregate: roll over the reversed series, flip the result
    # back, then shift by one row so that t=0 is left out of the forward window.
    reversed_counts = counting_fnc(
        series_to_count[::-1], window=window_plus, min_periods=0, center=False)
    series_plus = reversed_counts[::-1].shift(-1)
    # Backward-looking aggregate (includes t=0)
    series_minus = counting_fnc(
        series_to_count, window=window_minus, min_periods=0, center=False)
    # Combine both sides; rows missing on one side contribute 0
    return series_minus.fillna(0).add(series_plus, fill_value=0)

In [8]:
def create_rolling_sums(flag_ds, tickers, window_plus, window_minus, sumcol_fmts):
    """Build the windowed event count for every ticker/flag column that exists.

    Each format in ``sumcol_fmts`` is filled with each ticker; names that are
    not columns of ``flag_ds`` are skipped.

    Returns a dict whose keys are the column name with ``_ROLLING_SUM``
    appended and whose values are the Series returned by
    ``count_events_in_window`` for that column.
    """
    rolling_sums = {}
    for ticker in tickers:
        for col_fmt in sumcol_fmts:
            col_name = col_fmt.format(ticker)
            # only columns actually present in the dataset get a rolling sum
            if col_name not in flag_ds.columns:
                continue
            rolling_sums['{}_ROLLING_SUM'.format(col_name)] = count_events_in_window(
                flag_ds.loc[:, col_name], window_plus, window_minus)
    return rolling_sums

In [9]:
def create_is_clean_series(ticker, flag_ds, flag_series_fmts, sum_series_fmt):
    """Sum a ticker's rolling-sum flag columns into one named series.

    Parameters
    ----------
    ticker : str
        Ticker used to fill in the column-name formats.
    flag_ds : pandas.DataFrame
        Frame that already contains the ``*_ROLLING_SUM`` columns.
    flag_series_fmts : iterable of str
        Formats of the flag columns (without the ``_ROLLING_SUM`` suffix)
        whose rolling sums are added together.
    sum_series_fmt : str
        Format of the name given to the resulting series.

    Returns
    -------
    tuple or None
        ``(name, series)`` where ``series`` is the row-wise sum over the
        requested rolling-sum columns that exist in ``flag_ds``; ``None``
        (after printing a message) when none of them exist.

    Notes
    -----
    Columns are filtered to those present before indexing. Indexing with a
    label list that contains any missing label raises ``KeyError`` on current
    pandas, which previously caused tickers with only *some* of the flags to
    be reported as having none and dropped.
    """
    wanted_cols = [f.format(ticker) + '_ROLLING_SUM' for f in flag_series_fmts]
    present_cols = [c for c in wanted_cols if c in flag_ds.columns]
    if wanted_cols and not present_cols:
        print('None of {} flags in index'.format(ticker))
        return None
    # row-wise sum returns a Series indexed like flag_ds
    return (sum_series_fmt.format(ticker), flag_ds[present_cols].sum(axis=1))
        
    

In [10]:
def create_flagged_event_df(flag_ds, window_plus, window_minus, sum_recipes, tickers=None):
    """Append rolling-sum columns and recipe-sum columns to the flag dataset.

    Parameters
    ----------
    flag_ds : pandas.DataFrame
        Event-flag dataset whose column names start with ``<ticker>_``.
    window_plus, window_minus : int
        Window sizes forwarded to ``create_rolling_sums``.
    sum_recipes : dict
        Maps the name format of a summed column to the list of flag-column
        formats whose rolling sums are added to produce it.
    tickers : iterable of str, optional
        Tickers to process. When empty or ``None``, every distinct first
        '_'-separated token of the column names is used.

    Returns
    -------
    pandas.DataFrame
        ``flag_ds`` plus the ``*_ROLLING_SUM`` columns plus one summed column
        per (recipe, ticker) that could be built, with columns sorted by name.

    Notes
    -----
    Uses ``print(...)`` and ``dict.items()`` instead of the Python-2-only
    ``print x`` statement and ``dict.iteritems()`` so the cell runs on both
    Python 2 and Python 3.
    """
    if not tickers:
        # take all unique tickers from the column names
        tickers = set(col.split('_')[0] for col in flag_ds.columns)
    flag_cols_used = set(funcy.flatten(sum_recipes.values()))
    rolling_sum_series_dict = create_rolling_sums(
        flag_ds, tickers, window_plus, window_minus, flag_cols_used)
    # concat horizontally: original columns followed by their rolling sums
    rolling_df = pd.concat(
        [flag_ds, pd.DataFrame(rolling_sum_series_dict)], axis=1)
    del rolling_sum_series_dict
    # one (name, series) tuple per recipe and ticker; None where no flag exists
    sum_tups = [
        create_is_clean_series(ticker, rolling_df, flag_col_fmts, sum_series_fmt)
        for (sum_series_fmt, flag_col_fmts) in sum_recipes.items()
        for ticker in tickers]
    existing_tups = [tup for tup in sum_tups if tup]
    df1 = pd.DataFrame(dict(existing_tups))
    print(df1.columns)
    rolling_flag_sum_df = pd.concat([rolling_df, df1], axis=1)
    # release the large intermediates before the column reordering below
    del rolling_df, sum_tups, existing_tups, df1
    # sort columns and return
    return rolling_flag_sum_df[sorted(rolling_flag_sum_df.columns)]

In [11]:
# Build one flagged dataset per (plus, minus) pair, stored in the same order
# as PLUS_MINUS_PERIODS.
regression_data_sets = []

for plus, minus in PLUS_MINUS_PERIODS:
    # the backward window also contains the event day itself, hence minus + 1
    ds = create_flagged_event_df(flag_ds, plus, minus + 1, FLAG_RECIPES)
    regression_data_sets.append(ds)
    if not UPLOAD_DATA:
        continue
    drop.csv_upload_dataset(
        ds, REG_DATA_FMT.format(plus, minus), DATA, DROPBOX_ANALYSIS_DATA)

	Series.rolling(min_periods=0,window=3,center=False).sum()
	Series.rolling(min_periods=0,window=4,center=False).sum()


None of LIFC flags in index
None of CPD flags in index
None of OLGC flags in index
None of MDRX flags in index
None of AVAN flags in index
None of NPRM flags in index
None of TGEN flags in index
None of NTMD flags in index
None of MIPI flags in index
None of NABI flags in index
None of TTP flags in index
None of IGL flags in index
None of BPAX flags in index
None of ARCP flags in index
None of FO flags in index
None of MDV flags in index
None of MMP flags in index
None of MITI flags in index
None of ERI flags in index
None of NRPH flags in index
None of TKMR flags in index
None of STD flags in index
None of RXII flags in index
None of PCYO flags in index
None of RGDO flags in index
None of NEOT flags in index
None of XOM flags in index
None of TRGT flags in index
None of CFR flags in index
None of NUVO flags in index
None of MBRX flags in index
None of PABK flags in index
None of MRX flags in index
None of VMRX flags in index
None of JCS flags in index
None of TPTX flags in index
None 

	Series.rolling(min_periods=0,window=6,center=False).sum()


None of LIFC flags in index
None of CPD flags in index
None of OLGC flags in index
None of MDRX flags in index
None of AVAN flags in index
None of NPRM flags in index
None of TGEN flags in index
None of NTMD flags in index
None of MIPI flags in index
None of NABI flags in index
None of TTP flags in index
None of IGL flags in index
None of BPAX flags in index
None of ARCP flags in index
None of FO flags in index
None of MDV flags in index
None of MMP flags in index
None of MITI flags in index
None of ERI flags in index
None of NRPH flags in index
None of TKMR flags in index
None of STD flags in index
None of RXII flags in index
None of PCYO flags in index
None of RGDO flags in index
None of NEOT flags in index
None of XOM flags in index
None of TRGT flags in index
None of CFR flags in index
None of NUVO flags in index
None of MBRX flags in index
None of PABK flags in index
None of MRX flags in index
None of VMRX flags in index
None of JCS flags in index
None of TPTX flags in index
None 

	Series.rolling(min_periods=0,window=5,center=False).sum()


None of LIFC flags in index
None of CPD flags in index
None of OLGC flags in index
None of MDRX flags in index
None of AVAN flags in index
None of NPRM flags in index
None of TGEN flags in index
None of NTMD flags in index
None of MIPI flags in index
None of NABI flags in index
None of TTP flags in index
None of IGL flags in index
None of BPAX flags in index
None of ARCP flags in index
None of FO flags in index
None of MDV flags in index
None of MMP flags in index
None of MITI flags in index
None of ERI flags in index
None of NRPH flags in index
None of TKMR flags in index
None of STD flags in index
None of RXII flags in index
None of PCYO flags in index
None of RGDO flags in index
None of NEOT flags in index
None of XOM flags in index
None of TRGT flags in index
None of CFR flags in index
None of NUVO flags in index
None of MBRX flags in index
None of PABK flags in index
None of MRX flags in index
None of VMRX flags in index
None of JCS flags in index
None of TPTX flags in index
None 

In [22]:
# write function to count events with flags

def count_sums_df(flag_df, series_events, series_exclusions=None, threshold=None):
    """Total the event columns over the rows that pass the exclusion filter.

    Parameters
    ----------
    flag_df : pandas.DataFrame
        Dataset holding the event columns and the exclusion columns.
    series_events : list of str
        Names of the event columns to total.
    series_exclusions : list of str, optional
        Names of columns that must all be strictly below ``threshold`` for a
        row to be counted. ``None`` (or no such column present) keeps every row.
    threshold : number, optional
        Required whenever ``series_exclusions`` is given.

    Returns
    -------
    number or None
        Sum of all counted values; ``0`` (after printing a message) when none
        of the event columns exist; ``None`` (after printing a message) when
        exclusions are given without a threshold.

    Notes
    -----
    Both column lists are reduced to the columns present in ``flag_df`` before
    indexing. On current pandas, a label list containing any missing label
    raises ``KeyError``: for events that turned a partially available list
    into a count of 0, and for exclusions the error was not caught at all.
    """
    if series_exclusions is not None and threshold is None:
        print('Must specify threshold level')
        return None
    df_cols = flag_df.columns
    series_events = list(series_events)

    present_exclusions = [c for c in (series_exclusions or []) if c in df_cols]
    if present_exclusions:
        # a row is kept only if every exclusion column is below the threshold
        mask = (flag_df[present_exclusions] < threshold).all(axis=1)
    else:
        # nothing to exclude on: keep every row
        mask = np.ones(flag_df.shape[0], dtype=bool)

    present_events = [c for c in series_events if c in df_cols]
    if series_events and not present_events:
        print('Events {} do not exist'.format(' '.join(series_events)))
        return 0
    # sum down each event column, then across the columns
    return flag_df.loc[mask, present_events].sum(axis=0).sum()

def create_ticker_event_exclusion_dict(ticker, event_series_fmts, flags_to_exclude_fmts):
    """Fill the event and exclusion column-name formats with one ticker.

    Returns a dict with two entries: ``'EVENTS'`` is the list of event column
    names to count, and ``'EXCLUSIONS'`` is the list of flag / rolling-sum
    column names to filter on, or ``None`` when ``flags_to_exclude_fmts`` is
    ``None``.
    """
    def fill_in(fmts):
        return [fmt.format(ticker) for fmt in fmts]

    event_cols = fill_in(event_series_fmts)
    exclusion_cols = None
    if flags_to_exclude_fmts is not None:
        exclusion_cols = fill_in(flags_to_exclude_fmts)
    return dict(EVENTS=event_cols, EXCLUSIONS=exclusion_cols)


def count_events_with_flags(
    flag_df, tickers, event_series_fmts, flags_to_exclude_fmts, threshold):
    """Total the event counts of several tickers.

    For each ticker the event formats and exclusion formats are filled in with
    ``create_ticker_event_exclusion_dict`` and the resulting columns are
    counted with ``count_sums_df`` (rows are filtered on the exclusion columns
    and ``threshold`` there). The per-ticker counts are added up, starting
    from 0.
    """
    total = 0
    for ticker in tickers:
        cols = create_ticker_event_exclusion_dict(
            ticker, event_series_fmts, flags_to_exclude_fmts)
        total = total + count_sums_df(
            flag_df, cols['EVENTS'], cols['EXCLUSIONS'], threshold)
    return total


In [23]:
# for each of the tickers in the data set, we want to count the number of own event failures without any other
# own fails in window, also own fails without any other treated, and total own fails without any own fails or treated
def create_event_counts_fnc_dict(flag_ds, set_tickers):
    """Build the two-level table of zero-argument event counters.

    The outer keys are count categories and the inner keys are the individual
    counts. Every value is a ``funcy.func_partial`` of
    ``count_events_with_flags`` bound to ``flag_ds``, ``set_tickers``, a list
    of event-column formats, an optional one-element list of exclusion-column
    formats, and the threshold used with that exclusion.
    """
    def counter(event_fmts, exclusion_fmt=None, threshold=None):
        # each partial gets its own list objects, as in a literal per entry
        exclusions = None if exclusion_fmt is None else [exclusion_fmt]
        return funcy.func_partial(
            count_events_with_flags, flag_ds, set_tickers,
            list(event_fmts), exclusions, threshold)

    own_fails = [OWN_FAILS_FMT]
    all_treated = [TREATED_DM_ST, TREATED_SM_DT, TREATED_SM_ST]

    return {
        'TOTAL_COMPANY_FAILURE_EVENTS': {
            # total company drug fail events
            'OWN_FAILURES': counter(own_fails),
            # own fails, excluding events with same-company fails in window
            'OWN_FAILS_NO_OWN_CONFOUNDING_EVENTS': counter(
                own_fails, OWN_FAILS_CE_FMT, 2),
            # own fails, excluding events with other treated events (of any
            # flag type) in window
            'OWN_FAILS_NO_OTHER_TREATED_CONFOUNDING_EVENTS': counter(
                own_fails, NO_TREATED_CE_FMT, 1),
            # own fails, excluding events with other own-company fails OR any
            # other treated events of any type in window
            'OWN_FAILS_NO_OWN_NO_OTHER_TREATED_CONFOUNDING_EVENTS': counter(
                own_fails, NO_OWN_NO_TREATED_CE_FMT, 2),
            # total counts of treated events
            'TOTAL_TREATED_EVENTS_ALL': counter(all_treated),
            'TOTAL_TREATED_EVENTS_SAMEMKT_DIFFTECH': counter([TREATED_SM_DT]),
            'TOTAL_TREATED_EVENTS_SAMEMKT_SAMETECH': counter([TREATED_SM_ST]),
            'TOTAL_TREATED_EVENTS_DIFFMKT_SAMETECH': counter([TREATED_DM_ST]),
            # threshold 1: any own-company event in the window disqualifies
            # the treated event
            'TREATED_EXCLUDE_OWN_COMPANY_CE': counter(
                all_treated, OWN_FAILS_CE_FMT, 1),
            # exclude all other treated events, including target
            'TREATED_EXCLUDE_OTHER_TREATED': counter(
                all_treated, NO_TREATED_CE_FMT, 2),
            # exclude all other treated events, including target, and own
            # company failures
            'TREATED_EVENTS_EXCLUDE_OWN_COMPANY_OTHER_TREATED': counter(
                all_treated, NO_OWN_NO_TREATED_CE_FMT, 2),
        },
        # treated events of each type, excluding own company drug failures
        'TREATED_EXCLUDE_OWN_COMPANY_CE': {
            'SAMEMKT_DIFFTECH': counter([TREATED_SM_DT], OWN_FAILS_CE_FMT, 1),
            'SAMEMKT_SAMETECH': counter([TREATED_SM_ST], OWN_FAILS_CE_FMT, 1),
            'DIFFMKT_SAMETECH': counter([TREATED_DM_ST], OWN_FAILS_CE_FMT, 1),
        },
        # treated events of each type, excluding all other treated events
        # including target
        'TREATED_EXCLUDE_OTHER_TREATED': {
            'SAMEMKT_DIFFTECH': counter([TREATED_SM_DT], NO_TREATED_CE_FMT, 2),
            'SAMEMKT_SAMETECH': counter([TREATED_SM_ST], NO_TREATED_CE_FMT, 2),
            'DIFFMKT_SAMETECH': counter([TREATED_DM_ST], NO_TREATED_CE_FMT, 2),
        },
        # treated events of each type, excluding all other treated events and
        # target company failures
        'TREATED_EVENTS_EXCLUDE_OWN_COMPANY_OTHER_TREATED': {
            'SAMEMKT_SAMETECH': counter(
                [TREATED_SM_ST], NO_OWN_NO_TREATED_CE_FMT, 2),
            'SAMEMKT_DIFFTECH': counter(
                [TREATED_SM_DT], NO_OWN_NO_TREATED_CE_FMT, 2),
            'DIFFMKT_SAMETECH': counter(
                [TREATED_DM_ST], NO_OWN_NO_TREATED_CE_FMT, 2),
        },
    }
        

In [52]:
def calculate_counts(flag_df, window_plus, window_minus, upload_data=UPLOAD_DATA):
    """Evaluate every event counter for one dataset and collect the totals.

    Parameters
    ----------
    flag_df : pandas.DataFrame
        Flagged dataset whose column names start with ``<ticker>_``.
    window_plus, window_minus : int
        Only used to name the uploaded file.
    upload_data : bool, optional
        When true, the result is written out through the notebook-level
        ``drop`` client. The default is the value ``UPLOAD_DATA`` had when
        this function was defined.

    Returns
    -------
    pandas.Series
        One value per counter, indexed by ``'<category>_<count name>'``.

    Notes
    -----
    Uses ``dict.items()`` rather than the Python-2-only ``dict.iteritems()``,
    and builds the result Series directly instead of concatenating one-element
    Series and reading the first row back out.
    """
    set_tickers = set(col.split('_')[0] for col in flag_df.columns)
    counts_dict = create_event_counts_fnc_dict(flag_df, set_tickers)
    count_names = []
    count_values = []
    for category, subsets in counts_dict.items():
        for flag_type, fnc_obj in subsets.items():
            count_names.append('_'.join([category, flag_type]))
            # each entry is a zero-argument callable returning the total
            count_values.append(fnc_obj())
    upload_ser = pd.Series(data=count_values, index=count_names)
    # upload to csv
    if upload_data:
        drop.csv_upload_dataset(
            upload_ser, 'event_statsv2_plus_{0}_minus_{1}.csv'.format(
                window_plus, window_minus), DATA, DROPBOX_ANALYSIS_DATA)
    return upload_ser
    

In [None]:
# Run the event counts for each dataset, pairing it with the window sizes at
# the same position in PLUS_MINUS_PERIODS.
for data_ind in range(len(PLUS_MINUS_PERIODS)):
    plus, minus = PLUS_MINUS_PERIODS[data_ind]
    calculate_counts(regression_data_sets[data_ind], plus, minus)


Events LIFC_TREATED_SAMEMKT_DIFFTECH do not exist
Events CPD_TREATED_SAMEMKT_DIFFTECH do not exist
Events OLGC_TREATED_SAMEMKT_DIFFTECH do not exist
Events MDRX_TREATED_SAMEMKT_DIFFTECH do not exist
Events AVAN_TREATED_SAMEMKT_DIFFTECH do not exist
Events NPRM_TREATED_SAMEMKT_DIFFTECH do not exist
Events TGEN_TREATED_SAMEMKT_DIFFTECH do not exist
Events OTIC_TREATED_SAMEMKT_DIFFTECH do not exist
Events NTMD_TREATED_SAMEMKT_DIFFTECH do not exist
Events MIPI_TREATED_SAMEMKT_DIFFTECH do not exist
Events NABI_TREATED_SAMEMKT_DIFFTECH do not exist
Events TTP_TREATED_SAMEMKT_DIFFTECH do not exist
Events IGL_TREATED_SAMEMKT_DIFFTECH do not exist
Events BPAX_TREATED_SAMEMKT_DIFFTECH do not exist
Events ARCP_TREATED_SAMEMKT_DIFFTECH do not exist
Events FO_TREATED_SAMEMKT_DIFFTECH do not exist
Events MDV_TREATED_SAMEMKT_DIFFTECH do not exist
Events MMP_TREATED_SAMEMKT_DIFFTECH do not exist
Events MITI_TREATED_SAMEMKT_DIFFTECH do not exist
Events ERI_TREATED_SAMEMKT_DIFFTECH do not exist
Events N

In [None]:
# Spot check on the first dataset: total the '{}_DISC' events of ABBV and ZGNX
# over rows where the ticker's '{}_NO_OWN' column is below 2.
count_events_with_flags(
    regression_data_sets[0], ['ABBV', 'ZGNX'], ['{}_DISC'], ['{}_NO_OWN'], 2)
# count_events_with_flags(
#     regression_data_sets[0], ['ZGNX'], ['{}_DISC'], ['{}_NO_OWN'], 2)

In [None]:
# ABBV columns selected for manual inspection in the cell further below; the
# price / return / volume columns are commented out of the selection.
abbv_cols = [
 'ABBV_DISC',
#  'ABBV_PRC',
#  'ABBV_RET',
#  'ABBV_VOL'
 'ABBV_TREATED_DIFFMKT_SAMETECH',
 'ABBV_TREATED_DIFFMKT_SAMETECH_ROLLING_SUM',
 'ABBV_TREATED_SAMEMKT_DIFFTECH',
 'ABBV_TREATED_SAMEMKT_DIFFTECH_ROLLING_SUM',
 'ABBV_TREATED_SAMEMKT_SAMETECH',
 'ABBV_TREATED_SAMEMKT_SAMETECH_ROLLING_SUM',
 'ABBV_TREATING_DIFFMKT_SAMETECH',
 'ABBV_TREATING_SAMEMKT_DIFFTECH',
 'ABBV_TREATING_SAMEMKT_SAMETECH', 
 'ABBV_NO_OWN', 
'ABBV_NO_OWN_NO_OTHER_TREATED']


In [None]:
# Show the first 20 column names of the first generated dataset
regression_data_sets[0].columns[:20]

In [None]:
# Show the abbv_cols columns for rows of the first dataset where ABBV_DISC is
# positive and ABBV_NO_OWN is below 2.
# df.loc[df['ABBV_TREATED_DIFFMKT_SAMETECH_ROLLING_SUM'] > 0, abbv_cols]
regression_data_sets[0].loc[
    (regression_data_sets[0]['ABBV_NO_OWN'] < 2) & (regression_data_sets[0]['ABBV_DISC'] > 0), abbv_cols]
# regression_data_sets[0].loc[regression_data_sets[0].index > datetime(2008, 9, 25), abbv_cols]