In [None]:
# Import the event dataset
# Add rolling-window columns for each of the treated columns and for the company's own drug trials
# Create a function that subsets the dataset to events considered clean by the criteria

In [2]:
# Combine and Clean CRSP Data
from datetime import datetime
import funcy
import numpy as np
import pandas as pd
import os

from analysis.utilities.dropbox_client import DropboxAPI

# Dropbox folder handed to drop.search_and_download when fetching the event-flag dataset
DROPBOX_ANALYSIS_DATA = '/intermediate_data'
# NOTE(review): presumably the Dropbox folder holding the raw CRSP files -- confirm where it is used
DROPBOX_CRSP = '/data/CRSP'
# Local data directory, resolved against the current working directory of the kernel
DATA = os.path.join(os.path.abspath(os.curdir), 'analysis/Data')
# File-name stem of the event-flag dataset requested from DropboxAPI
CRSP_FLAG_FNAME = 'crsp_event_flags'
# NOTE(review): presumably the file-name stem of a treating/treated ticker mapping -- confirm where it is used
TT_FNAME = 'treating_treated_ticker'

In [3]:
# Second '_'-separated token of the price / volume / return columns (e.g. 'ABBV_PRC');
# columns carrying one of these tokens are excluded from the zero-fill step
PIVOTS = ['PRC', 'VOL', 'RET']
# The two role labels that appear in flag column names such as 'ABBV_TREATED_SAMEMKT_SAMETECH'
TTYPES = ['TREATING', 'TREATED']
TREATING = 'TREATING'
TREATED = 'TREATED'
# NOTE(review): presumably the date-column names for treating / treated discontinuation events -- confirm
TREATING_DT = 'TREATING_DISCONTINUED_IKT_DATE'
TREATED_DT = 'DISCONTINUED_IKT_DATE'
# Market/technology relationship suffixes used in the flag column names
FLAGS = ['SAMEMKT_SAMETECH', 'SAMEMKT_DIFFTECH', 'DIFFMKT_SAMETECH']
# NOTE(review): presumably the file-name stem of the long-format price/return/volume data -- confirm
CRSP_LONG_FNAME = 'PRC_RET_VOL_long'
# Passed as the last argument to drop.search_and_download; with False the captured
# output reports "Searching local path"
SEARCH_ONLINE = False 

In [4]:
# Client from analysis.utilities.dropbox_client, used below to locate and load datasets
drop = DropboxAPI()

In [5]:
# Load the event-flag dataset through the Dropbox client
flag_ds_raw = drop.search_and_download(
    CRSP_FLAG_FNAME, DROPBOX_ANALYSIS_DATA, DATA, 'p', SEARCH_ONLINE)
# Keep only the rows whose index value is not null (drops NaT entries)
flag_ds = flag_ds_raw[pd.notnull(flag_ds_raw.index)]
# The unfiltered frame is no longer needed; release the reference
del flag_ds_raw

Searching local path
Full path /Users/lrraymond13/MIT/Kreiger_RA_2016/mkt_reaction_drug_failure/analysis/Data/crsp_event_flags.p


In [6]:
# Replace missing values with 0 in every column whose second '_'-separated
# name token is not listed in PIVOTS (i.e. everything except PRC / VOL / RET)
to_fill = []
for col in flag_ds.columns:
    if col.split('_')[1] not in PIVOTS:
        to_fill.append(col)

flag_ds[to_fill] = flag_ds[to_fill].fillna(0)

In [68]:
def _rolling_sum(series, window, min_periods, center):
    """Rolling sum via the ``Series.rolling`` API (replacement for ``pd.rolling_sum``)."""
    return series.rolling(window=window, min_periods=min_periods, center=center).sum()


def count_events_in_window(series_to_count, window_plus, window_minus, counting_fnc=None):
    """Aggregate events in an asymmetric window around every row of a series.

    For the row at position t the result combines
      * the ``window_minus`` rows ending at t (row t itself IS included), and
      * the ``window_plus`` rows following t (row t itself is NOT included),
    so row t is never counted twice.

    Parameters
    ----------
    series_to_count : pandas.Series
        Values to aggregate; rows are used in their existing order.
    window_plus : int
        Number of rows after t to include.
    window_minus : int
        Number of rows up to and including t to include.
    counting_fnc : callable, optional
        Aggregator invoked as
        ``counting_fnc(series, window=..., min_periods=0, center=False)``.
        When omitted, a rolling sum based on ``Series.rolling(...).sum()`` is
        used. The previous default, ``pd.rolling_sum``, is deprecated and was
        evaluated at definition time, so the ``def`` itself fails on pandas
        versions that no longer ship it.

    Returns
    -------
    pandas.Series
        Backward-looking aggregate plus forward-looking aggregate, aligned on
        the index of ``series_to_count``.
    """
    if counting_fnc is None:
        counting_fnc = _rolling_sum
    # Forward-looking aggregate: roll over the reversed series, flip it back,
    # then shift by one row so that row t itself is excluded from the "plus" side.
    series_plus = counting_fnc(
        series_to_count[::-1], window=window_plus, min_periods=0, center=False)[::-1].shift(-1)
    # Backward-looking aggregate, which includes row t.
    series_minus = counting_fnc(
        series_to_count, window=window_minus, min_periods=0, center=False)
    # The shift leaves a NaN on the last row of series_plus; treat it as 0.
    total_events = series_minus.fillna(0).add(series_plus, fill_value=0)
    return total_events

In [69]:
def create_rolling_sums(flag_ds, tickers, window_plus, window_minus, sumcol_fmts):
    """Build a windowed event count for every ticker/flag column present in ``flag_ds``.

    Parameters
    ----------
    flag_ds : pandas.DataFrame
        Frame holding the flag columns.
    tickers : iterable of str
        Tickers substituted into each column-name format.
    window_plus, window_minus : int
        Forwarded unchanged to ``count_events_in_window``.
    sumcol_fmts : iterable of str
        Column-name formats with one ``{}`` placeholder for the ticker,
        e.g. ``'{}_TREATED_SAMEMKT_SAMETECH'``.

    Returns
    -------
    dict
        Maps ``'<column>_ROLLING_SUM'`` to the series returned by
        ``count_events_in_window`` for that column. Candidate names that are
        not columns of ``flag_ds`` are skipped.
    """
    sumcol_names = [colfmt.format(t) for t in tickers for colfmt in sumcol_fmts]
    # print(...) with a single argument emits the same text on Python 2 and 3
    # and matches the call style already used in create_is_clean_series.
    print(sumcol_names)
    # A list comprehension (rather than filter) stays a concrete, printable,
    # re-iterable list on Python 3 as well.
    existing_names = [name for name in sumcol_names if name in flag_ds.columns]
    print(existing_names)
    return {
        '{}_ROLLING_SUM'.format(cname): count_events_in_window(
            flag_ds.loc[:, cname], window_plus, window_minus)
        for cname in existing_names}

In [82]:
def create_is_clean_series(ticker, flag_ds, flag_series_fmts, sum_series_fmt):
    """Sum a ticker's rolling-sum flag columns row by row.

    Parameters
    ----------
    ticker : str
        Ticker string substituted into the formats.
    flag_ds : pandas.DataFrame
        Frame that already contains the ``*_ROLLING_SUM`` columns.
    flag_series_fmts : iterable of str
        Formats of the flag columns to sum across; ``'_ROLLING_SUM'`` is
        appended to each formatted name.
    sum_series_fmt : str
        Format of the name given to the returned series.

    Returns
    -------
    tuple or None
        ``(name, series)`` where ``series`` is the row-wise sum over those
        requested columns that exist in ``flag_ds``; ``None`` (after printing
        a message) when columns were requested but none of them exist.
    """
    flag_series_colnames = [f.format(ticker) + '_ROLLING_SUM' for f in flag_series_fmts]
    # Select the existing columns explicitly. Relying on ``.loc`` with a list to
    # raise KeyError only when *no* label matches breaks on pandas versions where
    # any single missing label raises, which would silently drop tickers that
    # have only some of the flags.
    present_colnames = [c for c in flag_series_colnames if c in flag_ds.columns]
    if flag_series_colnames and not present_colnames:
        print('None of {} flags in index'.format(ticker))
        return None
    # sum(axis=1) returns one value per row, i.e. a Series
    return (sum_series_fmt.format(ticker), flag_ds.loc[:, present_colnames].sum(axis=1))
        
    

In [83]:
def create_flagged_event_df(flag_ds, window_plus, window_minus, sum_recipes, tickers=None):
    """Append rolling-sum columns and recipe-defined total columns to ``flag_ds``.

    Parameters
    ----------
    flag_ds : pandas.DataFrame
        Frame holding the flag columns.
    window_plus, window_minus : int
        Window sizes forwarded to ``create_rolling_sums``.
    sum_recipes : dict
        Key is the format of a total column name (e.g. ``'{}_ALLTREATED'``);
        value is the list of flag-column formats whose rolling sums are added
        together to produce that total.
    tickers : sequence of str, optional
        Tickers to process. When ``None`` or empty, every distinct first
        '_'-separated token of the column names of ``flag_ds`` is used.

    Returns
    -------
    pandas.DataFrame
        ``flag_ds`` with the ``*_ROLLING_SUM`` columns and the recipe total
        columns concatenated horizontally.
    """
    # Explicit None/empty test: ``not tickers`` raises for array-like inputs
    # (numpy arrays, pandas Index) holding more than one element.
    if tickers is None or len(tickers) == 0:
        # take all unique tickers from the dataframe's column names
        tickers = set(col.split('_')[0] for col in flag_ds.columns)
    flag_cols_used = set(funcy.flatten(sum_recipes.values()))
    rolling_sum_series_dict = create_rolling_sums(
        flag_ds, tickers, window_plus, window_minus, flag_cols_used)
    # Diagnostics kept from the original; the call form works on Python 2 and 3.
    print(tickers)
    print(flag_cols_used)
    roll_df = pd.DataFrame(rolling_sum_series_dict)
    print(roll_df.columns)
    print(roll_df.head())
    # concat df horizontally
    rolling_df = pd.concat([flag_ds, roll_df], axis=1)
    del roll_df
    # then build the total series for every (recipe, ticker) pair;
    # .items() is available on both Python 2 and 3, unlike .iteritems()
    sum_tups = [
        create_is_clean_series(ticker, rolling_df, flag_col_fmts, sum_series_fmt)
        for (sum_series_fmt, flag_col_fmts) in sum_recipes.items()
        for ticker in tickers]
    # create_is_clean_series returns None for tickers without any matching flags
    existing_tups = [tup for tup in sum_tups if tup is not None]
    # append the totals, clean up memory and return the final df
    rolling_flag_sum_df = pd.concat([rolling_df, pd.DataFrame(dict(existing_tups))], axis=1)
    del rolling_df, sum_tups
    return rolling_flag_sum_df

In [84]:
# Build the rolling-sum and ALLTREATED columns for ABBV only:
# window_plus=3 rows after each date, window_minus=5 rows up to and including it
df = create_flagged_event_df(
    flag_ds,
    window_plus=3,
    window_minus=5,
    sum_recipes={
        '{}_ALLTREATED': [
            '{}_TREATED_SAMEMKT_SAMETECH',
            '{}_TREATED_DIFFMKT_SAMETECH',
            '{}_TREATED_SAMEMKT_DIFFTECH',
        ],
    },
    tickers=['ABBV']
)

['ABBV_TREATED_SAMEMKT_SAMETECH', 'ABBV_TREATED_SAMEMKT_DIFFTECH', 'ABBV_TREATED_DIFFMKT_SAMETECH']
['ABBV_TREATED_SAMEMKT_SAMETECH', 'ABBV_TREATED_SAMEMKT_DIFFTECH', 'ABBV_TREATED_DIFFMKT_SAMETECH']
['ABBV']
set(['{}_TREATED_SAMEMKT_SAMETECH', '{}_TREATED_SAMEMKT_DIFFTECH', '{}_TREATED_DIFFMKT_SAMETECH'])
Index([u'ABBV_TREATED_DIFFMKT_SAMETECH_ROLLING_SUM',
       u'ABBV_TREATED_SAMEMKT_DIFFTECH_ROLLING_SUM',
       u'ABBV_TREATED_SAMEMKT_SAMETECH_ROLLING_SUM'],
      dtype='object')
            ABBV_TREATED_DIFFMKT_SAMETECH_ROLLING_SUM  \
1990-01-02                                        0.0   
1990-01-03                                        0.0   
1990-01-04                                        0.0   
1990-01-05                                        0.0   
1990-01-08                                        0.0   

            ABBV_TREATED_SAMEMKT_DIFFTECH_ROLLING_SUM  \
1990-01-02                                        0.0   
1990-01-03                                        0.0

	Series.rolling(min_periods=0,window=3,center=False).sum()
	Series.rolling(min_periods=0,window=5,center=False).sum()


In [85]:
# ABBV columns to inspect, in display order: recipe total and market data first,
# then each TREATED flag next to its rolling sum, then the TREATING flags, then volume
abbv_cols = (
    ['ABBV_ALLTREATED', 'ABBV_DISC', 'ABBV_PRC', 'ABBV_RET']
    + [
        'ABBV_TREATED_DIFFMKT_SAMETECH',
        'ABBV_TREATED_DIFFMKT_SAMETECH_ROLLING_SUM',
        'ABBV_TREATED_SAMEMKT_DIFFTECH',
        'ABBV_TREATED_SAMEMKT_DIFFTECH_ROLLING_SUM',
        'ABBV_TREATED_SAMEMKT_SAMETECH',
        'ABBV_TREATED_SAMEMKT_SAMETECH_ROLLING_SUM',
    ]
    + [
        'ABBV_TREATING_DIFFMKT_SAMETECH',
        'ABBV_TREATING_SAMEMKT_DIFFTECH',
        'ABBV_TREATING_SAMEMKT_SAMETECH',
    ]
    + ['ABBV_VOL']
)

In [86]:
# Alternative view (disabled): only rows where the DIFFMKT_SAMETECH rolling sum is positive
# df.loc[df['ABBV_TREATED_DIFFMKT_SAMETECH_ROLLING_SUM'] > 0, abbv_cols]
# Display the ABBV columns for every row whose index is later than 2013-02-20
df.loc[df.index > datetime(2013, 2, 20), abbv_cols]

Unnamed: 0,ABBV_ALLTREATED,ABBV_DISC,ABBV_PRC,ABBV_RET,ABBV_TREATED_DIFFMKT_SAMETECH,ABBV_TREATED_DIFFMKT_SAMETECH_ROLLING_SUM,ABBV_TREATED_SAMEMKT_DIFFTECH,ABBV_TREATED_SAMEMKT_DIFFTECH_ROLLING_SUM,ABBV_TREATED_SAMEMKT_SAMETECH,ABBV_TREATED_SAMEMKT_SAMETECH_ROLLING_SUM,ABBV_TREATING_DIFFMKT_SAMETECH,ABBV_TREATING_SAMEMKT_DIFFTECH,ABBV_TREATING_SAMEMKT_SAMETECH,ABBV_VOL
2013-02-21,0.0,0.0,38.78,0.004403,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7261100.0
2013-02-22,1.0,0.0,38.46,-0.008252,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,4490800.0
2013-02-25,2.0,0.0,37.37,-0.028341,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,4665700.0
2013-02-26,3.0,0.0,37.09,-0.007493,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,4462900.0
2013-02-27,3.0,0.0,36.73,-0.009706,0.0,1.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,12238500.0
2013-02-28,4.0,0.0,36.92,0.005173,1.0,1.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,10322500.0
2013-03-01,4.0,0.0,37.81,0.024106,0.0,1.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,8929400.0
2013-03-04,4.0,0.0,38.24,0.011373,0.0,1.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,5420700.0
2013-03-05,4.0,0.0,37.51,-0.019090,0.0,1.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,6650300.0
2013-03-06,3.0,0.0,37.74,0.006132,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,5489700.0
