In [1]:
import pandas as pd
import numpy as np

In [2]:
class feature_engine:
    """Feature-engineering helpers that line tweet timestamps up with minute-level market data.

    The modules used by the methods are bound as class attributes (``self.pd``,
    ``self.np``, ``self.datetime``, ``self.random``, ``self.mcal``) so the class does
    not depend on names imported by the surrounding notebook.
    """

    import datetime
    import random
    import numpy as np
    import pandas as pd
    try:
        import pandas_market_calendars as mcal
    except ImportError:
        # Only the calendar-based methods need this package; they raise on first use.
        mcal = None

    # Session bounds handed to the exchange calendar.
    # NOTE(review): presumably expressed in the clock the minute data uses - confirm.
    _SESSION_OPEN = datetime.time(5, 30)
    _SESSION_CLOSE = datetime.time(12, 0)
    _VALID_METHODS = ('both', 'forward', 'backward')

    def __init__(self):
        """Print a short version banner."""
        print('feature_engine V.0.1 \nImported pandas,datetime,pandas_market_calendars,numpy packages')

    # ------------------------------------------------------------------ helpers

    def _interval_delta(self, interval, amount):
        """Translate ``('minutes' | 'hours', amount)`` into a ``timedelta``."""
        if interval == 'minutes':
            return self.datetime.timedelta(minutes=amount)
        if interval == 'hours':
            return self.datetime.timedelta(hours=amount)
        raise ValueError(f"interval must be 'minutes' or 'hours', got {interval!r}")

    def _require_mcal(self):
        """Return the market-calendar module or raise a clear error when it is missing."""
        if self.mcal is None:
            raise ImportError('pandas_market_calendars is required for market-calendar features')
        return self.mcal

    def _market_schedule(self, stock_exchg_name, start_date, end_date):
        """Return the exchange schedule for a date range, built once per instance and cached."""
        cache = self.__dict__.setdefault('_schedule_cache', {})
        key = (stock_exchg_name, start_date, end_date)
        if key not in cache:
            calendar = self._require_mcal().get_calendar(stock_exchg_name, open_time=self._SESSION_OPEN,
                                                         close_time=self._SESSION_CLOSE)
            cache[key] = calendar.schedule(start_date, end_date)
        return cache[key]

    def _collect_feature_columns(self, twitter_data, tweet_time_col_name, interval_amount_dict, compute):
        """Run ``compute(tweet_time, interval, amount)`` per tweet and interval and stack the results.

        ``compute`` returns either a one-row DataFrame or a float NaN (meaning "no
        feature"). Tweets for which every call returned a float are left out of the
        result, so the returned frame is indexed by a subset of ``twitter_data.index``.
        """
        try:
            from tqdm import tqdm
        except ImportError:
            # The progress bar is cosmetic; fall back to plain iteration.
            def tqdm(iterable, **_):
                return iterable

        if isinstance(interval_amount_dict, dict):
            pairs = list(interval_amount_dict.items())
        else:
            pairs = list(interval_amount_dict)

        labels, records = [], []
        for label, row in tqdm(twitter_data.iterrows(), total=len(twitter_data)):
            record = {}
            for interval, amount in pairs:
                features = compute(row[tweet_time_col_name], interval, amount)
                if isinstance(features, float):
                    continue
                record.update(features.iloc[0].to_dict())
            if record:
                labels.append(label)
                records.append(record)
        # Building the frame once avoids the quadratic cost of enlarging it cell by cell.
        return self.pd.DataFrame(records, index=labels)

    # ------------------------------------------------------------ price changes

    def calc_price_change(self,tweet_time,financial_data,time_col_name,price_col_name,interval,amount,method='both',
                          logdiff=True):
        """Price change between the tweet minute and a minute ``amount`` intervals away.

        Parameters
        ----------
        tweet_time : datetime-like with a ``replace`` method
            Seconds and microseconds are dropped before matching against ``financial_data``.
        financial_data : DataFrame with ``time_col_name`` and ``price_col_name`` columns.
        interval : ``'minutes'`` or ``'hours'``.
        amount : number of intervals to look ahead and/or back.
        method : ``'both'``, ``'forward'`` or ``'backward'``.
        logdiff : if True return the difference of logs, otherwise ``later / earlier - 1``.

        Returns
        -------
        float NaN when no row matches the tweet minute; otherwise a one-row DataFrame
        (index ``[0]``) with ``{amount}_{interval}_forward_pct_change`` and/or
        ``{amount}_{interval}_backward_pct_change``. A missing comparison minute yields NaN
        in that column.

        Raises
        ------
        ValueError for an unknown ``interval`` or ``method``.
        """
        if method not in self._VALID_METHODS:
            raise ValueError(f"method must be one of {self._VALID_METHODS}, got {method!r}")
        offset = self._interval_delta(interval, amount)

        # Drop sub-minute precision to match the minute-level financial data.
        tweet_time = tweet_time.replace(second=0, microsecond=0)
        times = financial_data[time_col_name]
        prices = financial_data[price_col_name]

        def price_at(moment):
            matches = prices[times == moment]
            return matches.iloc[0] if len(matches) else None

        base_price = price_at(tweet_time)
        if base_price is None:
            return float('NaN')

        def change(later, earlier):
            if logdiff:
                return self.np.log(later) - self.np.log(earlier)
            return (later / earlier) - 1

        result = {}
        if method in ('both', 'forward'):
            future_price = price_at(tweet_time + offset)
            result[f'{amount}_{interval}_forward_pct_change'] = (
                float('NaN') if future_price is None else change(future_price, base_price))
        if method in ('both', 'backward'):
            past_price = price_at(tweet_time - offset)
            result[f'{amount}_{interval}_backward_pct_change'] = (
                float('NaN') if past_price is None else change(base_price, past_price))
        return self.pd.DataFrame(result, index=[0])

    def create_pricechg_columns(self,twitter_data,tweet_time_col_name,financial_data,fin_time_col_name,price_col_name,
                       interval_amount_dict,method='both',logdiff=True):
        """Apply :meth:`calc_price_change` to every tweet for every requested interval.

        ``interval_amount_dict`` may be an iterable of ``(interval, amount)`` pairs or a
        dict mapping interval to amount. Returns a DataFrame indexed by the labels of the
        tweets for which at least one price change could be computed.
        """
        def compute(tweet_time, interval, amount):
            return self.calc_price_change(tweet_time, financial_data, fin_time_col_name, price_col_name,
                                          interval, amount, method, logdiff)

        return self._collect_feature_columns(twitter_data, tweet_time_col_name, interval_amount_dict, compute)

    # ------------------------------------------------------------ mean encoding

    def mean_encoding_tocolumn(self,features_df,categorical_column,target_column):
        """Return, for each row, the mean of ``target_column`` within that row's category.

        The result keeps ``features_df``'s index (so it can be assigned straight back) and
        is named ``{target_column}_y``. Rows whose category is missing get NaN.
        """
        encoded = features_df.groupby(categorical_column)[target_column].transform('mean')
        return encoded.rename(f'{target_column}_y')

    def mean_encoding_todict(self,features_df,categorical_column,target_column):
        """Return the per-category mean of ``target_column`` as a DataFrame indexed by category.

        Despite the name, the result is a DataFrame, not a dict.
        """
        pair = features_df[[categorical_column, target_column]]
        return pair.groupby([categorical_column]).mean()

    # ---------------------------------------------------------- traded volume

    def calculate_sum_volume(self,tweet_time,financial_data,time_col_name,volume_col_name,interval,amount,
                             stock_exchg_name='NYSE',schedule_start='2009-05-04',schedule_end='2020-06-10'):
        """Relative change in traded volume after versus before a tweet.

        The forward window is ``[tweet_time, tweet_time + offset]`` and the backward window
        is ``[tweet_time - offset, tweet_time]``; both include the tweet minute. The value
        is ``forward_sum / backward_sum - 1`` (NaN when the backward sum is 0).

        Returns float NaN when the tweet's date is not in the exchange schedule or either
        window leaves that day's session (times are compared as UTC); otherwise a one-row
        DataFrame with column ``{amount}_{interval}_forward_vol_sum``. Note the column name
        says "sum" but the value is the ratio described above.

        Raises ``ValueError`` for an unknown ``interval`` and ``ImportError`` when
        ``pandas_market_calendars`` is not installed.
        """
        offset = self._interval_delta(interval, amount)
        # Drop sub-minute precision to match the minute-level financial data.
        tweet_time = tweet_time.replace(second=0, microsecond=0)

        schedule = self._market_schedule(stock_exchg_name, schedule_start, schedule_end)
        try:
            market_hours = schedule.loc[tweet_time.replace(hour=0, minute=0)]
        except (KeyError, TypeError, ValueError):
            # Date is not a session day (or cannot be looked up): no feature.
            return self.np.nan
        if len(market_hours) < 2:
            return self.np.nan

        time_forward = tweet_time + offset
        time_backward = tweet_time - offset
        utc = self.datetime.timezone.utc
        inside_session = (time_forward.replace(tzinfo=utc) <= market_hours['market_close']
                          and time_backward.replace(tzinfo=utc) >= market_hours['market_open'])
        if not inside_session:
            return self.np.nan

        times = financial_data[time_col_name]
        volumes = financial_data[volume_col_name]
        # skipna=False keeps the NaN propagation of the plain Python sum used previously.
        forward_volume = volumes[(times >= tweet_time) & (times <= time_forward)].sum(skipna=False)
        backward_volume = volumes[(times >= time_backward) & (times <= tweet_time)].sum(skipna=False)
        if backward_volume != 0:
            vol_change = forward_volume / backward_volume - 1
        else:
            vol_change = self.np.nan
        return self.pd.DataFrame({f'{amount}_{interval}_forward_vol_sum': vol_change}, index=[0])

    def create_volumesum_columns(self,twitter_data,tweet_time_col_name,financial_data,fin_time_col_name,volume_col_name,
                       interval_amount_dict,stock_exchg_name='NYSE',schedule_start='2009-05-04',
                       schedule_end='2020-06-10'):
        """Apply :meth:`calculate_sum_volume` to every tweet for every requested interval.

        ``interval_amount_dict`` may be an iterable of ``(interval, amount)`` pairs or a
        dict mapping interval to amount. Returns a DataFrame indexed by the labels of the
        tweets for which at least one value could be computed.
        """
        def compute(tweet_time, interval, amount):
            return self.calculate_sum_volume(tweet_time, financial_data, fin_time_col_name, volume_col_name,
                                             interval, amount, stock_exchg_name, schedule_start, schedule_end)

        return self._collect_feature_columns(twitter_data, tweet_time_col_name, interval_amount_dict, compute)

    def fill_missing_fin_data(self,ticker_col_name,stock_exchg_name,start_date,end_date,frequency,fin_time_colname,
                          financial_data,volume_colname,price_colname):
        """Reindex one ticker's data onto every session timestamp of the exchange calendar.

        Timestamps missing from ``financial_data`` are added with the ticker name taken
        from the first input row, volume 0, and a linearly interpolated price. A
        ``SYM_SUFFIX`` column, when present, has its gaps filled with 0. Returns a new
        DataFrame; the input is not modified.
        """
        mcal = self._require_mcal()
        ticker_name = financial_data[ticker_col_name].iloc[0]
        schedule = self._market_schedule(stock_exchg_name, start_date, end_date)
        timestamps = mcal.date_range(schedule, frequency).tz_convert(None)
        grid = self.pd.DataFrame(timestamps, columns=[fin_time_colname])

        filled = grid.merge(financial_data, on=fin_time_colname, how='left')
        filled[ticker_col_name] = ticker_name
        # Assign results back instead of chained ``inplace=True`` calls, which do not
        # update the frame when pandas Copy-on-Write is active.
        if 'SYM_SUFFIX' in filled.columns:
            filled['SYM_SUFFIX'] = filled['SYM_SUFFIX'].fillna(0)
        filled[volume_colname] = filled[volume_colname].fillna(0)
        filled[price_colname] = filled[price_colname].interpolate()
        return filled

    # ------------------------------------------------------------------- text

    def token_matrix(self,text_column,financial_topic_words):
        """Build a 0/1 indicator frame: one column per topic word, one row per text.

        Texts are lower-cased and split on whitespace; a topic word matches when its
        lower-cased form equals one of the tokens. Non-string entries (e.g. NaN) match
        nothing. Columns keep the topic words exactly as given.
        """
        token_sets = text_column.apply(
            lambda text: set(text.lower().split()) if isinstance(text, str) else set())

        matrix = self.pd.DataFrame()
        for topic_word in financial_topic_words:
            needle = topic_word.lower() if isinstance(topic_word, str) else topic_word
            matrix[topic_word] = token_sets.apply(lambda tokens, needle=needle: int(needle in tokens))
        return matrix

    def diff_from_meanlog(self,df,date_colname,numeric_colname):
        """Log of each value minus the log of that calendar month's mean value.

        Zeros are treated as missing before taking logs. The monthly mean is computed on
        the integer-cast values. The result is aligned with ``df``'s index.

        Side effect (kept for backward compatibility): a ``'month'`` column holding the
        ``YYYY-MM`` prefix of the date is written onto ``df``.
        """
        np_ = self.np
        month_key = df[date_colname].map(str).str[:7]
        df['month'] = month_key

        values = df[numeric_colname]
        monthly_mean = values.astype(int).groupby(month_key.to_numpy()).transform('mean')
        # Rows without a date belong to no month.
        monthly_mean = monthly_mean.where(df[date_colname].notna().to_numpy())

        log_monthly_mean = np_.log(monthly_mean.replace(0, np_.nan))
        return np_.log(values.replace(0, np_.nan)) - log_monthly_mean

    def create_random_zero_observations(self, twitter_data, time_column, number_of_random_obs, seed=None):
        """Create rows with random timestamps and 0 in every other column.

        Timestamps are drawn uniformly, at whole-second resolution, from
        ``[min(time_column), max(time_column))``. The result has the same columns as
        ``twitter_data`` and a fresh 0..n-1 index.

        ``seed`` makes the draw reproducible; when None the module-level ``random``
        generator is used, so seeding ``random`` beforehand also works.
        """
        rng = self.random if seed is None else self.random.Random(seed)
        min_time = twitter_data[time_column].min()
        delta = twitter_data[time_column].max() - min_time
        # randrange(0) raises, so a zero-length span collapses onto min_time.
        span_seconds = max((delta.days * 24 * 60 * 60) + delta.seconds, 1)

        created_at = [min_time + self.datetime.timedelta(seconds=rng.randrange(span_seconds))
                      for _ in range(number_of_random_obs)]
        new_times_df = self.pd.DataFrame(0, index=range(number_of_random_obs),
                                         columns=list(twitter_data.columns))
        new_times_df[time_column] = created_at
        return new_times_df

In [3]:
# Minute-level SPY/DIA quotes and the pre-processed tweets; both timestamp columns are parsed on load.
financial_data = pd.read_csv(
    '04052009-10062020-SPYDIA-minutedata.csv',
    parse_dates=['DATETIME'],
)
twitter_data = pd.read_csv(
    'vp_tweets_reducted_after_nlp.csv',
    index_col=0,
    parse_dates=['created_at'],
)

In [None]:
# One frame per ticker symbol found in the combined minute data.
sp500_data, dow_data = (
    financial_data.loc[financial_data['SYM_ROOT'].eq(ticker)]
    for ticker in ('SPY', 'DIA')
)

In [4]:
# Load the VIX series and give it the same auxiliary columns the equity frames carry.
vix_data = (
    pd.read_csv('all_vix.csv', parse_dates=['DATETIME'])
    .drop(columns=['OPEN', 'HIGH', 'LOW'])
    .assign(TICKER='VIX', VOLUME=0, SYM_SUFFIX=0)
)

In [5]:
# Instantiate the helper class; its constructor prints the version banner shown below.
feat_eng = feature_engine()

feature_engine V.0.1 
Imported pandas,datetime,pandas_market_calendars,numpy packages


In [6]:
# Replace the raw engagement counts with their log-difference from the monthly mean.
# NOTE: this overwrites the columns in place, so re-running the cell transforms
# already-transformed values; diff_from_meanlog also adds a 'month' column to twitter_data.
twitter_data['retweet_count']=feat_eng.diff_from_meanlog(twitter_data,'created_at','retweet_count')
twitter_data['favorite_count']=feat_eng.diff_from_meanlog(twitter_data,'created_at','favorite_count')

In [None]:
# Pad the tweets with an equal number of all-zero rows at random timestamps.
# DataFrame.append was removed in pandas 2.0, so the rows are combined with pd.concat.
# The generator is seeded so a fresh run of the notebook reproduces the same rows.
RANDOM_OBS_SEED = 42
feat_eng.random.seed(RANDOM_OBS_SEED)
random_zero_obs = feat_eng.create_random_zero_observations(twitter_data, 'created_at', len(twitter_data))
twitter_data = pd.concat([twitter_data, random_zero_obs], ignore_index=True)

In [None]:
# Put the SPY series on the full NYSE minute grid.
sp500_data = feat_eng.fill_missing_fin_data(
    ticker_col_name='SYM_ROOT',
    stock_exchg_name='NYSE',
    start_date='2009-05-04',
    end_date='2020-06-10',
    frequency='1min',
    fin_time_colname='DATETIME',
    financial_data=sp500_data,
    volume_colname='SIZE',
    price_colname='PRICE',
)

In [None]:
# Put the DIA series on the full NYSE minute grid.
dow_data = feat_eng.fill_missing_fin_data(
    ticker_col_name='SYM_ROOT',
    stock_exchg_name='NYSE',
    start_date='2009-05-04',
    end_date='2020-06-10',
    frequency='1min',
    fin_time_colname='DATETIME',
    financial_data=dow_data,
    volume_colname='SIZE',
    price_colname='PRICE',
)

In [None]:
# Put the VIX series on the full NYSE minute grid.
vix_data = feat_eng.fill_missing_fin_data(
    ticker_col_name='TICKER',
    stock_exchg_name='NYSE',
    start_date='2009-05-04',
    end_date='2020-06-10',
    frequency='1min',
    fin_time_colname='DATETIME',
    financial_data=vix_data,
    volume_colname='VOLUME',
    price_colname='CLOSE',
)

## Calculate changes

In [None]:
# Volume-change features around each tweet, computed from the DIA minute data.
tweets_finance_volume = feat_eng.create_volumesum_columns(
    twitter_data=twitter_data[['created_at']],
    tweet_time_col_name='created_at',
    financial_data=dow_data[['DATETIME', 'SIZE']],
    fin_time_col_name='DATETIME',
    volume_col_name='SIZE',
    interval_amount_dict=[('minutes', minutes) for minutes in (1, 5, 10, 15, 30)] + [('hours', 1)],
)

In [None]:
# Forward simple-return features of the VIX close after each tweet.
tweets_finance = feat_eng.create_pricechg_columns(
    twitter_data=twitter_data[['created_at']],
    tweet_time_col_name='created_at',
    financial_data=vix_data,
    fin_time_col_name='DATETIME',
    price_col_name='CLOSE',
    interval_amount_dict=(
        [('minutes', minutes) for minutes in (1, 5, 10, 15, 30)]
        + [('hours', hours) for hours in (1, 3)]
    ),
    method='forward',
    logdiff=False,
)

In [None]:
# Index-aligned left join: tweets with no computed price-change row get NaN in the new columns.
twitter_data_with_finance=twitter_data.join(tweets_finance)

In [None]:
# Add the volume features the same way (left join on the tweet index).
twitter_data_with_finance=twitter_data_with_finance.join(tweets_finance_volume)

In [None]:
# twitter_data_with_finance=twitter_data_with_finance.join(pd.get_dummies(twitter_data_with_finance['source']))

In [None]:
# Rebind the large input frames to empty ones - presumably to release memory before the final write.
# After this cell the earlier cells must be re-run before these names can be used again.
twitter_data=pd.DataFrame()
financial_data=pd.DataFrame()
dow_data=pd.DataFrame()

In [None]:
# twitter_data_with_finance=twitter_data_with_finance.drop(['source'],1)

In [None]:
# Persist the joined tweet + market features for the downstream analysis notebook.
twitter_data_with_finance.to_csv('vp_twitter_data_with_vix_for_analysis.csv')