In [1]:
import pandas as pd
import datetime

In [19]:
class feature_engine:
    
    pd = __import__('pandas')
    datetime = __import__('datetime')
    
    def __init__(self):
        print('feature_engine V.0.1 \nImported pandas,datetime packages')
        
    def calc_price_change(self,tweet_time,financial_data,time_col_name,price_col_name,interval,amount,method='both'):
        #remove seconds to match financial data format
        tweet_time=tweet_time.replace(second=0)
        #default baseline is set for five minutes before tweet
        tweet_time_5min_before=tweet_time - self.datetime.timedelta(0,0,0,0,5)
        
        tweet_price=financial_data[financial_data[time_col_name]==tweet_time_5min_before][price_col_name]
        if len(tweet_price)==0:
            return float('NaN')
        tweet_price.reset_index(inplace=True,drop=True)

        def calculation(financial_data,time_col_name,new_time,price_col_name,tweet_price,direction):
            
            other_price=financial_data[financial_data[time_col_name]==new_time][price_col_name]
            other_price.reset_index(inplace=True,drop=True)
            if len(other_price)==0:
                return float('NaN')
            elif direction=='forward':
                return ((other_price.iloc[0]/tweet_price.iloc[0])-1)
            elif direction=='backward':
                return ((tweet_price.iloc[0]/other_price.iloc[0])-1)

        if (method=='both') or (method=='forward'):  
            if interval=='minutes':
                time_forward=tweet_time + self.datetime.timedelta(0,0,0,0,amount)
            elif interval=='hours':
                time_forward=tweet_time + self.datetime.timedelta(0,0,0,0,0,amount)
            elif interval=='days':
                time_forward=tweet_time + self.datetime.timedelta(amount,0,0,0,0)
            pct_change_forward=calculation(financial_data,time_col_name,time_forward,price_col_name,tweet_price,'forward')

        if (method=='both') or (method=='backward'): 
            if interval=='minutes':
                time_backward=tweet_time - self.datetime.timedelta(0,0,0,0,amount)   
            elif interval=='hours':
                time_backward=tweet_time - self.datetime.timedelta(0,0,0,0,0,amount) 
            elif interval=='days':
                time_backward=tweet_time - self.datetime.timedelta(amount,0,0,0,0)
            pct_change_backward=calculation(financial_data,time_col_name,time_backward,price_col_name,tweet_price,'backward')
        
        if method == 'both':
            return self.pd.DataFrame({f'{amount}_{interval}_forward_pct_change':pct_change_forward,
                          f'{amount}_{interval}_backward_pct_change':pct_change_backward},index=[0])
        elif method=='backward':
            return self.pd.DataFrame({f'{amount}_{interval}_backward_pct_change':pct_change_backward},index=[0])
        elif method=='forward':
            return self.pd.DataFrame({f'{amount}_{interval}_forward_pct_change':pct_change_forward},index=[0])
        
    def create_pricechg_columns(self,twitter_data,tweet_time_col_name,financial_data,fin_time_col_name,price_col_name,
                       interval_amount_dict,method='both'):
        from tqdm import tqdm
        new_columns=self.pd.DataFrame()
        for i, row in tqdm(twitter_data.iterrows()):
            temp_df=self.pd.DataFrame()
            for interval, amount in interval_amount_dict:
                new_df=self.calc_price_change(twitter_data[tweet_time_col_name].loc[i],financial_data,fin_time_col_name,
                                                   price_col_name,interval,amount,method)
                if not isinstance(new_df,float):
                    temp_df=self.pd.concat([temp_df,new_df],axis=1)
            for item in temp_df.columns:
                new_columns.at[i,item] = temp_df.loc[0,item]
        return new_columns
    
    def mean_encoding_tocolumn(self,features_df,categorical_column,target_column):
        features_df=features_df[[categorical_column,target_column]]
        grouped=features_df.groupby([categorical_column]).mean()
        features_df=features_df.merge(grouped,on=categorical_column)
        return features_df[f'{target_column}_y']
    
    def mean_encoding_todict(self,features_df,categorical_column,target_column):
        features_df=features_df[[categorical_column,target_column]]
        return features_df.groupby([categorical_column]).mean()
    
    def calculate_sum_volume(self,tweet_time,financial_data,time_col_name,volume_col_name,interval,amount,method='both'):
        #remove seconds to match financial data format
        tweet_time=tweet_time.replace(second=0)
        #default baseline is set for five minutes before tweet
        tweet_time_5min_before=tweet_time - self.datetime.timedelta(0,0,0,0,5)
        
#         tweet_volume=financial_data[financial_data[time_col_name]==tweet_time][volume_col_name]
#         if len(tweet_volume)==0:
#             return float('NaN')
#         tweet_volume.reset_index(inplace=True,drop=True)
        
        def calculation(financial_data,time_col_name,new_time,volume_col_name,tweet_time,direction):
            if direction=='forward':
                other_volume_df=financial_data.loc[(financial_data[time_col_name]<=new_time) & 
                                            (financial_data[time_col_name]>=tweet_time)][volume_col_name]
            elif direction=='backward':
                other_volume_df=financial_data.loc[(financial_data[time_col_name]>=new_time) & 
                                            (financial_data[time_col_name]<=tweet_time)][volume_col_name]
            other_volume=sum(other_volume_df[volume_col_name])
#             other_volume.reset_index(inplace=True,drop=True)
            if len(other_volume)==0:
                return float('NaN')
            else:
                return other_volume

        if (method=='both') or (method=='forward'):  
            if interval=='minutes':
                time_forward=tweet_time + self.datetime.timedelta(0,0,0,0,amount)
            elif interval=='hours':
                time_forward=tweet_time + self.datetime.timedelta(0,0,0,0,0,amount)
            elif interval=='days':
                time_forward=tweet_time + self.datetime.timedelta(amount,0,0,0,0)
            vol_sum_forward=calculation(financial_data,time_col_name,time_forward,volume_col_name,tweet_time_5min_before
                                        ,'forward')

        if (method=='both') or (method=='backward'): 
            if interval=='minutes':
                time_backward=tweet_time - self.datetime.timedelta(0,0,0,0,amount)   
            elif interval=='hours':
                time_backward=tweet_time - self.datetime.timedelta(0,0,0,0,0,amount) 
            elif interval=='days':
                time_backward=tweet_time - self.datetime.timedelta(amount,0,0,0,0)
            vol_sum_backward=calculation(financial_data,time_col_name,time_forward,volume_col_name,tweet_time,'backward')
        
        if method == 'both':
            return self.pd.DataFrame({f'{amount}_{interval}_forward_vol_sum':vol_sum_forward,
                          f'{amount}_{interval}_backward_vol_sum':vol_sum_backward},index=[0])
        elif method=='backward':
            return self.pd.DataFrame({f'{amount}_{interval}_backward_vol_sum':vol_sum_backward},index=[0])
        elif method=='forward':
            return self.pd.DataFrame({f'{amount}_{interval}_forward_vol_sum':vol_sum_forward},index=[0])
        
    def create_volumesum_columns(self,twitter_data,tweet_time_col_name,financial_data,fin_time_col_name,volume_col_name,
                       interval_amount_dict,method='both'):
        from tqdm import tqdm
        new_columns=self.pd.DataFrame()
        for i, row in tqdm(twitter_data.iterrows()):
            temp_df=self.pd.DataFrame()
            for interval, amount in interval_amount_dict:
                new_df=self.calculate_sum_volume(twitter_data[tweet_time_col_name].iloc[i],financial_data,fin_time_col_name,
                                                   volume_col_name,interval,amount)
                if not isinstance(new_df,float):
                    temp_df=self.pd.concat([temp_df,new_df],axis=1)
            for item in temp_df.columns:
                new_columns.at[i,item] = temp_df.loc[0,item]
        return new_columns

In [10]:
financial_data=pd.read_csv('04052009-18122019-SPYDIA-minutedata.csv',parse_dates=['DATETIME'])
twitter_data=pd.read_csv('trump_tweets_reducted_after_nlp.csv',index_col=0,parse_dates=['created_at_utc'])

In [11]:
sp500_data=financial_data[financial_data['SYM_ROOT']=='SPY']
dow_data=financial_data[financial_data['SYM_ROOT']=='DIA']

In [20]:
feat_eng = feature_engine()

feature_engine V.0.1 
Imported pandas,datetime packages


In [24]:
tweets_finance=feat_eng.create_pricechg_columns(twitter_data.tail(10),'created_at_utc',sp500_data,'DATETIME','PRICE',
                       [('minutes',1),('minutes',5),('minutes',10),('minutes',15),('minutes',30),('hours',1),
                        ('hours',3),('hours',6),('days',1),('days',5),('days',10),('days',30)],method='forward')




0it [00:00, ?it/s]




IndexError: single positional indexer is out-of-bounds

In [9]:
twitter_data.join(tweets_finance)

Unnamed: 0,coordinates,created_at_utc,extended_entities,favorite_count,geo,hashtags,id,in_reply_to_screen_name,in_reply_to_status_id,in_reply_to_user_id,...,6_hours_forward_pct_change,6_hours_backward_pct_change,1_days_forward_pct_change,1_days_backward_pct_change,5_days_forward_pct_change,5_days_backward_pct_change,10_days_forward_pct_change,10_days_backward_pct_change,30_days_forward_pct_change,30_days_backward_pct_change
0,,2009-05-04 18:54:25,,850,,,1698308935,,,,...,,0.011775,0.000551,,,,-0.015419,,,
1,,2009-05-05 01:00:10,,282,,,1701461182,,,,...,,,,,,,,,,
2,,2009-05-08 13:38:08,,16,,,1737479987,,,,...,,-0.001346,,0.016548,-0.041461,,-0.021735,,,
3,,2009-05-08 20:40:15,,28,,,1741160716,,,,...,,,,,,,,,,
4,,2009-05-12 14:07:28,,1950,,,1773561338,,,,...,,-0.017951,-0.013442,-0.016215,,-0.007542,-0.005015,,0.061088,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25697,,2019-12-16 05:28:17,,41771,,,1206445922848313344,,,,...,0.004321,,,,,,,,,
25698,,2019-12-16 05:28:18,,26367,,,1206445925843054593,,,,...,0.004321,,,,,,,,,
25699,,2019-12-16 05:28:18,,27105,,,1206445927021662208,,,,...,0.004321,,,,,,,,,
25700,,2019-12-16 05:29:54,,3,,,1206446331675451392,,,,...,0.004306,,,,,0.015011,,,,


In [None]:
tweets_finance.to_csv('tweets_finance.csv',index=True)