In [2]:
import pandas as pd
import datetime
import numpy as np

In [33]:
# Load the article averages and use the parsed timestamps as the index.
articles_df = pd.read_csv('apple_article_avgs.csv', index_col=0)
articles_df.index = pd.to_datetime(articles_df.index)

# Keep day-of-month 24 and later (the month itself is not checked) ...
articles_df_ = articles_df.loc[articles_df.index.day >= 24]

# ... then stop before day 30, keep 9:30-15:30 rows only (both endpoints
# included) and put the rows in chronological order.
articles_df = (
    articles_df_.loc[articles_df_.index.day < 30]
    .between_time('9:30', "15:30")
    .sort_index()
)

# Load the tweets, parse their timestamps and snap each one to the nearest
# 30-minute mark so that tweets can be grouped into half-hour buckets.
tweets_df = pd.read_csv('apple_tweets.csv')
tweets_df['time_stamp'] = pd.to_datetime(tweets_df['time_stamp']).dt.round('30min')

# Time-indexed copy used by the per-day filters further down.
tweets_df_ = tweets_df.set_index(['time_stamp'])

# Load the stock bars and use the parsed timestamps as the index.
running_stocks_thirty = pd.read_csv('apple_stock_8_30.csv', index_col=0)
running_stocks_thirty.index = pd.to_datetime(running_stocks_thirty.index)

# Keep day-of-month 24 and later, order chronologically, then derive:
#   increase_decrease - close minus open for the bar
#   up_down           - 1 when the bar closed at or above its open, else 0
#   movement(%)       - the same move as a percentage, rounded to 2 decimals
# NOTE(review): the percentage is divided by the close, not the open --
# confirm that this is the intended base.
running_stocks_thirty = (
    running_stocks_thirty[running_stocks_thirty.index.day >= 24]
    .sort_index()
    .assign(**{
        'increase_decrease': lambda bars: bars['4. close'] - bars['1. open'],
        'up_down': lambda bars: np.where(bars['increase_decrease'] >= 0, 1, 0),
        'movement(%)': lambda bars: np.round(
            ((bars['4. close'] - bars['1. open']) / bars['4. close']) * 100, 2
        ),
    })
)

# Load the technical indicators and use the parsed timestamps as the index.
tech_indic = pd.read_csv('apple_tech_indicators.csv', index_col=0)
tech_indic.index = pd.to_datetime(tech_indic.index)

# Restrict to day-of-month 24..29 in two steps (the month is not checked).
tech_indic_ = tech_indic.loc[lambda frame: frame.index.day >= 24]
tech_indic_df = tech_indic_.loc[lambda frame: frame.index.day < 30]

In [34]:
# Discard every tweet whose day-of-month is earlier than the 24th.
tweets_df = tweets_df_.loc[tweets_df_.index.day >= 24]

# Remove the id, raw text and leftover CSV index columns.
tweets_master_df = tweets_df.drop(columns=['t_id', 'text', 'Unnamed: 0'])

In [35]:
# One frame per day-of-month, 24 through 29.
(
    tweets_df_24,
    tweets_df_25,
    tweets_df_26,
    tweets_df_27,
    tweets_df_28,
    tweets_df_29,
) = (
    tweets_master_df[tweets_master_df.index.day == day_of_month]
    for day_of_month in range(24, 30)
)

In [36]:
# separate during/after market hrs
def during_after(df):
    """Split a time-indexed frame into market-hours and off-hours rows.

    Parameters
    ----------
    df : DataFrame with a DatetimeIndex.

    Returns
    -------
    (df_during, df_after)
        ``df_during`` holds rows timed from 9:30 to 15:30; ``df_after`` holds
        rows timed from 15:30 round midnight to 9:30. ``between_time`` keeps
        both endpoints, so rows stamped exactly 9:30 or 15:30 appear in BOTH
        frames.
    """
    session_start, session_end = '9:30', "15:30"
    return (
        df.between_time(session_start, session_end),
        df.between_time(session_end, session_start),
    )

In [37]:
# Split each day's tweets into (market hours, off hours).
(
    (open_hours_tweets_24, after_hour_tweets_24),
    (open_hours_tweets_25, after_hour_tweets_25),
    (open_hours_tweets_26, after_hour_tweets_26),
    (open_hours_tweets_27, after_hour_tweets_27),
    (open_hours_tweets_28, after_hour_tweets_28),
    (open_hours_tweets_29, after_hour_tweets_29),
) = map(
    during_after,
    (
        tweets_df_24,
        tweets_df_25,
        tweets_df_26,
        tweets_df_27,
        tweets_df_28,
        tweets_df_29,
    ),
)

In [38]:
# aggregate the per-tweet scores into lagged bucket-level features
def aggregate(df):
    """Build lagged sentiment features per distinct index value of ``df``.

    ``df`` must provide the columns 'vader_sentiment' (averaged) and
    'sentiment' (counted per distinct value). Rows that share an index value
    form one bucket -- presumably the 30-minute buckets produced when the
    tweet timestamps are rounded; verify against the caller.

    Returns three blocks joined side by side: the one-row-lagged bucket
    values, a 2-row rolling mean of them and a 4-row rolling mean of them.
    Missing values in each block are replaced by that block's column means.

    NOTE(review): every ``shift`` here moves by row position, not by clock
    time, so a bucket with no tweets (hence no row) changes what "previous"
    means. The rolling windows are also centred; confirm that the combination
    of shift and centring never reads a later bucket than intended.
    """
    # Mean score per bucket, pushed down one row so each row carries the
    # previous bucket's value (the first row becomes NaN).
    sent_avg = df.groupby(df.index).agg({'vader_sentiment': 'mean'}).rename(columns = {'vader_sentiment': 'sentiment_avg_30'}).shift()
    # Count of each 'sentiment' value per bucket, one column per value, with
    # the same one-row lag. Presumably the rename relabels the outer column
    # level left by unstack() -- TODO confirm.
    sent_count = df.groupby(df.index).agg({'sentiment': 'value_counts'}).unstack().rename(columns = {'sentiment': 'sentiment_30'}).shift()
    df_ = pd.concat([sent_avg, sent_count], axis = 1)
    # Fill gaps (including the row emptied by the lag) with column means.
    df_ini = df_.fillna(df_.mean())
    # Two-row window: one further row of lag, then a centred mean that needs
    # both rows to be present.
    df_hour_i = df_ini.shift().rolling(window=2, center = True, min_periods=2).mean().rename(columns = {'sentiment_avg_30': 'sentiment_avg_60'})
    df_hour = df_hour_i.fillna(df_hour_i.mean())
    # Four-row window: two further rows of lag, then a centred mean that
    # needs all four rows to be present.
    df_two_hours_i = df_ini.shift(2).rolling(window=4, center = True, min_periods=4).mean().rename(columns = {'sentiment_avg_30': 'sentiment_avg_120'})
    df_two_hours = df_two_hours_i.fillna(df_two_hours_i.mean())
    df_all = pd.concat([df_ini, df_hour, df_two_hours], axis = 1)
    return df_all

In [39]:
# Replace each day's market-hours tweets with their aggregated features.
# NOTE: this rebinds the same names, so the cell must not be run twice
# without re-running the split that produces the raw per-day frames.
(
    open_hours_tweets_24,
    open_hours_tweets_25,
    open_hours_tweets_26,
    open_hours_tweets_27,
    open_hours_tweets_28,
    open_hours_tweets_29,
) = (
    aggregate(day_tweets)
    for day_tweets in (
        open_hours_tweets_24,
        open_hours_tweets_25,
        open_hours_tweets_26,
        open_hours_tweets_27,
        open_hours_tweets_28,
        open_hours_tweets_29,
    )
)

In [40]:
# separate night-before and morning-day-of market hrs
def _summarise_by_day(window, column_names):
    """Return one row per day-of-month: mean score plus per-label tweet counts."""
    day_of_month = window.index.day
    # Average only the numeric score column. Calling mean() on the whole
    # frame also hits the text 'sentiment' column, which raises TypeError on
    # pandas >= 2.0 (older versions silently dropped it).
    score_avg = window[['vader_sentiment']].groupby(day_of_month).mean()
    # Tweets per sentiment label; unstack() yields one column per label,
    # ordered by label.
    label_counts = window.groupby(day_of_month)['sentiment'].value_counts().unstack()
    summary = pd.concat([score_avg, label_counts], axis=1)
    summary.columns = column_names
    return summary


def night_morning(df):
    """Summarise off-hours tweets into a night frame and a morning frame.

    Parameters
    ----------
    df : DataFrame with a DatetimeIndex and the columns 'vader_sentiment'
        (numeric score) and 'sentiment' (label).

    Returns
    -------
    (night_df, morning_df)
        Each is indexed by day-of-month and has four columns: the mean
        'vader_sentiment' followed by one count column per sentiment label.
        Night covers rows timed 15:30 through 0:00, morning covers 0:00
        through 9:30. Both endpoints are included, so a row stamped exactly
        0:00 is counted in both frames.

    Raises
    ------
    ValueError
        From the column assignment when a window does not contain exactly
        three distinct sentiment labels (so the four names do not fit).

    NOTE(review): the night window groups by calendar day, so a 0:00 row is
    attributed to the night of its own date -- confirm that is intended.
    """
    night_df = _summarise_by_day(
        df.between_time('15:30', "0:00"),
        ['vader_sentiment_night', 'sent_neg, night_before', 'sent_neu, night_before', 'sent_pos, night_before'],
    )
    morning_df = _summarise_by_day(
        df.between_time('0:00', "9:30"),
        ['vader_sentiment_morning', 'sent_neg, morning', 'sent_neu, morning', 'sent_pos, morning'],
    )
    # The original ended with two pd.to_datetime(<frame>.index) calls whose
    # results were discarded; they had no effect and have been removed.
    return night_df, morning_df

In [42]:
# Day 23 only feeds the "night before" features of day 24: take its tweets,
# remove the id/text/leftover-index columns and keep the off-hours rows.
tweets_df_23 = tweets_df_.loc[tweets_df_.index.day == 23]
tweets_23_df = tweets_df_23.drop(columns=['t_id', 'text', 'Unnamed: 0'])
tweets_23_night = tweets_23_df.between_time('15:30', "9:30")

# The night part is named for the following day; the morning part stays on 23.
(
    after_hour_tweets_24_night,
    after_hour_tweets_23_morning,
) = night_morning(tweets_23_night)

In [43]:
# For each day D: its night summary is named for day D+1 ("night before"),
# its morning summary stays with day D.
(
    (after_hour_tweets_25_night, after_hour_tweets_24_morning),
    (after_hour_tweets_26_night, after_hour_tweets_25_morning),
    (after_hour_tweets_27_night, after_hour_tweets_26_morning),
    (after_hour_tweets_28_night, after_hour_tweets_27_morning),
    (after_hour_tweets_29_night, after_hour_tweets_28_morning),
    (after_hour_tweets_30_night, after_hour_tweets_29_morning),
) = map(
    night_morning,
    (
        after_hour_tweets_24,
        after_hour_tweets_25,
        after_hour_tweets_26,
        after_hour_tweets_27,
        after_hour_tweets_28,
        after_hour_tweets_29,
    ),
)

In [44]:
# Relabel each night summary with the day it is the "night before" of.
# Each frame must hold exactly one row for the assignment to succeed.
for target_day, night_frame in zip(
    range(24, 30),
    (
        after_hour_tweets_24_night,
        after_hour_tweets_25_night,
        after_hour_tweets_26_night,
        after_hour_tweets_27_night,
        after_hour_tweets_28_night,
        after_hour_tweets_29_night,
    ),
):
    night_frame.index = [target_day]

In [45]:
# Put each day's night-before and morning summaries side by side.
(
    night_morning_24,
    night_morning_25,
    night_morning_26,
    night_morning_27,
    night_morning_28,
    night_morning_29,
) = (
    pd.concat([night_part, morning_part], axis=1)
    for night_part, morning_part in (
        (after_hour_tweets_24_night, after_hour_tweets_24_morning),
        (after_hour_tweets_25_night, after_hour_tweets_25_morning),
        (after_hour_tweets_26_night, after_hour_tweets_26_morning),
        (after_hour_tweets_27_night, after_hour_tweets_27_morning),
        (after_hour_tweets_28_night, after_hour_tweets_28_morning),
        (after_hour_tweets_29_night, after_hour_tweets_29_morning),
    )
)

In [46]:
# Column names of the day-24 night/morning summary, as a plain list.
list_of_columns = night_morning_24.columns.tolist()
def concat_main_night_morning(open_hour_df, night_morning_df):
    """Copy a day's night/morning summary onto every row of ``open_hour_df``.

    For each column of ``night_morning_df``, the value in its first row is
    written into a column of the same name on ``open_hour_df``, repeated on
    every row.

    Parameters
    ----------
    open_hour_df : DataFrame that receives the new columns; modified in place.
    night_morning_df : DataFrame whose first row supplies the values.

    Returns
    -------
    None

    Raises
    ------
    IndexError
        If ``night_morning_df`` has no rows.
    """
    # Iterate the columns of the frame actually passed in rather than a
    # module-level list captured from one particular day, so the function
    # does not depend on hidden notebook state.
    for column in night_morning_df.columns:
        # Assigning a scalar broadcasts it down the whole column.
        open_hour_df[column] = night_morning_df[column].values[0]

In [47]:
# Attach each day's night/morning summary to that day's market-hours frame
# (the frames are modified in place).
for open_hours_frame, night_morning_frame in (
    (open_hours_tweets_24, night_morning_24),
    (open_hours_tweets_25, night_morning_25),
    (open_hours_tweets_26, night_morning_26),
    (open_hours_tweets_27, night_morning_27),
    (open_hours_tweets_28, night_morning_28),
    (open_hours_tweets_29, night_morning_29),
):
    concat_main_night_morning(open_hours_frame, night_morning_frame)

In [48]:
# Stack the per-day feature frames into one table, in day order.
tweets_master = pd.concat(
    [
        open_hours_tweets_24,
        open_hours_tweets_25,
        open_hours_tweets_26,
        open_hours_tweets_27,
        open_hours_tweets_28,
        open_hours_tweets_29,
    ]
)

# Replace the column labels with 20 flat names: three groups of four window
# features, then four night-before and four morning features.
tweets_master.columns = [
    'sentiment_avg_30', 'coun_neg, 30', 'coun_neu, 30', 'coun_pos, 30',
    'sentiment_avg_60', 'coun_neg, 60', 'coun_neu, 60', 'coun_pos, 60',
    'sentiment_avg_120', 'coun_neg, 120', 'coun_neu, 120', 'coun_pos, 120',
    'sentiment_night_before_avg', 'count_neg, night_before',
    'count_neu, night_before', 'count_pos, night_before',
    'sentiment_morning_avg', 'count_neg, morning',
    'count_neu, morning', 'count_pos, morning',
]

# take out weekend (days 25 and 26 of the month)
tweets_master = tweets_master[~tweets_master.index.day.isin([25, 26])]

In [49]:
# Join stock bars, technical indicators, tweet features and article averages
# column-wise; rows are aligned on their index values.
tweets_stocks = pd.concat(
    [
        running_stocks_thirty,
        tech_indic_df,
        tweets_master,
        articles_df,
    ],
    axis=1,
)

In [51]:
# Write the combined table to disk.
tweets_stocks.to_csv(path_or_buf='apple_tweets_stock.csv')