In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import re
import nltk
import datetime
import os
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
# Fetch the NLTK resources needed at runtime. 'vader_lexicon' backs the
# SentimentIntensityAnalyzer (imported above as SIA) and 'stopwords' backs
# stopwords.words() below. 'punkt' is not referenced directly by any cell
# shown in this section.
nltk.download('punkt')
nltk.download('vader_lexicon')
nltk.download('stopwords')
# English stemmer and stop-word set; neither is used by the cells visible
# in this section.
stemmer = SnowballStemmer("english")
stops = set(stopwords.words("english"))

# Alternative (disabled): upload files through the Colab file picker.
# from google.colab import files
# uploaded = files.upload()

# Mount Google Drive so the project folder is reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).




In [2]:
# Project root on the mounted Drive. After the chdir, the relative paths
# used by later cells ('data', 'data_cleaned/') resolve against this folder.
basedir = '/content/drive/My Drive/Colab Notebooks/Numeraxial/'
os.chdir(basedir)
# Disabled: loading of the LM dictionary CSV, indexed by its 'Word' column.
# filename = 'LM_dict.csv' # LM dictionary
# # landm = pd.read_csv(os.path.join(basedir, filename),header=0, index_col='Word')
# landm = pd.read_csv(filename,header=0, index_col='Word')

# Disabled: loading of affectivespace.csv (no header row; first column is
# the word, remaining columns relabelled 0..99).
# filename = 'affectivespace.csv'
# aff = pd.read_csv(filename,header=None, index_col=0)
# aff.columns = pd.Index(np.arange(0,100))
# aff.index.name = 'Word'

In [3]:
# Show what is in the working directory (the project root after the chdir).
path = os.getcwd()
dir_list = os.listdir(path)

print("Files and directories in '", path, "' :")
print(dir_list)

# Split the contents of <cwd>/data into news pickles and price CSVs.
# The match is on the real file extension: the earlier regexes r".pickle" and
# r".csv" used an unescaped, unanchored dot, so any name that merely
# contained "pickle"/"csv" after some character (e.g. "x_csv_backup.txt")
# was picked up as well.
list_all = os.listdir(os.path.join(path, 'data'))
idx_pickle = [num for num, lst in enumerate(list_all) if lst.endswith('.pickle')]
idx_csv = [num for num, lst in enumerate(list_all) if lst.endswith('.csv')]

# Sorted file names; later cells index both lists by the same position.
pickle_set = sorted(list_all[i] for i in idx_pickle)
csv_set = sorted(list_all[i] for i in idx_csv)
print(pickle_set)
print(csv_set)
# Number of news pickles found; drives the processing loop further down.
n = len(pickle_set)

Files and directories in ' /content/drive/My Drive/Colab Notebooks/Numeraxial ' :
['affectivespace.csv', 'data', 'LM_dict.csv', 'data_cleaned', 'Practice.ipynb', 'Model.ipynb', 'Data_gen.ipynb', 'Data_generator.ipynb']
['news_AAPL_dict.pickle', 'news_AXP_dict.pickle', 'news_BA_dict.pickle', 'news_CAT_dict.pickle', 'news_CSCO_dict.pickle', 'news_CVX_dict.pickle', 'news_DIS_dict.pickle', 'news_GE_dict.pickle', 'news_GS_dict.pickle', 'news_HD_dict.pickle', 'news_IBM_dict.pickle', 'news_INTC_dict.pickle', 'news_JNJ_dict.pickle', 'news_JPM_dict.pickle', 'news_KO_dict.pickle', 'news_MCD_dict.pickle', 'news_MMM_dict.pickle', 'news_MRK_dict.pickle', 'news_MSFT_dict.pickle', 'news_NKE_dict.pickle', 'news_PFE_dict.pickle', 'news_PG_dict.pickle', 'news_TRV_dict.pickle', 'news_UNH_dict.pickle', 'news_UTX_dict.pickle', 'news_VZ_dict.pickle', 'news_V_dict.pickle', 'news_WMT_dict.pickle', 'news_XOM_dict.pickle']
['AAPL_20170102_20200529.csv', 'AXP_20170102_20200529.csv', 'BA_20170102_20200529.csv', '

In [4]:
def get_pickle_df(filename_pkl):
  """Load one pickled news dictionary from ``data/`` into a DataFrame.

  Parameters
  ----------
  filename_pkl : str
      File name inside the ``data`` directory (relative to the current
      working directory). The pickle must hold a dict whose values are
      dicts with the keys ``'TICKER'``, ``'PUBLICATION_DATE'`` and
      ``'SUMMARY'``; the outer keys are ignored.

  Returns
  -------
  pandas.DataFrame
      Columns ``Ticker``, ``Date`` and ``Summary``, one row per dict value,
      with ``Date`` as timezone-aware timestamps in US/Eastern.
  """
  # SECURITY: pickle.load can execute arbitrary code while deserializing;
  # only point this at files from a trusted source.
  with open(os.path.join('data', filename_pkl),'rb') as handle:
    news = pickle.load(handle)

  records = list(news.values())
  df_pickle = pd.DataFrame({
    'Ticker': [rec['TICKER'] for rec in records],
    'Date': [rec['PUBLICATION_DATE'] for rec in records],
    'Summary': [rec['SUMMARY'] for rec in records],
  })
  # utc=True normalizes every parsed timestamp to UTC first, so inputs that
  # carry different UTC offsets (e.g. -0500 and -0400 across a DST change)
  # still produce a single tz-aware datetime column that tz_convert accepts.
  df_pickle['Date'] = pd.to_datetime(
    df_pickle['Date'], format='%Y-%m-%d %H:%M:%S%z', utc=True
  ).dt.tz_convert('US/Eastern')
  return df_pickle

In [5]:
def sent_analysis(df, analyzer=None):
  """Score every entry of ``df['Summary']`` and attach the polarity columns.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain a ``Summary`` column. Each value is passed through
      ``str()`` before scoring. The frame is modified in place.
  analyzer : object, optional
      Any object exposing ``polarity_scores(text)`` that returns a mapping
      with the keys ``'neg'``, ``'neu'``, ``'pos'`` and ``'compound'``.
      Defaults to a new ``SIA()`` instance.

  Returns
  -------
  pandas.DataFrame
      The same ``df`` object with the columns ``Negative``, ``Neutral``,
      ``Positive`` and ``Compound`` added (or overwritten).
  """
  # Build the analyzer once: it was previously constructed for every single
  # headline, which is loop-invariant work.
  if analyzer is None:
    analyzer = SIA()

  scores = [analyzer.polarity_scores(str(headline)) for headline in df['Summary']]

  df['Negative'] = [score['neg'] for score in scores]
  df['Neutral'] = [score['neu'] for score in scores]
  df['Positive'] = [score['pos'] for score in scores]
  df['Compound'] = [score['compound'] for score in scores]

  return df

In [6]:
def set_markettime(df):
  """Roll closed-market sentiment rows into the next open-market row.

  A row counts as 'Open' when the time of day of its ``Date`` lies between
  09:30:00 and 16:00:00 (both inclusive); every other row is 'Close'. That
  label is written to a ``MktT`` column on ``df`` itself. Rows are then
  walked by integer label 0..len(df)-1: scores are collected until an 'Open'
  row is reached, at which point the collected rows (including the 'Open'
  row) are summed and emitted under that row's timestamp. 'Close' rows
  after the last 'Open' row are not emitted.

  Parameters
  ----------
  df : pandas.DataFrame
      Needs the columns ``Date``, ``Negative``, ``Neutral``, ``Positive``
      and ``Compound``, and integer row labels starting at 0.

  Returns
  -------
  pandas.DataFrame
      Indexed by ``Date`` (timestamps of the 'Open' rows) with the four
      summed score columns.
  """
  score_cols = ['Negative','Neutral','Positive','Compound']

  clock = df['Date'].dt.time
  in_session = (clock >= datetime.time(9, 30, 0)) & (clock <= datetime.time(16, 0, 0))
  df['MktT'] = 'Close'
  df.loc[in_session, 'MktT'] = 'Open'

  open_stamps = []
  bucket_totals = []
  pending = []
  for row in range(df.shape[0]):
    # Every row contributes its scores to the bucket currently being filled.
    pending.append(np.array(df.loc[row, score_cols]))
    if df.loc[row, 'MktT'] != 'Close':
      # An open-market row closes the bucket and starts a new one.
      open_stamps.append(df.loc[row, 'Date'])
      bucket_totals.append(np.sum(pending, axis = 0))
      pending = []

  stamps_frame = pd.DataFrame({'Date': open_stamps})
  totals_frame = pd.DataFrame(bucket_totals)
  new_df = pd.concat([stamps_frame, totals_frame], axis=1)
  new_df = new_df.rename(columns={0:'Negative', 1:'Neutral', 2:'Positive', 3:'Compound'})
  new_df = new_df.set_index('Date')
  return new_df

In [7]:
def get_full_df(filename_csv, df_mkt):
  """Join one price CSV from ``data/`` with a sentiment frame.

  Parameters
  ----------
  filename_csv : str
      File name inside the ``data`` directory (relative to the current
      working directory). Only the first seven columns are read. A ``Date``
      column formatted like ``02-Jan-2020 09:30`` is required; it is
      localized to US/Eastern and used as the index.
  df_mkt : pandas.DataFrame
      Frame whose index holds timestamps comparable with the localized
      price index (e.g. the result of ``set_markettime``).

  Returns
  -------
  pandas.DataFrame
      Price and ``df_mkt`` columns side by side, restricted to the span from
      the first to the last price timestamp that also occurs in
      ``df_mkt.index``, with gaps forward-filled.

  Raises
  ------
  IndexError
      If no price timestamp occurs in ``df_mkt.index``.
  """
  price_df = pd.read_csv(os.path.join('data', filename_csv), usecols=range(7))
  price_df['Date'] = pd.to_datetime(price_df['Date'], format='%d-%b-%Y %H:%M').dt.tz_localize('US/Eastern')
  price_df = price_df.set_index('Date')

  # Price timestamps that also carry sentiment, in price-file order.
  shared = price_df.index[price_df.index.isin(df_mkt.index)]
  if len(shared) == 0:
    raise IndexError(
      'no timestamp of ' + str(filename_csv) + ' is present in the sentiment index'
    )

  merged = pd.concat([price_df, df_mkt], axis=1)
  # .ffill() replaces fillna(method='ffill'), whose `method` argument is
  # deprecated and no longer accepted by current pandas.
  return merged.loc[shared[0]:shared[-1]].ffill()

In [8]:
# Build and save one merged price + sentiment frame per news pickle.
# NOTE(review): pickle_set[numb] and csv_set[numb] are paired purely by
# position in the two sorted lists -- confirm both lists always contain the
# same tickers in the same order.
for numb in range(n):
  # Placeholder; replaced by the get_full_df result below.
  full_df = pd.DataFrame()
  pkl = pickle_set[numb]
  csv = csv_set[numb]
  df_pickle = get_pickle_df(pkl)
  # Ticker symbol taken from the first news row; used for the output file name.
  tk = df_pickle['Ticker'][0]
  df_pickle = sent_analysis(df_pickle)
  # get_pickle_df already performs this same conversion to US/Eastern.
  df_pickle['Date'] = pd.to_datetime(df_pickle['Date'], format = '%Y-%m-%d %H:%M:%S%z').dt.tz_convert('US/Eastern')
  df_pickle.drop(['Ticker','Summary'], axis=1, inplace=True)
  # Sum the scores into 15-minute bins that are closed and labelled on their
  # right edge.
  # NOTE(review): sum() over an empty bin normally yields 0 rather than NaN,
  # so the dropna(how='all') may not remove empty bins -- confirm intent.
  df_pickle = df_pickle.set_index('Date').resample('15min', label='right', closed='right').sum().dropna(axis = 'index', how = 'all').reset_index()
  # df_pickle['Ticker'] = b[0]['TICKER']
  # Reorder columns: move the last column to the front. set_markettime
  # selects its columns by name, so the order does not affect it.
  cols = df_pickle.columns.to_list()[-1:] + df_pickle.columns.to_list()[:-1]
  df_pickle = df_pickle[cols]
  df_pickle_mkt = set_markettime(df_pickle)
  full_df = get_full_df(csv, df_pickle_mkt)

  print('full_df is complete!')
  # Written relative to the working directory; 'data_cleaned/' must already exist.
  full_df.to_pickle('data_cleaned/' + tk + '.pkl')
  print('Uploaded '+ tk + ' data!')
  print('==================================================')

full_df is complete!
Uploaded AAPL data!
full_df is complete!
Uploaded AXP data!
full_df is complete!
Uploaded BA data!
full_df is complete!
Uploaded CAT data!
full_df is complete!
Uploaded CSCO data!
full_df is complete!
Uploaded CVX data!
full_df is complete!
Uploaded DIS data!
full_df is complete!
Uploaded GE data!
full_df is complete!
Uploaded GS data!
full_df is complete!
Uploaded HD data!
full_df is complete!
Uploaded IBM data!
full_df is complete!
Uploaded INTC data!
full_df is complete!
Uploaded JNJ data!
full_df is complete!
Uploaded JPM data!
full_df is complete!
Uploaded KO data!
full_df is complete!
Uploaded MCD data!
full_df is complete!
Uploaded MMM data!
full_df is complete!
Uploaded MRK data!
full_df is complete!
Uploaded MSFT data!
full_df is complete!
Uploaded NKE data!
full_df is complete!
Uploaded PFE data!
full_df is complete!
Uploaded PG data!
full_df is complete!
Uploaded TRV data!
full_df is complete!
Uploaded UNH data!
full_df is complete!
Uploaded UTX data!
fu