In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Load the three raw headline exports and the StockerBot export from FData/,
# in the same order as the names on the left-hand side.
cnbc, guardian, reuters, stocker_bot = map(
    pd.read_csv,
    (
        'FData/cnbc_headlines.csv',
        'FData/guardian_headlines.csv',
        'FData/reuters_headlines.csv',
        'FData/stockerbot-export1.csv',
    ),
)

In [None]:
# Parse each source's 'Time' column to datetimes and truncate to midnight, so
# rows from different sources can later be matched on the calendar day.
# CNBC is parsed strictly: an unparseable value raises.
cnbc_times = pd.to_datetime(cnbc['Time'])
cnbc['Time'] = cnbc_times.dt.normalize()
# Guardian and Reuters use errors='coerce', which turns unparseable values into NaT.
# NOTE(review): the strict/coerce difference between sources may be unintentional — confirm.
for frame in (guardian, reuters):
    frame['Time'] = pd.to_datetime(frame['Time'], errors='coerce').dt.normalize()

In [None]:
# Preview the first five CNBC rows after the 'Time' conversion.
cnbc.head(n=5)

In [None]:
def combine_headlines_descriptions(df, source):
    """Merge each row's headline and description into a single 'Combined' text column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Time', 'Headlines' and 'Description'.
        It is not modified.
    source : object
        Value written to every row of the new 'Source' column
        (the callers pass the outlet name as a string).

    Returns
    -------
    pandas.DataFrame
        A new frame without rows whose 'Time' is missing, with 'Headlines' and
        'Description' replaced by 'Combined' ("<headline> <description>") and a
        'Source' column appended. A missing headline or description is skipped
        rather than joined, so the row keeps whatever text it does have.
    """
    # Explicit copy: the columns added below must land on an independent frame,
    # not on an object pandas may treat as derived from the caller's data.
    result = df.dropna(subset=['Time']).copy()

    text_columns = result[['Headlines', 'Description']]
    # Join only the pieces that are present. A NaN piece is a float, and
    # ' '.join would raise TypeError on it; building the list row by row also
    # works for a frame with zero rows.
    result['Combined'] = [
        ' '.join(str(piece) for piece in row if pd.notna(piece))
        for row in text_columns.itertuples(index=False, name=None)
    ]

    result = result.drop(columns=['Headlines', 'Description'])
    result['Source'] = [source] * len(result)
    return result
    
# Apply the headline/description merge to the CNBC and Reuters frames,
# tagging every row with the name of its source.
cnbc = combine_headlines_descriptions(cnbc, 'CNBC')
reuters = combine_headlines_descriptions(reuters, 'Reuters')

In [None]:
# Give the Guardian frame the same 'Combined' / 'Source' layout as the other
# sources; only its 'Headlines' column is used as the text here.
guardian['Combined'] = guardian['Headlines'].values
guardian = guardian.drop(columns=['Headlines'])
guardian['Source'] = ['Guardian'] * len(guardian)

In [None]:
# Stack the three sources into one frame. ignore_index=True gives the result a
# fresh 0..n-1 index; without it each source keeps its own row labels, so the
# combined frame would carry duplicate index labels.
combined_df = pd.concat([guardian, cnbc, reuters], ignore_index=True)

In [None]:
# Preview the first five rows of the stacked headlines.
combined_df.head(5)

In [None]:
# Persist the stacked headlines; the index is not written to the file.
combined_df.to_csv(path_or_buf='FData/CombinedHeadlines.csv', index=False)

In [None]:
# Build one row per calendar day holding all text published that day, joined
# with single spaces. Columns are ['Headlines', 'Time'] on a fresh 0..n-1 index.
#
# - A single groupby pass replaces re-filtering the whole frame once per date.
# - sort=False keeps days in order of first appearance in combined_df.
# - Rows with a missing date or missing text are skipped first: a NaT date can
#   never match an equality filter (it would only yield an empty-text row), and
#   a NaN text would make ' '.join raise TypeError.
grouped_df = (
    combined_df
    .dropna(subset=['Time', 'Combined'])
    .groupby('Time', sort=False)['Combined']
    .agg(' '.join)
    .rename('Headlines')
    .reset_index()
    [['Headlines', 'Time']]
)

In [None]:
# Display the per-day headlines ordered by ascending 'Time'.
# NOTE(review): the sorted frame is only displayed, not assigned back, so
# grouped_df itself keeps its original row order after this cell.
grouped_df.sort_values('Time', ascending = True)

In [None]:
# Load the daily SPY prices. This file is the one written by the Alpha Vantage
# cells at the end of the notebook, so it must already exist on disk.
spy = pd.read_csv(filepath_or_buffer='FData/SPYDaily.csv')

In [None]:
# Display the raw SPY price frame as loaded from the CSV.
spy

In [None]:
# Keep only SPY rows dated from 2017-12-17 up to, but not including, 2020-07-18.
# 'Time' is compared against string bounds here — presumably it holds ISO
# 'YYYY-MM-DD' text as read from the CSV, which orders correctly; verify.
in_window = (spy['Time'] >= '2017-12-17') & (spy['Time'] < '2020-07-18')
spy_seg = spy[in_window]

In [None]:
# Display the SPY rows that fall inside the selected date window.
spy_seg

In [None]:
# Re-index both frames by their 'Time' values parsed as datetimes. set_index
# on the column name removes 'Time' from the columns, leaving it only as the
# index, so the two frames can be aligned on the date.
spy_seg = spy_seg.assign(Time=pd.to_datetime(spy_seg['Time'])).set_index('Time')
grouped_df = grouped_df.assign(Time=pd.to_datetime(grouped_df['Time'])).set_index('Time')

In [None]:
# Place headlines and prices side by side, keeping only the dates that appear
# in both indexes (inner join).
spy_grouped = pd.concat(
    [grouped_df, spy_seg],
    axis='columns',
    join='inner',
)

In [None]:
# Preview the first five joined headline/price rows.
spy_grouped.head()

In [None]:
# diff() subtracts the previous *row*, not the previous date, so the rows must
# be in chronological order first. Nothing earlier in the notebook stores a
# sorted frame, so sort by the datetime index here.
spy_grouped = spy_grouped.sort_index()
# Change in close versus the previous row (the previous date present in the join).
spy_grouped['CloseDiff'] = spy_grouped['4. close'].diff()
# The earliest row has no predecessor, so its diff is NaN; dropna() removes it,
# along with any other row that has a missing value in any column.
spy_grouped = spy_grouped.dropna()

In [None]:
# Preview the first five rows now that 'CloseDiff' has been added.
spy_grouped.head(5)

In [None]:
# 'CloseDiffNew' holds the *next* row's CloseDiff: shift(-1) moves every value
# up one row and leaves NaN on the last row, which has no successor.
# Unlike building the shifted list by hand, this also works when the frame has
# no rows (a hand-built list would then have one element for zero rows).
spy_grouped = spy_grouped.assign(CloseDiffNew=spy_grouped['CloseDiff'].shift(-1))

In [None]:
# Display the joined frame, including the 'CloseDiff' and 'CloseDiffNew' columns.
spy_grouped

In [None]:
def get_target(x):
    """Turn a numeric change into a binary label: 0 if it is negative, otherwise 1."""
    # The test stays `x < 0`, so every value for which that comparison is False
    # (zero, positive numbers, and NaN) is labelled 1.
    return 0 if x < 0 else 1

# Label each row from 'CloseDiffNew' (the next row's close change), which is
# the shifted-diff column built in the cells above.
# na_action='ignore' passes missing values through untouched, so the last row
# (whose next change is unknown) stays missing instead of being labelled 1.
# The nullable 'Int64' dtype keeps the labels as integers 0/1 while still
# allowing that missing entry.
spy_grouped['Target'] = (
    spy_grouped['CloseDiffNew']
    .map(get_target, na_action='ignore')
    .astype('Int64')
)

In [None]:
# Remove rows that still contain a missing value — in particular the last row,
# whose 'CloseDiffNew' is NaN because there is no following row. dropna()
# returns a new frame, so it has to be assigned back for later cells (and the
# CSV export) to see the change.
spy_grouped = spy_grouped.dropna()
spy_grouped

In [None]:
# Replace the numbered price column labels with plain names.
column_names = {'4. close': 'Close', '5. volume': 'Volume'}
spy_grouped = spy_grouped.rename(columns=column_names)

In [None]:
# Check the first two rows after the column rename.
spy_grouped.head(n=2)

In [None]:
# Unconditional failure: raises AssertionError every time this cell runs, so a
# "Run All" stops here and the cells below only run when executed by hand.
# NOTE(review): presumably a deliberate stop before the export and the
# Alpha Vantage download — confirm before removing.
assert False

In [None]:
# Write the text, closing price and label columns to disk; the index is
# written as well, since to_csv is called without index=False.
export_columns = ['Headlines', 'Close', 'Target']
spy_grouped[export_columns].to_csv('FData/SPYHeadGrouped.csv')

## Alpha Vantage

In [None]:
import config
from alpha_vantage.timeseries import TimeSeries

In [None]:
# Symbol to download, and the API key read from the local config module so
# that the key itself is not written in the notebook.
ticker = 'SPY'
api_key = config.api_key

In [None]:
# Alpha Vantage time-series client, configured through output_format to return
# pandas objects.
ts = TimeSeries(key = api_key, output_format = 'pandas')

# Request the daily series for `ticker`; the call's result is unpacked into a
# data object and a metadata object.
# NOTE(review): outputsize='full' presumably asks for the complete available
# history rather than a recent window — confirm against the Alpha Vantage docs.
data_ts, meta_ts = ts.get_daily(symbol = ticker, outputsize = 'full')

In [None]:
# Copy the index into a regular 'Time' column so it is still present when the
# frame is saved without its index in the next cell.
data_ts = data_ts.assign(Time=data_ts.index)

In [None]:
# Save the downloaded prices; this is the file the SPY loading cell earlier in
# the notebook reads. The index is not written to the file.
data_ts.to_csv(path_or_buf='FData/SPYDaily.csv', index=False)