In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
cnbc = pd.read_csv('FData/Headlines/cnbc_headlines.csv')
guardian = pd.read_csv('FData/Headlines/guardian_headlines.csv')
reuters = pd.read_csv('FData/Headlines/reuters_headlines.csv')
stocker_bot = pd.read_csv('FData/Headlines/stockerbot-export1.csv')

In [3]:
#convert the times for each to datetime format
cnbc['Time'] = pd.to_datetime(cnbc['Time']).dt.normalize()
guardian['Time'] = pd.to_datetime(guardian['Time'], errors = 'coerce').dt.normalize()
reuters['Time'] = pd.to_datetime(reuters['Time'], errors = 'coerce').dt.normalize()



In [4]:
cnbc.head(5)

Unnamed: 0,Headlines,Time,Description
0,Jim Cramer: A better way to invest in the Covi...,2020-07-17,"""Mad Money"" host Jim Cramer recommended buying..."
1,Cramer's lightning round: I would own Teradyne,2020-07-17,"""Mad Money"" host Jim Cramer rings the lightnin..."
2,,NaT,
3,"Cramer's week ahead: Big week for earnings, ev...",2020-07-17,"""We'll pay more for the earnings of the non-Co..."
4,IQ Capital CEO Keith Bliss says tech and healt...,2020-07-17,"Keith Bliss, IQ Capital CEO, joins ""Closing Be..."


In [5]:
def combine_headlines_descriptions(df, source): 
    df = df.dropna(subset = ['Time'])
    headlines = np.array([[i] for i in df.Headlines.values])
    descriptions = np.array([[i] for i in df.Description.values])
    combined = np.concatenate((headlines, descriptions), axis = 1)
    new_combined = []
    for i in combined: 
        new_combined.append(' '.join(i))
    df['Combined'] = new_combined
    df = df.drop(['Headlines', 'Description'], axis = 1)
    df['Source'] = [source for i in range(len(df))]
    return df
    
cnbc = combine_headlines_descriptions(cnbc, source = 'CNBC')
reuters = combine_headlines_descriptions(reuters, source = 'Reuters')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Combined'] = new_combined


In [6]:
guardian['Combined'] = guardian.Headlines.values
guardian = guardian.drop(['Headlines'], axis = 1)
guardian['Source'] = ['Guardian' for i in range(len(guardian))]

In [7]:
combined_df = pd.concat([guardian, cnbc, reuters])

In [8]:
combined_df.head()

Unnamed: 0,Time,Combined,Source
0,2020-07-18,Johnson is asking Santa for a Christmas recovery,Guardian
1,2020-07-18,‘I now fear the worst’: four grim tales of wor...,Guardian
2,2020-07-18,Five key areas Sunak must tackle to serve up e...,Guardian
3,2020-07-18,Covid-19 leaves firms ‘fatally ill-prepared’ f...,Guardian
4,2020-07-18,The Week in Patriarchy \r\n\r\n\r\n Bacardi'...,Guardian


In [9]:
combined_df.to_csv('FData/CombinedHeadlines.csv', index = False)

In [10]:
grouped_df = pd.DataFrame()
unique_dates = combined_df.Time.unique()
grouped_headlines = []
for date in unique_dates: 
    temp_df = combined_df[combined_df.Time == date]
    headlines = temp_df.Combined.values 
    combined_headlines = ' '.join(headlines)
    grouped_headlines.append(combined_headlines)
     
    
grouped_df['Headlines'] = grouped_headlines 
grouped_df['Time'] = unique_dates

In [11]:
grouped_df.sort_values('Time', ascending = True)

Unnamed: 0,Headlines,Time
772,Peter Preston on press and broadcasting \r\n\...,2017-12-17
771,France saves Marquis de Sade’s 120 Days of Sod...,2017-12-18
770,House prices to fall in London and south-east ...,2017-12-19
769,Hedge funds fail to stop 'billion-dollar brain...,2017-12-20
768,Guardian Brexit watch \r\n\r\n\r\n Brexit he...,2017-12-21
...,...,...
3,'Incredible' Boohoo denying knowledge of facto...,2020-07-15
2,Canary Wharf traders and landlord bank on retu...,2020-07-16
1,In search of a new economics for Covid-19 era ...,2020-07-17
0,Johnson is asking Santa for a Christmas recove...,2020-07-18


In [12]:
spy = pd.read_csv('FData/SPYDaily.csv')

In [13]:
spy

Unnamed: 0,1. open,2. high,3. low,4. close,5. volume,Time
0,345.9300,345.9900,343.1300,345.7800,49143931.0,2020-10-23
1,342.9600,345.2400,340.6500,344.6100,55399292.0,2020-10-22
2,343.3300,348.6847,342.4000,342.7300,63574979.0,2020-10-21
3,343.4600,346.8800,342.6400,343.3800,60051880.0,2020-10-20
4,348.6500,349.3300,341.0400,342.0100,68425614.0,2020-10-19
...,...,...,...,...,...,...
5275,138.6250,139.1093,136.7812,137.8750,7431500.0,1999-11-05
5276,136.7500,137.3593,135.7656,136.5312,7907500.0,1999-11-04
5277,136.0000,136.3750,135.1250,135.5000,7222300.0,1999-11-03
5278,135.9687,137.2500,134.5937,134.5937,6516900.0,1999-11-02


In [14]:
spy_seg = spy[(spy.Time >= '2017-12-17') & (spy.Time < '2020-07-18')]

In [15]:
spy_seg

Unnamed: 0,1. open,2. high,3. low,4. close,5. volume,Time
69,321.88,322.57,319.74,321.72,64421802.0,2020-07-17
70,319.79,321.28,319.09,320.79,54433414.0,2020-07-16
71,322.41,323.04,319.27,321.85,86921534.0,2020-07-15
72,313.30,319.76,312.00,318.92,92791839.0,2020-07-14
73,320.13,322.71,314.13,314.84,102549097.0,2020-07-13
...,...,...,...,...,...,...
713,267.60,267.64,266.90,267.51,78720873.0,2017-12-22
714,267.74,268.39,267.30,267.58,67032339.0,2017-12-21
715,268.27,268.33,266.69,267.03,76751500.0,2017-12-20
716,268.48,268.53,267.09,267.17,82382876.0,2017-12-19


In [16]:
spy_seg = spy_seg.set_index(pd.to_datetime(spy_seg.Time)).drop('Time', axis = 1)
grouped_df = grouped_df.set_index(pd.to_datetime(grouped_df.Time)).drop('Time', axis = 1)

In [17]:
spy_grouped = pd.concat([grouped_df, spy_seg], axis = 1, join = 'inner')

In [18]:
spy_grouped.head(5)

Unnamed: 0_level_0,Headlines,1. open,2. high,3. low,4. close,5. volume
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-07-17,In search of a new economics for Covid-19 era ...,321.88,322.57,319.74,321.72,64421802.0
2020-07-16,Canary Wharf traders and landlord bank on retu...,319.79,321.28,319.09,320.79,54433414.0
2020-07-15,'Incredible' Boohoo denying knowledge of facto...,322.41,323.04,319.27,321.85,86921534.0
2020-07-14,British Airways to sell art collection to ease...,313.3,319.76,312.0,318.92,92791839.0
2020-07-13,Offshore wind energy investment quadruples des...,320.13,322.71,314.13,314.84,102549097.0


In [19]:
spy_grouped = spy_grouped.sort_values(by = 'Time')
spy_grouped['CloseDiff'] = spy_grouped['4. close'].diff()
# spy_grouped.dropna(inplace = True)

In [20]:
spy_grouped.head()

Unnamed: 0_level_0,Headlines,1. open,2. high,3. low,4. close,5. volume,CloseDiff
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017-12-18,France saves Marquis de Sade’s 120 Days of Sod...,268.1,268.6,267.98,268.2,83653572.0,
2017-12-19,House prices to fall in London and south-east ...,268.48,268.53,267.09,267.17,82382876.0,-1.03
2017-12-20,Hedge funds fail to stop 'billion-dollar brain...,268.27,268.33,266.69,267.03,76751500.0,-0.14
2017-12-21,Guardian Brexit watch \r\n\r\n\r\n Brexit he...,267.74,268.39,267.3,267.58,67032339.0,0.55
2017-12-22,Steelworkers face huge pension cuts as Tata co...,267.6,267.64,266.9,267.51,78720873.0,-0.07


In [21]:
close_diff = []
for diff in spy_grouped.CloseDiff.values[1:]: 
    close_diff.append(diff)
close_diff.append(None)
spy_grouped['CloseDiffNew'] = close_diff

In [22]:
spy_grouped

Unnamed: 0_level_0,Headlines,1. open,2. high,3. low,4. close,5. volume,CloseDiff,CloseDiffNew
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-12-18,France saves Marquis de Sade’s 120 Days of Sod...,268.10,268.60,267.98,268.20,83653572.0,,-1.03
2017-12-19,House prices to fall in London and south-east ...,268.48,268.53,267.09,267.17,82382876.0,-1.03,-0.14
2017-12-20,Hedge funds fail to stop 'billion-dollar brain...,268.27,268.33,266.69,267.03,76751500.0,-0.14,0.55
2017-12-21,Guardian Brexit watch \r\n\r\n\r\n Brexit he...,267.74,268.39,267.30,267.58,67032339.0,0.55,-0.07
2017-12-22,Steelworkers face huge pension cuts as Tata co...,267.60,267.64,266.90,267.51,78720873.0,-0.07,-0.32
...,...,...,...,...,...,...,...,...
2020-07-13,Offshore wind energy investment quadruples des...,320.13,322.71,314.13,314.84,102549097.0,-2.75,4.08
2020-07-14,British Airways to sell art collection to ease...,313.30,319.76,312.00,318.92,92791839.0,4.08,2.93
2020-07-15,'Incredible' Boohoo denying knowledge of facto...,322.41,323.04,319.27,321.85,86921534.0,2.93,-1.06
2020-07-16,Canary Wharf traders and landlord bank on retu...,319.79,321.28,319.09,320.79,54433414.0,-1.06,0.93


In [23]:
def get_target(x): 
    if x < 0: 
        return 0 
    elif x >= 0: 
        return 1
    else: 
        return 0

spy_grouped['Target'] = spy_grouped.CloseDiffNew.map(get_target)

In [24]:
spy_grouped = spy_grouped.dropna(subset = ['CloseDiffNew'])

In [25]:
spy_grouped.head(2)

Unnamed: 0_level_0,Headlines,1. open,2. high,3. low,4. close,5. volume,CloseDiff,CloseDiffNew,Target
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2017-12-18,France saves Marquis de Sade’s 120 Days of Sod...,268.1,268.6,267.98,268.2,83653572.0,,-1.03,0
2017-12-19,House prices to fall in London and south-east ...,268.48,268.53,267.09,267.17,82382876.0,-1.03,-0.14,0


In [26]:
spy_grouped['DayDiff'] = spy_grouped['4. close'] - spy_grouped['1. open'] 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spy_grouped['DayDiff'] = spy_grouped['4. close'] - spy_grouped['1. open']


In [27]:
spy_grouped

Unnamed: 0_level_0,Headlines,1. open,2. high,3. low,4. close,5. volume,CloseDiff,CloseDiffNew,Target,DayDiff
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2017-12-18,France saves Marquis de Sade’s 120 Days of Sod...,268.10,268.60,267.98,268.20,83653572.0,,-1.03,0,0.10
2017-12-19,House prices to fall in London and south-east ...,268.48,268.53,267.09,267.17,82382876.0,-1.03,-0.14,0,-1.31
2017-12-20,Hedge funds fail to stop 'billion-dollar brain...,268.27,268.33,266.69,267.03,76751500.0,-0.14,0.55,1,-1.24
2017-12-21,Guardian Brexit watch \r\n\r\n\r\n Brexit he...,267.74,268.39,267.30,267.58,67032339.0,0.55,-0.07,0,-0.16
2017-12-22,Steelworkers face huge pension cuts as Tata co...,267.60,267.64,266.90,267.51,78720873.0,-0.07,-0.32,0,-0.09
...,...,...,...,...,...,...,...,...,...,...
2020-07-10,Estate agents in England report surge of inter...,314.31,317.88,312.76,317.59,57454436.0,3.21,-2.75,0,3.28
2020-07-13,Offshore wind energy investment quadruples des...,320.13,322.71,314.13,314.84,102549097.0,-2.75,4.08,1,-5.29
2020-07-14,British Airways to sell art collection to ease...,313.30,319.76,312.00,318.92,92791839.0,4.08,2.93,1,5.62
2020-07-15,'Incredible' Boohoo denying knowledge of facto...,322.41,323.04,319.27,321.85,86921534.0,2.93,-1.06,0,-0.56


In [28]:
spy_grouped = spy_grouped.rename(columns = {'5. volume': 'Volume'})

In [29]:
spy_grouped.to_csv('FData/SPYHeadGrouped.csv')

In [30]:
assert False

AssertionError: 

## Alpha Vantage

In [None]:
from PyFiles import config
from alpha_vantage.timeseries import TimeSeries

In [None]:
api_key = config.api_key
ticker = 'SPY'

In [None]:
ts = TimeSeries(key = api_key, output_format = 'pandas')

data_ts, meta_ts = ts.get_daily(symbol = ticker, outputsize = 'full')

In [None]:
data_ts['Time'] = data_ts.index

In [None]:
data_ts.to_csv('FData/SPYDaily.csv', index = False)