In [1]:
#import usual packages for data manipulation
import pandas as pd
import numpy as np

#import usual packages for data visualization
import matplotlib.pyplot as plt
import seaborn as sns

#import yfinance to get stock data
import yfinance as yf

#import quantstats to get some statistics on the stock data
import quantstats as qs

#import statistical packages
import statsmodels.api as sm
import scipy.stats as scs

#import datetime to get the current date
import datetime as dt

import json

#set the style of the plots
plt.style.use('seaborn-v0_8-whitegrid')

In [163]:
df = pd.read_csv('../data/aapl_news_data.csv')
df

Unnamed: 0,date,title,url,time_published,authors,summary,banner_image,source,category_within_source,source_domain,topics,overall_sentiment_score,overall_sentiment_label,ticker_sentiment
0,1970-01-01 00:00:00.000000000,Consumer Tech News ( Aug 7-Aug 10 ) : Walt Di...,https://www.benzinga.com/news/24/08/40303794/c...,20240811T170019,['Lekha Gupta'],Palantir Technologies Inc. PLTR reported secon...,https://cdn.benzinga.com/files/images/story/20...,Benzinga,News,www.benzinga.com,"[{'topic': 'Manufacturing', 'relevance_score':...",0.077353,Neutral,"[{'ticker': 'MSFT', 'relevance_score': '0.2535..."
1,1970-01-01 00:00:00.000000001,Should Apple Acquire Peloton in 2024? 2 Things...,https://www.fool.com/investing/2024/08/11/appl...,20240811T163000,['Neil Patel'],The tech titan would be Peloton's white knight.,https://g.foolcdn.com/editorial/images/786311/...,Motley Fool,,www.fool.com,"[{'topic': 'Earnings', 'relevance_score': '0.3...",0.224844,Somewhat-Bullish,"[{'ticker': 'AAPL', 'relevance_score': '0.8581..."
2,1970-01-01 00:00:00.000000002,Google Is Ruled a Monopoly. Should Investors D...,https://www.fool.com/investing/2024/08/11/is-a...,20240811T145300,['Geoffrey Seiler'],Why dumping the stock may not be a good idea.,https://g.foolcdn.com/image/?url=https%3A%2F%2...,Motley Fool,,www.fool.com,"[{'topic': 'Earnings', 'relevance_score': '0.5...",0.107870,Neutral,"[{'ticker': 'MSFT', 'relevance_score': '0.3470..."
3,1970-01-01 00:00:00.000000003,Warren Buffett Declares Shift: Selling Apple S...,https://www.benzinga.com/markets/24/08/4030341...,20240811T144519,['LaToya Scott'],"Warren Buffett, arguably America's most succes...",https://cdn.benzinga.com/files/images/story/20...,Benzinga,Markets,www.benzinga.com,"[{'topic': 'IPO', 'relevance_score': '0.158519...",0.237613,Somewhat-Bullish,"[{'ticker': 'XIACY', 'relevance_score': '0.060..."
4,1970-01-01 00:00:00.000000004,"Arm Holdings Plummets 40% Amid the Sell-Off, I...",https://www.fool.com/investing/2024/08/11/arm-...,20240811T144000,['Bradley Guichard'],Arm has terrific results and an awesome busine...,https://media.ycharts.com/charts/3e52d9a64a560...,Motley Fool,,www.fool.com,"[{'topic': 'IPO', 'relevance_score': '0.158519...",0.244481,Somewhat-Bullish,"[{'ticker': 'SSNLF', 'relevance_score': '0.064..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
678,1970-01-01 00:00:00.000000678,Apple Poised For Revenue Boost: Analyst Sees A...,https://www.benzinga.com/analyst-ratings/analy...,20240710T184814,['Surbhi Jain'],Apple Inc. AAPL is not just coasting on its ic...,https://cdn.benzinga.com/files/images/story/20...,Benzinga,Trading,www.benzinga.com,"[{'topic': 'Earnings', 'relevance_score': '0.9...",0.362916,Bullish,"[{'ticker': 'AAPL', 'relevance_score': '0.7394..."
679,1970-01-01 00:00:00.000000679,Samsung Takes On Apple With New Galaxy Ring An...,https://www.benzinga.com/news/24/07/39716525/s...,20240710T184327,['Shivani Kumaresan'],Samsung Electronics Co Ltd SSNLF is stepping u...,https://cdn.benzinga.com/files/images/story/20...,Benzinga,General,www.benzinga.com,"[{'topic': 'Technology', 'relevance_score': '1...",0.271440,Somewhat-Bullish,"[{'ticker': 'SSNLF', 'relevance_score': '0.575..."
680,1970-01-01 00:00:00.000000680,"Microsoft, Apple drop OpenAI board seat plans ...",https://www.business-standard.com/technology/t...,20240710T180659,['Bloomberg'],Microsoft Corp. and Apple Inc. dropped plans t...,https://bsmedia.business-standard.com/_media/b...,Business Standard,GoogleRSS,www.business-standard.com,"[{'topic': 'Technology', 'relevance_score': '0...",0.090407,Neutral,"[{'ticker': 'MSFT', 'relevance_score': '0.4950..."
681,1970-01-01 00:00:00.000000681,Arm Holdings ( ARM ) Hits 52-Week High: What...,https://www.zacks.com/stock/news/2299502/arm-h...,20240710T173200,['Shuvra Shankar Dey'],Given the recent surge in Arm Holdings (ARM) s...,https://staticx-tuner.zacks.com/images/article...,Zacks Commentary,,www.zacks.com,"[{'topic': 'IPO', 'relevance_score': '0.158519...",0.413174,Bullish,"[{'ticker': 'NVDA', 'relevance_score': '0.1460..."


In [164]:
#delete unnecessary columns
df = df[['time_published', 'ticker_sentiment']]
#convert the time_published column to datetime, keeping only the date
df['time_published'] = pd.to_datetime(df['time_published']).dt.date
#decode the ticker_sentiment column using json
df['ticker_sentiment'] = df['ticker_sentiment'].apply(lambda x: json.loads(x.replace("'", '"')))
#group by the time_published column and sum the ticker_sentiment column
df = df.groupby('time_published').sum()
#make a list of scores for our chosen ticker (in this case, 'AAPL')
ticker = 'AAPL'
scores = []
for i in range(len(df)):
    temp = pd.DataFrame(df['ticker_sentiment'].iloc[i])
    temp = temp[temp['ticker'] == ticker]
    wts = temp['relevance_score'].astype(float)
    raw_scores = temp['ticker_sentiment_score'].astype(float)
    scores.append(np.dot(wts,raw_scores) / wts.sum())
df[f'{ticker}_sentiment_score'] = np.array(scores)
df.drop('ticker_sentiment',axis=1,inplace=True)
#set the time_published column as the index
df.set_index('time_published', inplace=True)
df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['time_published'] = pd.to_datetime(df['time_published']).dt.date
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ticker_sentiment'] = df['ticker_sentiment'].apply(lambda x: json.loads(x.replace("'", '"')))


KeyError: "None of ['time_published'] are in the columns"

In [161]:
aapl_scores=[]
for i in range(len(test)):
    temp = pd.DataFrame(test['ticker_sentiment'].iloc[i])
    temp = temp[temp['ticker'] == 'AAPL']
    wts = temp['relevance_score'].astype(float)
    raw_scores = temp['ticker_sentiment_score'].astype(float)
    aapl_scores.append(np.dot(wts,raw_scores) / wts.sum())
test['aapl_scores'] = np.array(aapl_scores)
test.drop('ticker_sentiment',axis=1,inplace=True)
test

Unnamed: 0_level_0,aapl_scores
time_published,Unnamed: 1_level_1
2024-07-10,0.321363
2024-07-11,0.148588
2024-07-12,0.306306
2024-07-13,0.257771
2024-07-14,0.210373
2024-07-15,0.213543
2024-07-16,0.227167
2024-07-17,0.17758
2024-07-18,0.226706
2024-07-19,0.216899


In [116]:
blah = pd.DataFrame(test['ticker_sentiment'].iloc[0])
blah = blah.loc[blah['ticker'] == 'AAPL']
blah['relevance_score'] = blah['relevance_score'].astype(float)
blah['ticker_sentiment_score'] = blah['ticker_sentiment_score'].astype(float)
blah['wtd_sentiment_scored'] = blah['relevance_score'] * blah['ticker_sentiment_score']
print(blah['wtd_sentiment_scored'].sum()/blah['relevance_score'].sum())

0.3213631968896938


In [127]:
list(test['ticker_sentiment'])

[[{'ticker': 'MSFT',
   'relevance_score': '0.118647',
   'ticker_sentiment_score': '0.038845',
   'ticker_sentiment_label': 'Neutral'},
  {'ticker': 'NVDA',
   'relevance_score': '0.345674',
   'ticker_sentiment_score': '0.224338',
   'ticker_sentiment_label': 'Somewhat-Bullish'},
  {'ticker': 'AAPL',
   'relevance_score': '0.234684',
   'ticker_sentiment_score': '0.147779',
   'ticker_sentiment_label': 'Neutral'},
  {'ticker': 'IBKR',
   'relevance_score': '0.118647',
   'ticker_sentiment_score': '-0.044588',
   'ticker_sentiment_label': 'Neutral'},
  {'ticker': 'SBUX',
   'relevance_score': '0.118647',
   'ticker_sentiment_score': '0.19719',
   'ticker_sentiment_label': 'Somewhat-Bullish'},
  {'ticker': 'MSFT',
   'relevance_score': '0.524949',
   'ticker_sentiment_score': '0.352156',
   'ticker_sentiment_label': 'Bullish'},
  {'ticker': 'GOOG',
   'relevance_score': '0.249106',
   'ticker_sentiment_score': '0.144106',
   'ticker_sentiment_label': 'Neutral'},
  {'ticker': 'META',
  

In [129]:
aapl_sent = pd.DataFrame(index=test.index, columns=['sentiment_score'])
for i, d in enumerate(test['ticker_sentiment']):
    temp = pd.DataFrame([t for t in d if t['ticker']=='AAPL'])
    wts = temp['relevance_score'].astype(float)
    raw_scores = temp['ticker_sentiment_score'].astype(float)
    aapl_sent.loc[i,'sentiment_score'] = np.dot(wts, raw_scores)/wts.sum()
aapl_sent

Unnamed: 0_level_0,sentiment_score
time_published,Unnamed: 1_level_1
2024-07-10 00:00:00,
2024-07-11 00:00:00,
2024-07-12 00:00:00,
2024-07-13 00:00:00,
2024-07-14 00:00:00,
...,...
28,0.210054
29,0.095913
30,0.0829
31,0.274147


In [96]:
test2 = 
test2.index = pd.to_datetime(test2.index)
test2


Unnamed: 0_level_0,ticker_sentiment
time_published,Unnamed: 1_level_1
2024-07-10,"[{'ticker': 'MSFT', 'relevance_score': '0.1186..."
2024-07-11,"[{'ticker': 'AAPL', 'relevance_score': '0.8699..."
2024-07-12,"[{'ticker': 'MSFT', 'relevance_score': '0.1434..."
2024-07-13,"[{'ticker': 'MSTR', 'relevance_score': '0.1725..."
2024-07-14,"[{'ticker': 'MSFT', 'relevance_score': '0.4222..."
2024-07-15,"[{'ticker': 'NFLX', 'relevance_score': '0.0243..."
2024-07-16,"[{'ticker': 'AAPL', 'relevance_score': '0.0503..."
2024-07-17,"[{'ticker': 'AAPL', 'relevance_score': '0.6514..."
2024-07-18,"[{'ticker': 'NFLX', 'relevance_score': '0.0227..."
2024-07-19,"[{'ticker': 'YAMCF', 'relevance_score': '0.023..."


In [104]:
for i in range(len(test2)):
    temp = pd.DataFrame(test2['ticker_sentiment'][i])

[{'ticker': 'MSFT',
  'relevance_score': '0.118647',
  'ticker_sentiment_score': '0.038845',
  'ticker_sentiment_label': 'Neutral'},
 {'ticker': 'NVDA',
  'relevance_score': '0.345674',
  'ticker_sentiment_score': '0.224338',
  'ticker_sentiment_label': 'Somewhat-Bullish'},
 {'ticker': 'AAPL',
  'relevance_score': '0.234684',
  'ticker_sentiment_score': '0.147779',
  'ticker_sentiment_label': 'Neutral'},
 {'ticker': 'IBKR',
  'relevance_score': '0.118647',
  'ticker_sentiment_score': '-0.044588',
  'ticker_sentiment_label': 'Neutral'},
 {'ticker': 'SBUX',
  'relevance_score': '0.118647',
  'ticker_sentiment_score': '0.19719',
  'ticker_sentiment_label': 'Somewhat-Bullish'},
 {'ticker': 'MSFT',
  'relevance_score': '0.524949',
  'ticker_sentiment_score': '0.352156',
  'ticker_sentiment_label': 'Bullish'},
 {'ticker': 'GOOG',
  'relevance_score': '0.249106',
  'ticker_sentiment_score': '0.144106',
  'ticker_sentiment_label': 'Neutral'},
 {'ticker': 'META',
  'relevance_score': '0.188193'

In [86]:
test2 = test.groupby('time_published').sum()
for i in range(len(test2)):
    
tickers = ['AAPL', 'AMZN', 'GOOGL', 'MSFT', 'TSLA', 'NVDA', 'META']
# test2['ticker_sentiment'] = test2['ticker_sentiment'].apply(lambda x: pd.DataFrame([i for i in x if i['ticker'].isin(tickers)]))
# test2

In [89]:
test2.index = pd.to_datetime(test2.index)
test2

Unnamed: 0_level_0,ticker_sentiment
time_published,Unnamed: 1_level_1
2024-07-10,"[{'ticker': 'MSFT', 'relevance_score': '0.1186..."
2024-07-11,"[{'ticker': 'AAPL', 'relevance_score': '0.8699..."
2024-07-12,"[{'ticker': 'MSFT', 'relevance_score': '0.1434..."
2024-07-13,"[{'ticker': 'MSTR', 'relevance_score': '0.1725..."
2024-07-14,"[{'ticker': 'MSFT', 'relevance_score': '0.4222..."
2024-07-15,"[{'ticker': 'NFLX', 'relevance_score': '0.0243..."
2024-07-16,"[{'ticker': 'AAPL', 'relevance_score': '0.0503..."
2024-07-17,"[{'ticker': 'AAPL', 'relevance_score': '0.6514..."
2024-07-18,"[{'ticker': 'NFLX', 'relevance_score': '0.0227..."
2024-07-19,"[{'ticker': 'YAMCF', 'relevance_score': '0.023..."


In [93]:
test2['ticker_sentiment'] = test2['ticker_sentiment'].apply(lambda x: pd.DataFrame(x))
test2.iloc[0]

ticker_sentiment       ticker relevance_score ticker_sentiment_sco...
Name: 2024-07-10 00:00:00, dtype: object

In [58]:
blah = test.loc[test['time_published']==dt.date(2024,8,11)]
check = pd.DataFrame(blah['ticker_sentiment'].values.tolist())

Unnamed: 0,time_published,ticker_sentiment
0,2024-08-11,"[{'ticker': 'MSFT', 'relevance_score': '0.2535..."
1,2024-08-11,"[{'ticker': 'AAPL', 'relevance_score': '0.8581..."
2,2024-08-11,"[{'ticker': 'MSFT', 'relevance_score': '0.3470..."
3,2024-08-11,"[{'ticker': 'XIACY', 'relevance_score': '0.060..."
4,2024-08-11,"[{'ticker': 'SSNLF', 'relevance_score': '0.064..."
5,2024-08-11,"[{'ticker': 'MSFT', 'relevance_score': '0.0343..."
6,2024-08-11,"[{'ticker': 'MSFT', 'relevance_score': '0.1204..."
7,2024-08-11,"[{'ticker': 'GOOG', 'relevance_score': '0.2071..."
8,2024-08-11,"[{'ticker': 'AAPL', 'relevance_score': '0.9359..."
9,2024-08-11,"[{'ticker': 'GOOG', 'relevance_score': '0.1144..."


In [64]:
aa = pd.concat

In [62]:
def list_merger(lists):
    return [].extend(i for i in lists)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,"{'ticker': 'MSFT', 'relevance_score': '0.25351...","{'ticker': 'SNEJF', 'relevance_score': '0.1028...","{'ticker': 'GOOG', 'relevance_score': '0.20407...","{'ticker': 'META', 'relevance_score': '0.10288...","{'ticker': 'NVDA', 'relevance_score': '0.10288...","{'ticker': 'AAPL', 'relevance_score': '0.15379...","{'ticker': 'CRWD', 'relevance_score': '0.15379...","{'ticker': 'RDDT', 'relevance_score': '0.10288...","{'ticker': 'BMBL', 'relevance_score': '0.10288...","{'ticker': 'DAL', 'relevance_score': '0.153797...","{'ticker': 'PLTR', 'relevance_score': '0.10288..."
1,"{'ticker': 'AAPL', 'relevance_score': '0.85818...","{'ticker': 'PTON', 'relevance_score': '0.12289...",,,,,,,,,
2,"{'ticker': 'MSFT', 'relevance_score': '0.34705...","{'ticker': 'SSNLF', 'relevance_score': '0.1022...","{'ticker': 'GOOG', 'relevance_score': '0.47938...","{'ticker': 'AAPL', 'relevance_score': '0.20279...",,,,,,,
3,"{'ticker': 'XIACY', 'relevance_score': '0.0608...","{'ticker': 'AAPL', 'relevance_score': '0.45859...","{'ticker': 'BRK-A', 'relevance_score': '0.3530...",,,,,,,,
4,"{'ticker': 'SSNLF', 'relevance_score': '0.0649...","{'ticker': 'NVDA', 'relevance_score': '0.06497...","{'ticker': 'AAPL', 'relevance_score': '0.06497...","{'ticker': 'ARM', 'relevance_score': '0.193207...","{'ticker': 'PLTR', 'relevance_score': '0.12951...","{'ticker': 'TSM', 'relevance_score': '0.129516...",,,,,
5,"{'ticker': 'MSFT', 'relevance_score': '0.03433...","{'ticker': 'TGT', 'relevance_score': '0.481477...","{'ticker': 'AAPL', 'relevance_score': '0.03433...","{'ticker': 'CLX', 'relevance_score': '0.424207...","{'ticker': 'WMT', 'relevance_score': '0.481477...",,,,,,
6,"{'ticker': 'MSFT', 'relevance_score': '0.12043...","{'ticker': 'GOOG', 'relevance_score': '0.29515...","{'ticker': 'META', 'relevance_score': '0.23813...","{'ticker': 'NVDA', 'relevance_score': '0.06038...","{'ticker': 'AAPL', 'relevance_score': '0.06038...","{'ticker': 'TSLA', 'relevance_score': '0.06038...","{'ticker': 'AMZN', 'relevance_score': '0.06038...",,,,
7,"{'ticker': 'GOOG', 'relevance_score': '0.20712...","{'ticker': 'AAPL', 'relevance_score': '0.7877'...","{'ticker': 'BRK-A', 'relevance_score': '0.1561...",,,,,,,,
8,"{'ticker': 'AAPL', 'relevance_score': '0.93595...","{'ticker': 'MS', 'relevance_score': '0.098255'...",,,,,,,,,
9,"{'ticker': 'GOOG', 'relevance_score': '0.11440...","{'ticker': 'META', 'relevance_score': '0.05735...","{'ticker': 'NVDA', 'relevance_score': '0.57127...","{'ticker': 'AAPL', 'relevance_score': '0.22647...",,,,,,,


In [None]:
def merger(x):
    X = pd.concat([pd.DataFrame(t) for t in x)

In [61]:
blah2 = test['ticker_sentiment'].groupby('time_published').apply(lambda x: pd.DataFrame([]+[t for t in x]))

KeyError: 'time_published'

In [46]:
blah = pd.DataFrame(el)
tickers = ['AAPL', 'AMZN', 'GOOGL', 'MSFT', 'TSLA', 'NVDA', 'META']
blah.loc[blah['ticker'].isin(tickers)]

Unnamed: 0,ticker,relevance_score,ticker_sentiment_score,ticker_sentiment_label
0,MSFT,0.253514,0.104171,Neutral
3,META,0.102888,0.165482,Somewhat-Bullish
4,NVDA,0.102888,-0.215333,Somewhat-Bearish
5,AAPL,0.153797,0.25674,Somewhat-Bullish


In [37]:
check = [el[i]['ticker'] for i in range(len(el))]
check

['MSFT',
 'SNEJF',
 'GOOG',
 'META',
 'NVDA',
 'AAPL',
 'CRWD',
 'RDDT',
 'BMBL',
 'DAL',
 'PLTR']

In [34]:
#merge the dicistionaries in el into a single dictionary
el2 = {k: v for d in el for k, v in d.items()}

In [35]:
el2

{'ticker': 'PLTR',
 'relevance_score': '0.102888',
 'ticker_sentiment_score': '-0.047429',
 'ticker_sentiment_label': 'Neutral'}

In [16]:
tickers = ['AAPL', 'AMZN', 'GOOGL', 'MSFT', 'TSLA', 'NVDA', 'META']


str