# Import Libraries

In [52]:
import os,sys,re,glob,ipykernel,tweepy,stockmarket,nltk,collections,itertools,pandas as pd,numpy as np,\
        seaborn as sns, yfinance as yf, matplotlib.pyplot as plt, statsmodels.formula.api as smf,\
        statsmodels.api as sm, autoreload, importlib
from pathlib import Path
from string import punctuation 
from datetime import date
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
np.random.seed(0)
pd.set_option('display.max_columns', None)

## Set Working Directory: 
    
* /Sentiment_Analysis 
    
* __ file __ isn't available in jupyter notebooks

    

In [53]:
file = os.getcwd().split(os.sep)
while(file[-1] != 'Sentiment_Analysis'): # Check the working directory
    os.chdir('..')
    file = os.getcwd().split(os.sep)
    sys.path.append(os.path.abspath(os.getcwd()))
print(f"root directory: {os.getcwd()}", sep = '\n')

root directory: c:\Code\Public_Github\Sentiment_Analysis


## Load Custom Functions

In [54]:
from src import user_download_helper, user_download, merge_files, merge_all, \
                strip_all_words, sentence_word_probability, download_todays_test, \
                format_model,linear_model, naive_bayes,create_target

# Twitter API Credentials

In [55]:
# Read in keys from a csv file
autentication_path = os.path.abspath('../Sentiment_Analysis/Stock_Market/authentication/authentication_tokens.csv')
readin_authentication = pd.read_csv(autentication_path, header=0, sep=',')

consumer_key = readin_authentication['consumer_key'][0]
consumer_secret = readin_authentication['consumer_secret'][0]
access_token = readin_authentication['access_token'][0]
access_token_secret = readin_authentication['access_token_secret'][0]
bearer_token = readin_authentication['beaker_token'][0]

# connect to twitter application 
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
redirect_url = auth.get_authorization_url()
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth, wait_on_rate_limit = True)

# Load Twitter Usernames

    Note:
    * Unvarified user's are not a problem, no one user can have the same ID
    * 'DayTradeWarrior' account was removed because twitter account was removed
    * Removing infrequent twitter users 
        'AswathDamodaran' min() 2010-01-25
        'cstewartcfa_twitter' min() 2013-06-19
        'BobPisani_twitter' min() 2015-11-04

In [56]:
with open(os.path.normpath(os.getcwd() + '/Stock_Market/user_list/user_list.xlsx'), 'rb') as f:
    user_df = pd.read_excel(f, sheet_name='user_names')
    user_df = user_df.where(pd.notnull(user_df), '')
    f.close()
user_df

Unnamed: 0,short_term,long_term,controversial
0,DanZanger,jimcramer,elonmusk
1,prrobbins,KennethLFisher,JeffBezos
2,markminervini,lei_zhang_lz,BillGates
3,bsc_daily,realwillmeade,
4,MITickWatcher,RayDalio,
5,OptionsProVol,GRDecter,
6,script_crypto,andrewrsorkin,
7,MarketMagnifier,EconguyRosie,
8,TwentyonTwenty_,,
9,WatcherGuru,,


## Download Tweets

### WARNING ~ 10 minutes
    If previously loaded SKIP to CHECKPOINT 
    * Download User tweets into csv spreadsheets 

- ( Tweepy limit of 3200 tweets per user )


    

In [57]:
groups = list(user_df.columns)
for group in groups:
    print(f"\n{group}:\n")
    users = list(user_df[group][user_df[group]!= ''])
    user_download(api, users, group)
    print(f"")

short_term:

DanZanger prrobbins markminervini bsc_daily MITickWatcher OptionsProVol script_crypto MarketMagnifier TwentyonTwenty_ WatcherGuru DipFinding MacroCharts techbudsolution eWhispers HindenburgRes JehoshaphatRsch ResearchGrizzly biancoresearch muddywatersre 
long_term:

jimcramer KennethLFisher lei_zhang_lz realwillmeade RayDalio GRDecter andrewrsorkin EconguyRosie 
controversial:

elonmusk JeffBezos BillGates 


## Merge Tweets

In [58]:
merge = []
for group in groups:
    merge.append(merge_files(group, display = 0))
df_short_term,df_long_term  = merge[0],merge[1]  
df_all = merge_all('merge/merged_twitter_users', display = 0)

size of merged data sets of short_term: (45963, 7)
size of merged data sets of long_term: (22543, 7)
size of merged data sets of controversial: (6405, 7)
size of merged data sets of merged_twitter_users: (74911, 7)


In [59]:
df_all.head(2)

Unnamed: 0,id,created_at,user,favorite_count,retweet_count,url,text
0,1620137574722576386,2023-01-30 19:11:13+00:00,WatcherGuru_twitter,188,48,https://twitter.com/i/web/status/1620137574722...,
1,1620132784416374784,2023-01-30 18:52:11+00:00,WatcherGuru_twitter,3176,731,https://twitter.com/i/web/status/1620132784416...,JUST IN Montenegros central bank partners with...


In [60]:
display(df_all.info(verbose = True, null_counts = None, show_counts=None))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 74911 entries, 0 to 6404
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype              
---  ------          --------------  -----              
 0   id              74911 non-null  int64              
 1   created_at      74911 non-null  datetime64[ns, UTC]
 2   user            74911 non-null  object             
 3   favorite_count  74911 non-null  int64              
 4   retweet_count   74911 non-null  int64              
 5   url             74911 non-null  object             
 6   text            72672 non-null  object             
dtypes: datetime64[ns, UTC](1), int64(3), object(3)
memory usage: 4.6+ MB


None

- Some Analysts have infrequent tweets and have 2k limit of tweets going farther back

In [61]:
df_all.groupby('user')['created_at'].min().sort_values(ascending= True).head(5)

user
BillGates_twitter         2013-07-03 17:04:11+00:00
JeffBezos_twitter         2015-11-24 11:14:26+00:00
DanZanger_twitter         2017-04-26 15:09:50+00:00
techbudsolution_twitter   2017-04-30 16:25:18+00:00
HindenburgRes_twitter     2017-07-31 17:54:11+00:00
EconguyRosie_twitter      2018-01-09 14:41:40+00:00
eWhispers_twitter         2018-03-12 13:51:17+00:00
lei_zhang_lz_twitter      2019-03-29 23:54:52+00:00
MacroCharts_twitter       2019-04-23 10:16:36+00:00
ResearchGrizzly_twitter   2019-04-26 07:52:59+00:00
andrewrsorkin_twitter     2019-05-10 11:01:21+00:00
script_crypto_twitter     2019-07-09 06:11:27+00:00
RayDalio_twitter          2020-04-17 14:57:16+00:00
muddywatersre_twitter     2021-01-27 03:33:03+00:00
prrobbins_twitter         2021-02-06 14:11:35+00:00
JehoshaphatRsch_twitter   2021-03-30 21:43:40+00:00
realwillmeade_twitter     2021-06-01 18:19:19+00:00
MarketMagnifier_twitter   2021-10-10 09:42:11+00:00
KennethLFisher_twitter    2021-10-25 15:39:10+00:00
jimcram

In [62]:
# Adding nonessential twitter words to remove
stop = nltk.corpus.stopwords.words("english") 
twitter_nonessential_words = ['twitter', 'birds','lists','list', 'source','just','am','pm'\
                              'a','b','c','d','e','f','g','h','i','j','k','l','m','n',\
                              'n','o','p','q','r','s','t','u','v','w','x','y','z']
stop.extend(twitter_nonessential_words) # merge two lists together
stop = sorted(list( dict.fromkeys(stop) )) # remove duplicates

### Create dictionarys of words 
* Remove unnecessary words
* Generate frequency of words per sentence

In [63]:
df_all_words = strip_all_words(df_all, stop)
df_all_words_count = df_all_words.explode().replace("", np.nan,regex=True).dropna() # drop NAN's and empty words
all_count = df_all_words_count.value_counts()

In [64]:
print(f"Tweets of Dictionaries: {len(df_all_words)}")
print(f"all words: {len(df_all_words_count)}")
print(f"Dictionary of all words: {len(all_count)}")

Tweets of Dictionaries: 74911
all words: 1033635
Dictionary of all words: 45789


In [65]:
print(f"5 words from dictionary of all words:\n{all_count[0:5]}", end='\n\n')

5 words from dictionary of all words:
stocks    8749
stock     8684
today     6069
score     5790
top       5633
Name: text, dtype: int64



In [66]:
print(f"All the words in each individual Sentence:\n{df_all_words[0:4]}")

All the words in each individual Sentence:
0                                                  NaN
1    [, montenegros, central, bank, partners, rippl...
2                                                  NaN
3    [, begins, applying, regulatory, licenses, ent...
Name: text, dtype: object


    Note the [','] and 'NaN' variables
* Nan is a placeholder for tweets w/ images
* [','] are words removed with special cases

# Probability small example

p = count(particular word in sentence) / (total particular word in all sentences) * 100 / (total of all unique words)

d{  hat:1, sandwich:2, lemon:1, orange:1, snorkle:1 }

n = LEN(d.KEYS())  -> n = 5

Tweet1: hat sandwich lemon 

Tweet2: snorkle sandwich orange 

Tweet1:

-> 1/1 * 100 + 1/2 * 100 + 1/1 * 100   
-> 100 + 50 + 100 = 250
-> 250/5 = 50%

Tweet2:

-> 1/1 * 100 + 1/2 * 100 + 1/1 * 100 
-> 100 + 50 + 100 
-> 250/5 = 50%

## Probability of individual tweets

In [67]:
# Probabilities
sentence_list, total_probability, individual_probability = sentence_word_probability(all_count, df_all_words)
print(f'sum of probability column = {sum(total_probability)}')

sum of probability column = 100.00000000000077


In [68]:
df_all_prob = df_all.reset_index()
df_all_prob['frequency'] = sentence_list
df_all_prob['probability'] = total_probability
df_all_prob = df_all_prob.dropna()
df_all_prob.insert(loc = 0, column = 'date', value = pd.to_datetime(df_all_prob['created_at']).dt.date.astype('datetime64[ns]'))
df_all_prob = df_all_prob.sort_values(by=['date'], ascending=False).drop(columns=['index'])

In [69]:
df_all_prob.head(2)

Unnamed: 0,date,id,created_at,user,favorite_count,retweet_count,url,text,frequency,probability
1,2023-01-30,1620132784416374784,2023-01-30 18:52:11+00:00,WatcherGuru_twitter,3176,731,https://twitter.com/i/web/status/1620132784416...,JUST IN Montenegros central bank partners with...,"[{'montenegros': 100.0, 'central': 0.729927007...",0.002369
19364,2023-01-30,1620131441345691649,2023-01-30 18:46:50+00:00,MITickWatcher_twitter,0,0,https://twitter.com/i/web/status/1620131441345...,Options Flow Stream Update TSLA GOOGL AAPL IWM...,"[{'options': 0.042844901456726654, 'flow': 0.0...",6.6e-05


In [70]:
df_wide1 = df_all_prob.pivot_table(index='date', values=['favorite_count','retweet_count'], aggfunc='sum',fill_value=0 ).sort_values(by='date',ascending=False)
df_wide2 = df_all_prob.pivot_table(index='date', columns=['user'], values=['probability'], aggfunc='sum',fill_value=0 ).sort_values(by='date',ascending=False).droplevel(0, axis=1) 
df_wide_merge = pd.merge(df_wide1, df_wide2, how='inner', on='date')

- Merging Sat/Sun Tweets to Monday and re-merging to data

In [71]:
# Drop Saturday-Monday And replace with Monday
week_end_mask = df_wide_merge.reset_index().date.dt.day_name().isin(['Saturday', 'Sunday', 'Monday'])
week_end = df_wide_merge.reset_index().loc[week_end_mask, :]
monday_group = week_end.groupby([pd.Grouper(key='date', freq='W-MON')])[df_wide_merge.columns].sum().reset_index('date')

df_wide_stripped = df_wide_merge.reset_index().loc[~ week_end_mask, :]
df_wide = pd.merge(df_wide_stripped, monday_group, how='outer').set_index('date')
df_wide.head(5)

Unnamed: 0_level_0,favorite_count,retweet_count,BillGates_twitter,DanZanger_twitter,DipFinding_twitter,EconguyRosie_twitter,GRDecter_twitter,HindenburgRes_twitter,JeffBezos_twitter,JehoshaphatRsch_twitter,KennethLFisher_twitter,MITickWatcher_twitter,MacroCharts_twitter,MarketMagnifier_twitter,OptionsProVol_twitter,RayDalio_twitter,ResearchGrizzly_twitter,TwentyonTwenty__twitter,WatcherGuru_twitter,andrewrsorkin_twitter,biancoresearch_twitter,bsc_daily_twitter,eWhispers_twitter,elonmusk_twitter,jimcramer_twitter,lei_zhang_lz_twitter,markminervini_twitter,muddywatersre_twitter,prrobbins_twitter,realwillmeade_twitter,script_crypto_twitter,techbudsolution_twitter
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1
2023-01-27,1278718,142111,4.9e-05,0.0,0.097707,0.0,0.033271,0.0,0.0,0.0,0.034505,0.1819,0.0,0.0,0.016015,0.001596,0.0,0.033076,0.008787,0.0,0.005711,0.024401,0.013924,0.017579,0.006725,0.007637,0.0,0.002215,0.002423,0.0,0.00195,0.006427
2023-01-26,1040114,77156,0.0,0.0,0.197029,0.005456,0.027104,4.1e-05,0.0,0.0,0.011706,0.29936,0.0,0.0,0.019391,0.008844,0.0,0.040344,0.001391,0.0,0.00432,0.012394,0.010334,0.018335,0.008085,0.0,0.00583,0.0,0.005106,0.0,0.004298,0.0
2023-01-25,2242837,200328,0.001602,0.0,0.157943,0.0,0.01795,0.209883,0.0,0.0,0.002921,0.271388,0.0,0.0,0.01807,0.003522,0.0,0.02582,0.004947,0.0,0.028554,0.008567,0.015599,0.022848,0.028921,0.0,0.003004,3.9e-05,0.0001,0.010597,0.005929,0.001175
2023-01-24,216155,32984,0.008563,0.000921,0.116539,0.00457,0.018339,0.000156,0.0,0.0,0.007047,0.235193,0.0,0.0,0.018989,0.002384,0.0,0.03518,0.007654,0.0,0.005167,0.014348,0.029109,0.005057,0.00751,0.002236,0.0,0.001496,0.002897,0.001047,0.003115,0.004672
2023-01-20,1184608,107340,0.0,0.0,0.102943,0.002073,0.026789,0.0,0.0,0.0,0.012116,0.162795,0.0,0.0,0.017036,0.00451,0.0,0.041354,0.007347,0.0,0.00251,0.032797,0.001549,0.035256,0.022436,7.2e-05,0.006296,0.007504,0.001406,0.004088,0.00113,0.0


In [80]:
path_all_merged_twitter_analysts_pivot = f'../Sentiment_Analysis/Stock_Market/data/merge/all_merged_twitter_users' # Create Folders
if not os.path.exists(path_all_merged_twitter_analysts_pivot):
    os.makedirs(path_all_merged_twitter_analysts_pivot)
df_wide.to_csv(path_all_merged_twitter_analysts_pivot +'/all_merged_twitter_users_pivot.csv', index=True) # Export to csv
df_wide.head(5)

Unnamed: 0_level_0,favorite_count,retweet_count,BillGates_twitter,DanZanger_twitter,DipFinding_twitter,EconguyRosie_twitter,GRDecter_twitter,HindenburgRes_twitter,JeffBezos_twitter,JehoshaphatRsch_twitter,KennethLFisher_twitter,MITickWatcher_twitter,MacroCharts_twitter,MarketMagnifier_twitter,OptionsProVol_twitter,RayDalio_twitter,ResearchGrizzly_twitter,TwentyonTwenty__twitter,WatcherGuru_twitter,andrewrsorkin_twitter,biancoresearch_twitter,bsc_daily_twitter,eWhispers_twitter,elonmusk_twitter,jimcramer_twitter,lei_zhang_lz_twitter,markminervini_twitter,muddywatersre_twitter,prrobbins_twitter,realwillmeade_twitter,script_crypto_twitter,techbudsolution_twitter
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1
2023-01-27,1278718,142111,4.9e-05,0.0,0.097707,0.0,0.033271,0.0,0.0,0.0,0.034505,0.1819,0.0,0.0,0.016015,0.001596,0.0,0.033076,0.008787,0.0,0.005711,0.024401,0.013924,0.017579,0.006725,0.007637,0.0,0.002215,0.002423,0.0,0.00195,0.006427
2023-01-26,1040114,77156,0.0,0.0,0.197029,0.005456,0.027104,4.1e-05,0.0,0.0,0.011706,0.29936,0.0,0.0,0.019391,0.008844,0.0,0.040344,0.001391,0.0,0.00432,0.012394,0.010334,0.018335,0.008085,0.0,0.00583,0.0,0.005106,0.0,0.004298,0.0
2023-01-25,2242837,200328,0.001602,0.0,0.157943,0.0,0.01795,0.209883,0.0,0.0,0.002921,0.271388,0.0,0.0,0.01807,0.003522,0.0,0.02582,0.004947,0.0,0.028554,0.008567,0.015599,0.022848,0.028921,0.0,0.003004,3.9e-05,0.0001,0.010597,0.005929,0.001175
2023-01-24,216155,32984,0.008563,0.000921,0.116539,0.00457,0.018339,0.000156,0.0,0.0,0.007047,0.235193,0.0,0.0,0.018989,0.002384,0.0,0.03518,0.007654,0.0,0.005167,0.014348,0.029109,0.005057,0.00751,0.002236,0.0,0.001496,0.002897,0.001047,0.003115,0.004672
2023-01-20,1184608,107340,0.0,0.0,0.102943,0.002073,0.026789,0.0,0.0,0.0,0.012116,0.162795,0.0,0.0,0.017036,0.00451,0.0,0.041354,0.007347,0.0,0.00251,0.032797,0.001549,0.035256,0.022436,7.2e-05,0.006296,0.007504,0.001406,0.004088,0.00113,0.0


## Note
    
- Could fill index fund weekend data with previous close on Friday with " resample('D').ffill() "

In [73]:
# downloding index fund's or stock tickers  #.resample('D').ffill()
how_far_back = df_wide.index.min().date()
today = date.today()

stock_list = ['^GSPC', '^IXIC', '^DJI', '^RUT']
stock_str = ' '.join( stock_list )
index_funds_df = yf.download(stock_str, how_far_back, today, interval = '1d')['Close'].reset_index('Date').rename(columns= {'Date':'date',
                                                                                                        '^GSPC': 'SandP_500',
                                                                                                        '^IXIC': 'NASDAQ',
                                                                                                        '^RUT': 'RUSSEL',
                                                                                                         '^DJI': 'DOW_JONES'})
index_funds_df
convert_dict = {'date': 'datetime64[ns]',
                'SandP_500': 'float64',
                'NASDAQ':'float64',
                'DOW_JONES': 'float64',
                'RUSSEL': 'float64'}
index_funds_df = index_funds_df.astype(convert_dict)

print(f'today: {today} -> to {how_far_back}')


[*********************100%***********************]  4 of 4 completed
today: 2023-01-30 -> to 2013-07-03


In [74]:
path_index_funds_merge = f'../Sentiment_Analysis/Stock_Market/data/merge/all_merged_index_funds' # Create Folders
if not os.path.exists(path_index_funds_merge):
    os.makedirs(path_index_funds_merge)
index_funds_df.to_csv(path_index_funds_merge +'/all_merged_index_funds.csv', index=False) # Export to csv
index_funds_df.head(5)

Unnamed: 0,date,DOW_JONES,SandP_500,NASDAQ,RUSSEL
0,2013-07-03,14988.370117,1615.410034,3443.669922,991.130005
1,2013-07-05,15135.839844,1631.890015,3479.379883,1005.390015
2,2013-07-08,15224.69043,1640.459961,3484.830078,1009.25
3,2013-07-09,15300.339844,1652.319946,3504.26001,1018.049988
4,2013-07-10,15291.660156,1652.619995,3520.76001,1020.419983


In [83]:
# Merging the probabilities of words used from twitter and database of index funds
df_merge = pd.merge(index_funds_df, df_wide, how='inner', on='date').set_index('date')
df_merge_original = df_merge.copy()
# Want to Normalize the index funds and favorite_count / retweet_count
df_merge.DOW_JONES = (df_merge.DOW_JONES - df_merge.DOW_JONES.min()) / (df_merge.DOW_JONES.max() - df_merge.DOW_JONES.min())   
df_merge.SandP_500 = (df_merge.SandP_500 - df_merge.SandP_500.min()) / (df_merge.SandP_500.max() - df_merge.SandP_500.min())
df_merge.NASDAQ = (df_merge.NASDAQ - df_merge.NASDAQ.min()) / (df_merge.NASDAQ.max() - df_merge.NASDAQ.min())
df_merge.RUSSEL = (df_merge.RUSSEL - df_merge.RUSSEL.min()) / (df_merge.RUSSEL.max() - df_merge.RUSSEL.min())
df_merge.favorite_count = (df_merge.favorite_count - df_merge.favorite_count.min()) / (df_merge.favorite_count.max() - df_merge.favorite_count.min())
df_merge.retweet_count = (df_merge.retweet_count - df_merge.retweet_count.min()) / (df_merge.retweet_count.max() - df_merge.retweet_count.min())

In [76]:
path_twitter_and_index_fund = f'../Sentiment_Analysis/Stock_Market/data/merge/combined'
if not os.path.exists(path_twitter_and_index_fund):
    os.makedirs(path_twitter_and_index_fund)
df_merge.to_csv(path_twitter_and_index_fund +'/index_funds_and_twitter_analysts.csv') # Export to csv

### CHECKPOINT    
    Load Merged Twitter and Index fund df

In [77]:
path_twitter_and_index_fund = f'../Sentiment_Analysis/Stock_Market/data/merge/combined'
df_merge = pd.read_csv(path_twitter_and_index_fund +'/index_funds_and_twitter_analysts.csv').set_index('date')
df_merge.head()

Unnamed: 0_level_0,DOW_JONES,SandP_500,NASDAQ,RUSSEL,favorite_count,retweet_count,BillGates_twitter,DanZanger_twitter,DipFinding_twitter,EconguyRosie_twitter,GRDecter_twitter,HindenburgRes_twitter,JeffBezos_twitter,JehoshaphatRsch_twitter,KennethLFisher_twitter,MITickWatcher_twitter,MacroCharts_twitter,MarketMagnifier_twitter,OptionsProVol_twitter,RayDalio_twitter,ResearchGrizzly_twitter,TwentyonTwenty__twitter,WatcherGuru_twitter,andrewrsorkin_twitter,biancoresearch_twitter,bsc_daily_twitter,eWhispers_twitter,elonmusk_twitter,jimcramer_twitter,lei_zhang_lz_twitter,markminervini_twitter,muddywatersre_twitter,prrobbins_twitter,realwillmeade_twitter,script_crypto_twitter,techbudsolution_twitter
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1
2013-07-03,0.009619,0.0,0.0,0.025124,1.1e-05,0.000412,0.001857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2013-07-10,0.02339,0.011697,0.006112,0.044795,0.000102,0.001489,0.00214,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2013-07-15,0.032136,0.02109,0.012987,0.06016,2.5e-05,0.000612,0.00695,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2013-07-16,0.030664,0.019128,0.012275,0.057105,1.3e-05,0.000285,0.002847,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2013-07-18,0.035055,0.023249,0.013288,0.064841,1e-05,0.000163,0.002232,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Prediction of Today 
* ( Between 0 & 1)

In [84]:
# Model Build For Each index fund
df_SandP_500 = create_target(df_merge.copy(), day = 5, ticker = "SandP_500" ) # Create a y value for 5 day averages
lm_SandP_500 = linear_model(df_SandP_500, split=0.20, summary = False)

df_DOW_JONES = create_target(df_merge.copy(), day = 5, ticker = "DOW_JONES" ) # Create a y value for 5 day averages
lm_DOW_JONES = linear_model(df_DOW_JONES, split=0.20, summary = False)

df_NASDAQ = create_target(df_merge.copy(), day = 5, ticker = "NASDAQ" ) # Create a y value for 5 day averages
lm_NASDAQ = linear_model(df_NASDAQ, split=0.20, summary = False)

df_RUSSEL = create_target(df_merge.copy(), day = 5, ticker = "RUSSEL" ) # Create a y value for 5 day averages
lm_RUSSEL = linear_model(df_RUSSEL, split=0.20, summary = False)

# Todays Data
path_all_merged_twitter_users_pivot = f'../Sentiment_Analysis/Stock_Market/data/merge/all_merged_twitter_users'
df_wide = pd.read_csv(path_all_merged_twitter_users_pivot +'/all_merged_twitter_users_pivot.csv').astype({'date': 'datetime64[ns]'}).set_index('date')
stock_list = ['^GSPC', '^IXIC', '^DJI', '^RUT']
todays_test = download_todays_test(stock_list, df_wide, df_merge_original)
Xnew = sm.add_constant(todays_test, has_constant='add')

SandP_500_pred = lm_SandP_500.predict(Xnew)
DOW_JONES_pred = lm_DOW_JONES.predict(Xnew)
NASDAQ_pred = lm_NASDAQ.predict(Xnew)
RUSSEL_pred = lm_RUSSEL.predict(Xnew)

print(f"date: { SandP_500_pred.index[0].date() }\n\
SandP_500: { SandP_500_pred[0] }\n\
DOW_JONES: { DOW_JONES_pred[0] }\n\
NASDAQ: { NASDAQ_pred[0] }\n\
RUSSEL: { RUSSEL_pred[0] }")

[*********************100%***********************]  4 of 4 completed
date: 2023-01-30
SandP_500: 0.8854612169353373
DOW_JONES: 0.8706127469957934
NASDAQ: 0.7509534756650527
RUSSEL: 0.883371027673495
