# Import Libraries

In [1]:
import os,sys,re,glob,ipykernel,tweepy,stockmarket,nltk,collections,itertools,pandas as pd,numpy as np,\
        seaborn as sns, yfinance as yf, matplotlib.pyplot as plt, statsmodels.formula.api as smf,\
        statsmodels.api as sm, autoreload, importlib
from pathlib import Path
from string import punctuation 
from datetime import date
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
np.random.seed(0)
pd.set_option('display.max_columns', None)

## Set Working Directory: 
    
* /Sentiment_Analysis 
    
* __ file __ isn't available in jupyter notebooks

    

In [2]:
file = os.getcwd().split(os.sep)
while(file[-1] != 'Sentiment_Analysis'): # Check the working directory
    os.chdir('..')
    file = os.getcwd().split(os.sep)
    sys.path.append(os.path.abspath(os.getcwd()))
print(f"root directory: {os.getcwd()}", sep = '\n')

root directory: c:\Code\Public_Github\Sentiment_Analysis


## Load Custom Functions

In [3]:
from src import user_download_helper, user_download, merge_files, merge_all, \
                strip_all_words, sentence_word_probability, download_todays_test, \
                format_model,linear_model, naive_bayes, create_target, normalize_columns, normalize_columns_target

# Twitter API Credentials

In [4]:
# Read in keys from a csv file
autentication_path = os.path.abspath('../Sentiment_Analysis/Stock_Market/authentication/authentication_tokens.csv')
readin_authentication = pd.read_csv(autentication_path, header=0, sep=',')

consumer_key = readin_authentication['consumer_key'][0]
consumer_secret = readin_authentication['consumer_secret'][0]
access_token = readin_authentication['access_token'][0]
access_token_secret = readin_authentication['access_token_secret'][0]
bearer_token = readin_authentication['beaker_token'][0]

# connect to twitter application 
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
redirect_url = auth.get_authorization_url()
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth, wait_on_rate_limit = True)

# Load Twitter Usernames

    Note:
    * Unvarified user's are not a problem, no one user can have the same ID
    
<div style="padding-left: 50px;">

| Removed User's | reason | 
| ------------ | ------------- |
|DayTradeWarrior|account removed |
|AswathDamodaran |2013-06-19 |
|cstewartcfa_twitter |2013-06-19|
|BobPisani_twitter |2015-11-04| 
|elonmusk|private|
</div>



In [5]:
with open(os.path.normpath(os.getcwd() + '/Stock_Market/user_list/user_list.xlsx'), 'rb') as f:
    user_df = pd.read_excel(f, sheet_name='user_names')
    user_df = user_df.where(pd.notnull(user_df), '')
    f.close()
groups = list(user_df.columns)
user_df

Unnamed: 0,short_term,long_term,controversial
0,DanZanger,jimcramer,JeffBezos
1,prrobbins,KennethLFisher,BillGates
2,markminervini,lei_zhang_lz,
3,bsc_daily,realwillmeade,
4,MITickWatcher,RayDalio,
5,OptionsProVol,GRDecter,
6,script_crypto,andrewrsorkin,
7,MarketMagnifier,EconguyRosie,
8,TwentyonTwenty_,,
9,WatcherGuru,,


## Download Tweets

### WARNING ~ 10 minutes
    If previously loaded SKIP to CHECKPOINT 
    * Download User tweets into csv spreadsheets 

- ( Tweepy limit of 3200 tweets per user )


    

In [8]:
for group in groups:
    print(f"\n{group}:\n")
    users = list(user_df[group][user_df[group]!= ''])
    user_download(api, users, group)
    print(f"")


short_term:



KeyboardInterrupt: 

## Merge Tweets

In [6]:
merge = []
for group in groups:
    merge.append(merge_files(group, display = 0))
df_short_term,df_long_term  = merge[0],merge[1]  
df_all = merge_all('merge/merged_twitter_users', display = 0)

size of merged data sets of short_term: (45975, 7)
size of merged data sets of long_term: (22559, 7)
size of merged data sets of controversial: (3387, 7)
size of merged data sets of merged_twitter_users: (71921, 7)


In [19]:
df_all.head(2)

Unnamed: 0,id,created_at,user,favorite_count,retweet_count,url,text
0,1620628855005511680,2023-01-31 22:43:23-05:00,WatcherGuru_twitter,190,33,https://twitter.com/i/web/status/1620628855005...,
1,1620579693983600640,2023-01-31 19:28:02-05:00,WatcherGuru_twitter,6617,861,https://twitter.com/i/web/status/1620579693983...,JUST IN Jim Cramer says were in a bull market ...


In [8]:
display(df_all.info(verbose = True, null_counts = None, show_counts=None))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 71921 entries, 0 to 3386
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              71921 non-null  int64 
 1   created_at      71921 non-null  object
 2   user            71921 non-null  object
 3   favorite_count  71921 non-null  int64 
 4   retweet_count   71921 non-null  int64 
 5   url             71921 non-null  object
 6   text            70049 non-null  object
dtypes: int64(3), object(4)
memory usage: 4.4+ MB


None

- Some Analysts have infrequent tweets and have 2k limit of tweets going farther back

In [20]:
df_all.groupby('user')['created_at'].min().sort_values(ascending= True).head(5)

user
BillGates_twitter          2013-07-03 13:04:11-04:00
JeffBezos_twitter          2015-11-24 06:14:26-05:00
DanZanger_twitter          2017-04-26 11:09:50-04:00
techbudsolution_twitter    2017-04-30 12:25:18-04:00
HindenburgRes_twitter      2017-07-31 13:54:11-04:00
Name: created_at, dtype: object

In [21]:
# Adding nonessential twitter words to remove
stop = nltk.corpus.stopwords.words("english") 
twitter_nonessential_words = ['twitter', 'birds','lists','list', 'source','just','am','pm'\
                              'a','b','c','d','e','f','g','h','i','j','k','l','m','n',\
                              'n','o','p','q','r','s','t','u','v','w','x','y','z']
stop.extend(twitter_nonessential_words) # merge two lists together
stop = sorted(list( dict.fromkeys(stop) )) # remove duplicates

### Create dictionarys of words 
* Remove unnecessary words
* Generate frequency of words per sentence

In [22]:
df_all_words = strip_all_words(df_all, stop)
df_all_words_count = df_all_words.explode().replace("", np.nan, regex=True).dropna() # drop NAN's and empty words
all_count = df_all_words_count.value_counts()

In [23]:
print(f"Tweets of Dictionaries: {len(df_all_words)}")
print(f"all words: {len(df_all_words_count)}")
print(f"Dictionary of all words: {len(all_count)}")

Tweets of Dictionaries: 71921
all words: 1017652
Dictionary of all words: 45033


In [24]:
print(f"5 words from dictionary of all words:\n{all_count[0:5]}", end='\n\n')

5 words from dictionary of all words:
stocks    8762
stock     8673
today     6048
score     5825
top       5630
Name: text, dtype: int64



In [25]:
print(f"All the words in each individual Sentence:\n{df_all_words[0:4]}")

All the words in each individual Sentence:
0                                                  NaN
1        [, jim, cramer, says, bull, market, buy, dip]
2                                                  NaN
3    [, chatgpt, creator, openai, releases, tool, d...
Name: text, dtype: object


    Note the [','] and 'NaN' variables
* Nan is a placeholder for tweets w/ images
* [','] are words removed with special cases

# Probability small example

p = count(particular word in sentence) / (total particular word in all sentences) * 100 / (total of all unique words)

d{  hat:1, sandwich:2, lemon:1, orange:1, snorkle:1 }

n = LEN(d.KEYS())  -> n = 5

Tweet1: hat sandwich lemon 

Tweet2: snorkle sandwich orange 

Tweet1:

-> 1/1 * 100 + 1/2 * 100 + 1/1 * 100   
-> 100 + 50 + 100 = 250
-> 250/5 = 50%

Tweet2:

-> 1/1 * 100 + 1/2 * 100 + 1/1 * 100 
-> 100 + 50 + 100 
-> 250/5 = 50%

## Probability of individual tweets

In [26]:
# Probabilities
sentence_list, total_probability, individual_probability = sentence_word_probability(all_count, df_all_words)
print(f'sum of probability column = {sum(total_probability)}')

sum of probability column = 99.99999999999832


In [48]:
df_all_prob = df_all.reset_index()
df_all_prob['frequency'] = sentence_list
df_all_prob['probability'] = total_probability
df_all_prob = df_all_prob.dropna()
df_all_prob.insert(loc = 0, column = 'date', value = pd.to_datetime(df_all_prob['created_at']).apply(lambda x: x.strftime('%Y-%m-%d')))
df_all_prob.date = pd.to_datetime(df_all_prob['date'], format='%Y-%m-%d')
df_all_prob = df_all_prob.sort_values(by=['date'], ascending=False).drop(columns=['index'])

In [54]:
df_all_prob.head(2)

Unnamed: 0,date,id,created_at,user,favorite_count,retweet_count,url,text,frequency,probability
1,2023-01-31,1620579693983600640,2023-01-31 19:28:02-05:00,WatcherGuru_twitter,6617,861,https://twitter.com/i/web/status/1620579693983...,JUST IN Jim Cramer says were in a bull market ...,"[{'jim': 1.282051282051282, 'cramer': 1.587301...",7.9e-05
3228,2023-01-31,1620600720813862912,2023-01-31 20:51:35-05:00,TwentyonTwenty__twitter,0,0,https://twitter.com/i/web/status/1620600720813...,HILS has trended 96 times in the past 24 hours...,"[{'hils': 2.4390243902439024, 'trended': 0.041...",0.000673


In [55]:
df_wide1 = df_all_prob.pivot_table(index='date', values=['favorite_count','retweet_count'], aggfunc='sum',fill_value=0 ).sort_values(by='date',ascending=False)
df_wide2 = df_all_prob.pivot_table(index='date', columns=['user'], values=['probability'], aggfunc='sum',fill_value=0 ).sort_values(by='date',ascending=False).droplevel(0, axis=1) 
df_wide_merge = pd.merge(df_wide1, df_wide2, how='inner', on='date')

- Merging Sat/Sun Tweets to Monday and re-merging to data

In [56]:
# Drop Saturday-Monday And replace with Monday
week_end_mask = df_wide_merge.reset_index().date.dt.day_name().isin(['Saturday', 'Sunday', 'Monday'])
week_end = df_wide_merge.reset_index().loc[week_end_mask, :]
monday_group = week_end.groupby([pd.Grouper(key='date', freq='W-MON')])[df_wide_merge.columns].sum().reset_index('date')

df_wide_stripped = df_wide_merge.reset_index().loc[~ week_end_mask, :]
df_wide = pd.merge(df_wide_stripped, monday_group, how='outer').set_index('date')
df_wide.head(5)

Unnamed: 0_level_0,favorite_count,retweet_count,BillGates_twitter,DanZanger_twitter,DipFinding_twitter,EconguyRosie_twitter,GRDecter_twitter,HindenburgRes_twitter,JeffBezos_twitter,JehoshaphatRsch_twitter,KennethLFisher_twitter,MITickWatcher_twitter,MacroCharts_twitter,MarketMagnifier_twitter,OptionsProVol_twitter,RayDalio_twitter,ResearchGrizzly_twitter,TwentyonTwenty__twitter,WatcherGuru_twitter,andrewrsorkin_twitter,biancoresearch_twitter,bsc_daily_twitter,eWhispers_twitter,jimcramer_twitter,lei_zhang_lz_twitter,markminervini_twitter,muddywatersre_twitter,prrobbins_twitter,realwillmeade_twitter,script_crypto_twitter,techbudsolution_twitter
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
2023-01-31,88960,11544,0.001227,0.0,0.098042,0.0,0.047432,0.0,0.004775,0.0,0.016196,0.250276,0.0,0.0,0.015573,0.00031,0.0,0.033493,0.005971,0.0,0.003709,0.035104,0.013689,0.005601,0.000203,0.001552,0.0,0.000162,0.044295,0.004937,0.005519
2023-01-27,104909,18089,5e-05,0.0,0.100147,0.0,0.036253,0.0,0.0,0.0,0.03655,0.177148,0.0,0.0,0.016302,0.001641,0.0,0.033853,0.007488,0.0,0.00717,0.024795,0.012019,0.00718,0.008023,0.0,0.002252,0.002466,0.0,0.00198,0.006539
2023-01-26,150375,26356,0.0,0.0,0.200485,0.005575,0.028691,4.2e-05,0.0,0.0,0.011991,0.305759,0.0,0.0,0.019718,0.009066,0.0,0.020702,0.005153,0.0,0.004399,0.012587,0.012938,0.008882,0.0,0.005972,0.0,0.003479,0.0,0.004369,0.0
2023-01-25,85699,13236,0.001683,0.0,0.159028,0.0,0.014698,0.0,0.0,0.0,0.002979,0.273836,0.0,0.0,0.018296,0.0037,0.0,0.046427,0.00273,0.0,0.028858,0.008757,0.015866,0.028819,0.0,0.003097,0.0,0.001806,0.009745,0.00605,0.000928
2023-01-24,285079,69954,0.008739,0.000975,0.118665,0.00466,0.021256,0.215444,0.0,0.0,0.007183,0.232013,0.0,0.0,0.018679,0.002453,0.0,0.02891,0.007292,0.0,0.005548,0.014595,0.018616,0.009024,0.002274,0.0,0.000378,0.002954,0.001078,0.003176,0.005032


In [57]:
path_all_merged_twitter_analysts_pivot = f'../Sentiment_Analysis/Stock_Market/data/merge/all_merged_twitter_users' # Create Folders
if not os.path.exists(path_all_merged_twitter_analysts_pivot):
    os.makedirs(path_all_merged_twitter_analysts_pivot)
df_wide.to_csv(path_all_merged_twitter_analysts_pivot +'/all_merged_twitter_users_pivot.csv', index=True) # Export to csv

df_wide.head(5)

Unnamed: 0_level_0,favorite_count,retweet_count,BillGates_twitter,DanZanger_twitter,DipFinding_twitter,EconguyRosie_twitter,GRDecter_twitter,HindenburgRes_twitter,JeffBezos_twitter,JehoshaphatRsch_twitter,KennethLFisher_twitter,MITickWatcher_twitter,MacroCharts_twitter,MarketMagnifier_twitter,OptionsProVol_twitter,RayDalio_twitter,ResearchGrizzly_twitter,TwentyonTwenty__twitter,WatcherGuru_twitter,andrewrsorkin_twitter,biancoresearch_twitter,bsc_daily_twitter,eWhispers_twitter,jimcramer_twitter,lei_zhang_lz_twitter,markminervini_twitter,muddywatersre_twitter,prrobbins_twitter,realwillmeade_twitter,script_crypto_twitter,techbudsolution_twitter
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
2023-01-31,88960,11544,0.001227,0.0,0.098042,0.0,0.047432,0.0,0.004775,0.0,0.016196,0.250276,0.0,0.0,0.015573,0.00031,0.0,0.033493,0.005971,0.0,0.003709,0.035104,0.013689,0.005601,0.000203,0.001552,0.0,0.000162,0.044295,0.004937,0.005519
2023-01-27,104909,18089,5e-05,0.0,0.100147,0.0,0.036253,0.0,0.0,0.0,0.03655,0.177148,0.0,0.0,0.016302,0.001641,0.0,0.033853,0.007488,0.0,0.00717,0.024795,0.012019,0.00718,0.008023,0.0,0.002252,0.002466,0.0,0.00198,0.006539
2023-01-26,150375,26356,0.0,0.0,0.200485,0.005575,0.028691,4.2e-05,0.0,0.0,0.011991,0.305759,0.0,0.0,0.019718,0.009066,0.0,0.020702,0.005153,0.0,0.004399,0.012587,0.012938,0.008882,0.0,0.005972,0.0,0.003479,0.0,0.004369,0.0
2023-01-25,85699,13236,0.001683,0.0,0.159028,0.0,0.014698,0.0,0.0,0.0,0.002979,0.273836,0.0,0.0,0.018296,0.0037,0.0,0.046427,0.00273,0.0,0.028858,0.008757,0.015866,0.028819,0.0,0.003097,0.0,0.001806,0.009745,0.00605,0.000928
2023-01-24,285079,69954,0.008739,0.000975,0.118665,0.00466,0.021256,0.215444,0.0,0.0,0.007183,0.232013,0.0,0.0,0.018679,0.002453,0.0,0.02891,0.007292,0.0,0.005548,0.014595,0.018616,0.009024,0.002274,0.0,0.000378,0.002954,0.001078,0.003176,0.005032


### CHECKPOINT    
    Load pivot data

In [6]:
path_all_merged_twitter_analysts_pivot = f'../Sentiment_Analysis/Stock_Market/data/merge/all_merged_twitter_users'
df_wide = pd.read_csv(path_all_merged_twitter_analysts_pivot +'/all_merged_twitter_users_pivot.csv').astype({'date':'datetime64[ns]'}).set_index('date')

In [7]:
with open(os.path.normpath(os.getcwd() + '/Stock_Market/ticker_list/ticker_list.xlsx'), 'rb') as f:
    ticker_df = pd.read_excel(f, sheet_name='ticker_sheet')
    ticker_df = ticker_df.where(pd.notnull(ticker_df), '')
    f.close()
ticker_df

Unnamed: 0,ticker_name,ticker_label
0,^GSPC,SandP_500
1,^IXIC,NASDAQ
2,^RUT,RUSSEL
3,^DJI,DOW_JONES


In [8]:
# downloding index fund's or stock tickers  #.resample('D').ffill()
how_far_back = df_wide.index.min().date()
today = date.today()
column_names = dict(zip(ticker_df.ticker_name, ticker_df.ticker_label))
column_names['Date']='date'
stock_list = list(ticker_df.ticker_name)
stock_str = ' '.join( stock_list )

index_funds_df = yf.download(stock_str, how_far_back, today, interval = '1d', progress=False)['Close'].reset_index('Date').rename(columns=column_names)

convert_dict = dict(zip(ticker_df.ticker_label, ['float64']*len(ticker_df.ticker_label)))
convert_dict['date'] = 'datetime64[ns]'
index_funds_df = index_funds_df.astype(convert_dict)

print(f'{how_far_back} -> {today}')


2013-07-03 -> 2023-02-01


In [9]:
path_index_funds_merge = f'../Sentiment_Analysis/Stock_Market/data/merge/all_merged_index_funds' # Create Folders
if not os.path.exists(path_index_funds_merge):
    os.makedirs(path_index_funds_merge)
index_funds_df.to_csv(path_index_funds_merge +'/all_merged_index_funds.csv', index=False) # Export to csv
index_funds_df.head(5)

Unnamed: 0,date,DOW_JONES,SandP_500,NASDAQ,RUSSEL
0,2013-07-03,14988.370117,1615.410034,3443.669922,991.130005
1,2013-07-05,15135.839844,1631.890015,3479.379883,1005.390015
2,2013-07-08,15224.69043,1640.459961,3484.830078,1009.25
3,2013-07-09,15300.339844,1652.319946,3504.26001,1018.049988
4,2013-07-10,15291.660156,1652.619995,3520.76001,1020.419983


In [10]:
# Merging the probabilities of words used from twitter and database of index funds
df_merge = pd.merge(index_funds_df, df_wide, how='inner', on='date').set_index('date')
df_merge_original = df_merge.copy()

columns = list(ticker_df.ticker_label) + ['favorite_count', 'retweet_count']
df_merge = normalize_columns(df_merge.copy(), columns)
df_merge.tail(5)

Unnamed: 0_level_0,DOW_JONES,SandP_500,NASDAQ,RUSSEL,favorite_count,retweet_count,BillGates_twitter,DanZanger_twitter,DipFinding_twitter,EconguyRosie_twitter,GRDecter_twitter,HindenburgRes_twitter,JeffBezos_twitter,JehoshaphatRsch_twitter,KennethLFisher_twitter,MITickWatcher_twitter,MacroCharts_twitter,MarketMagnifier_twitter,OptionsProVol_twitter,RayDalio_twitter,ResearchGrizzly_twitter,TwentyonTwenty__twitter,WatcherGuru_twitter,andrewrsorkin_twitter,biancoresearch_twitter,bsc_daily_twitter,eWhispers_twitter,jimcramer_twitter,lei_zhang_lz_twitter,markminervini_twitter,muddywatersre_twitter,prrobbins_twitter,realwillmeade_twitter,script_crypto_twitter,techbudsolution_twitter
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1
2023-01-25,0.861245,0.754699,0.623897,0.629004,0.171341,0.153948,0.001683,0.0,0.159028,0.0,0.014698,0.0,0.0,0.0,0.002979,0.273836,0.0,0.0,0.018296,0.0037,0.0,0.046427,0.00273,0.0,0.028858,0.008757,0.015866,0.028819,0.0,0.003097,0.0,0.001806,0.009745,0.00605,0.000928
2023-01-26,0.87058,0.768596,0.639677,0.63756,0.300651,0.306547,0.0,0.0,0.200485,0.005575,0.028691,4.2e-05,0.0,0.0,0.011991,0.305759,0.0,0.0,0.019718,0.009066,0.0,0.020702,0.005153,0.0,0.004399,0.012587,0.012938,0.008882,0.0,0.005972,0.0,0.003479,0.0,0.004369,0.0
2023-01-27,0.871881,0.771781,0.648342,0.643202,0.209749,0.210393,5e-05,0.0,0.100147,0.0,0.036253,0.0,0.0,0.0,0.03655,0.177148,0.0,0.0,0.016302,0.001641,0.0,0.033853,0.007488,0.0,0.00717,0.024795,0.012019,0.00718,0.008023,0.0,0.002252,0.002466,0.0,0.00198,0.006539
2023-01-30,0.860031,0.755186,0.630275,0.625915,0.448162,0.57584,0.0,3.8e-05,0.063835,0.0,0.033161,0.004541,0.000379,0.0,0.018193,0.25885,0.0,0.0,0.018314,0.005288,0.0,0.094105,0.006956,4e-06,0.007009,0.103975,0.016783,0.018751,0.007512,0.00227,0.02083,0.004558,0.069552,0.004363,0.002362
2023-01-31,0.876784,0.773679,0.645396,0.656959,0.177861,0.134268,0.001227,0.0,0.098042,0.0,0.047432,0.0,0.004775,0.0,0.016196,0.250276,0.0,0.0,0.015573,0.00031,0.0,0.033493,0.005971,0.0,0.003709,0.035104,0.013689,0.005601,0.000203,0.001552,0.0,0.000162,0.044295,0.004937,0.005519


In [11]:
path_twitter_and_index_fund = f'../Sentiment_Analysis/Stock_Market/data/merge/combined'
if not os.path.exists(path_twitter_and_index_fund):
    os.makedirs(path_twitter_and_index_fund)
df_merge.to_csv(path_twitter_and_index_fund +'/index_funds_and_twitter_analysts.csv') # Export to csv

In [12]:
path_twitter_and_index_fund = f'../Sentiment_Analysis/Stock_Market/data/merge/combined'
df_merge = pd.read_csv(path_twitter_and_index_fund +'/index_funds_and_twitter_analysts.csv').set_index('date')
df_merge.head()

Unnamed: 0_level_0,DOW_JONES,SandP_500,NASDAQ,RUSSEL,favorite_count,retweet_count,BillGates_twitter,DanZanger_twitter,DipFinding_twitter,EconguyRosie_twitter,GRDecter_twitter,HindenburgRes_twitter,JeffBezos_twitter,JehoshaphatRsch_twitter,KennethLFisher_twitter,MITickWatcher_twitter,MacroCharts_twitter,MarketMagnifier_twitter,OptionsProVol_twitter,RayDalio_twitter,ResearchGrizzly_twitter,TwentyonTwenty__twitter,WatcherGuru_twitter,andrewrsorkin_twitter,biancoresearch_twitter,bsc_daily_twitter,eWhispers_twitter,jimcramer_twitter,lei_zhang_lz_twitter,markminervini_twitter,muddywatersre_twitter,prrobbins_twitter,realwillmeade_twitter,script_crypto_twitter,techbudsolution_twitter
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1
2013-07-03,0.009619,0.0,0.0,0.025124,0.00031,0.006292,0.001897,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2013-07-09,0.023785,0.011603,0.004803,0.043203,0.00065,0.004966,0.000507,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2013-07-10,0.02339,0.011697,0.006112,0.044795,0.002203,0.017772,0.001679,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2013-07-15,0.032136,0.02109,0.012987,0.06016,0.000688,0.009351,0.007076,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2013-07-16,0.030664,0.019128,0.012275,0.057105,0.000354,0.00435,0.002902,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Prediction of Today 
* ( Between 0 & 1)

In [57]:
# Todays Data
todays_test = download_todays_test(ticker_df, df_wide, df_merge_original)
Xnew = sm.add_constant(todays_test, has_constant='add')

model = {} # Model Build For Each index fund
print(f"date: { todays_test.index.date.max() }")
output = pd.DataFrame(columns=['index', 'prediction'])
for t in ticker_df.ticker_label:
    data_with_target = create_target(df_merge.copy(), day = 5, ticker = t)
    m = linear_model(data_with_target,split=0.20,summary = False)
    y_pred = m['lm'].predict(Xnew)
    model[t] = (y_pred, m)
    output = pd.concat([output, pd.DataFrame.from_records([(t, y_pred[0])], columns=['index', 'prediction'])])
    
display(output)
    

date: 2023-01-31


Unnamed: 0,index,prediction
0,SandP_500,0.666998
0,NASDAQ,0.51818
0,RUSSEL,0.706117
0,DOW_JONES,0.650239
