In [None]:
# initial imports
import pandas as pd 
import pandas_datareader.data as web
from pandas import Series, DataFrame
from wordcloud import WordCloud
import numpy as np
import datetime as dt
import re

import matplotlib.pyplot as plt
%matplotlib inline

from warnings import filterwarnings
filterwarnings("ignore")

>Importing daily historical stock data with `pandas_datareader.data` (imported as `web`)
>[pandas-datareader](https://pandas-datareader.readthedocs.io/en/latest/remote_data.html)

In [None]:
# Download the S&P 500 index quotes through pandas-datareader. With this reader
# there is no need to convert dates to datetime objects or to pivot the result.
# To slice (subset) a different window, change the two dates below.
start, end = dt.datetime(2017, 1, 1), dt.datetime(2019, 1, 1)

snp_quotes = web.DataReader('^GSPC', 'yahoo', start, end)
SNP = snp_quotes.sort_index(ascending=False)  # index sorted newest-first

###### S&P 500

In [None]:
# We'll use the S&P 500 Index
# Display the row index of the downloaded frame
SNP.index

In [None]:
# Only the closing price of each trading day is of interest; discard the rest.
unused_columns = ['High', 'Low', 'Open', 'Adj Close', 'Volume']
SNP = SNP.drop(columns=unused_columns)

In [None]:
# This data has daily frequency. The frame was sorted in descending date order
# when it was loaded, so head() shows the most recent rows and tail() the oldest.
for preview in (SNP.head(), SNP.tail()):
    print(preview)

In [None]:
# Print a concise summary of the price frame
SNP.info()

In [None]:
# (rows, columns) of the frame; the row count follows from the start and end
# parameters chosen when the data was imported
SNP.shape

In [None]:
# annual number of trading days limited (of course)
# Render the reference image stored next to the notebook
from IPython.display import Image
Image(filename='images/trading_day.jpg')

In [None]:
# Summary statistics of the closing price. describe has to be *called*:
# the bare attribute only displays the bound method's repr, not the statistics.
SNP.describe()

In [None]:
# Count missing values per column (none expected for the daily closing prices)
SNP.isnull().sum()

In [None]:
# Line plot of the closing price
SNP.plot.line(figsize=(16, 6));

In [None]:
# Dot plot: every observation drawn as a blue point.
# Looks very similar to the line plot -- no outliers, just the data points.
dot_style = '.b'
SNP.plot(style=dot_style, figsize=(20, 6));

In [None]:
# Use pandas Grouper to group the closing prices by calendar year.
# Sort chronologically first: SNP is stored newest-first, and without the sort
# every yearly series (and every plot built from it) would run backwards in time.
year_groups = SNP['Close'].sort_index().groupby(pd.Grouper(freq='A'))

# Store each year's values in its own column. Building the frame from Series
# (instead of assigning raw arrays column by column) pads shorter years with
# NaN rather than raising when two years have a different number of trading days.
SNP_annual = pd.DataFrame(
    {yr.year: pd.Series(group.values) for yr, group in year_groups}
)

# Plot the yearly groups as subplots
SNP_annual.plot(figsize=(13, 8), subplots=True, legend=True);

In [None]:
# Histogram of the closing prices, 7 bins
SNP.hist(bins=7, figsize=(10, 6))

In [None]:
# Density plot (kernel density estimate) of the closing prices
SNP.plot.kde(figsize=(15, 10))

In [None]:
# One box per year column of SNP_annual.
# Box plots are good for spotting outliers, not that there are any in this data.
SNP_annual.boxplot(figsize = (12,7))

In [None]:
# Time series heat map: transpose so that each row of the matrix is one year
year_matrix = SNP_annual.transpose()
plt.matshow(
    year_matrix,
    cmap=plt.cm.Spectral_r,
    aspect='auto',
    interpolation=None,
)

In [None]:
##### Stationarity


##### Modeling

##### Presidential Tweets

In [None]:
import pandas as pd
from pandas import DataFrame
import json

# Parse each year's tweet archive. The context managers close the files as soon
# as they have been read (the bare open() calls left both handles open), and the
# encoding is given explicitly so the result does not depend on the platform default.
with open('data/condensed_2017.json', encoding='utf-8') as tweets_2017_file:
    tweet17 = json.load(tweets_2017_file)  # 2017 tweets

with open('data/condensed_2018.json', encoding='utf-8') as tweets_2018_file:
    tweet18 = json.load(tweets_2018_file)  # 2018 tweets

##### 2017 Tweets

In [None]:
# Number of records parsed from the 2017 archive
print(len(tweet17)) # 2605 tweets

In [None]:
# Turn the list parsed from JSON into a pandas DataFrame
Tweets_17_df = pd.DataFrame(data=tweet17)

In [None]:
# columns check: list the fields available for every tweet
Tweets_17_df.columns

>May end up dropping `source` as well and treating every tweet as presidential, not
>just the ones emanating from the iPhone. The text will have to be cleaned up and `created_at`
>used as a datetime index; `is_retweet` can be used to filter out retweets later.

In [None]:
# Discard the columns that play no part in the analysis
unneeded_columns = ['id_str', 'retweet_count', 'in_reply_to_user_id_str',
                    'favorite_count', 'source']
Tweets_17_df = Tweets_17_df.drop(columns=unneeded_columns)

In [None]:
# Keep only the rows that are not flagged as a retweet (RT)
not_retweet = Tweets_17_df['is_retweet'].ne(True)
Tweets_17_df = Tweets_17_df[not_retweet]

In [None]:
# The retweet flag has done its job; remove the column
Tweets_17_df = Tweets_17_df.drop(columns='is_retweet')

In [None]:
# Rename the created_at column to Date
column_names = {'created_at': 'Date'}
Tweets_17_df = Tweets_17_df.rename(columns=column_names)

In [None]:
# Parse the creation timestamps, then keep only the calendar date
# (the time-of-day part of the timestamp is dropped)
parsed_timestamps = pd.to_datetime(Tweets_17_df['Date'])
Tweets_17_df['Date'] = parsed_timestamps.dt.date

In [None]:
# Use the tweet date as the row index
Tweets_17_df = Tweets_17_df.set_index('Date')

In [None]:
# Print a concise summary of the cleaned 2017 frame
Tweets_17_df.info()

In [None]:
# tweets are already in descending order
Tweets_17_df.head()

##### 2018 Tweets

In [None]:
# Number of records parsed from the 2018 archive
print(len(tweet18)) # 3510 tweets

In [None]:
# Turn the 2018 tweet list parsed from JSON into a pandas DataFrame
Tweets_18_df = pd.DataFrame(data=tweet18)

In [None]:
# Discard the columns that play no part in the analysis
unneeded_columns = ['id_str', 'retweet_count', 'in_reply_to_user_id_str',
                    'favorite_count', 'source']
Tweets_18_df = Tweets_18_df.drop(columns=unneeded_columns)

In [None]:
# Keep only the rows that are not flagged as a retweet (RT)
not_retweet = Tweets_18_df['is_retweet'].ne(True)
Tweets_18_df = Tweets_18_df[not_retweet]

In [None]:
# The retweet flag has done its job; remove the column
Tweets_18_df = Tweets_18_df.drop(columns='is_retweet')

In [None]:
# Rename the created_at column to Date
column_names = {'created_at': 'Date'}
Tweets_18_df = Tweets_18_df.rename(columns=column_names)

In [None]:
# Parse the creation timestamps, then keep only the calendar date
# (the time-of-day part of the timestamp is dropped)
parsed_timestamps = pd.to_datetime(Tweets_18_df['Date'])
Tweets_18_df['Date'] = parsed_timestamps.dt.date

In [None]:
# Use the tweet date as the row index
Tweets_18_df = Tweets_18_df.set_index('Date')

In [None]:
# Print a concise summary of the cleaned 2018 frame
Tweets_18_df.info()

In [None]:
# tweets are already in descending order
Tweets_18_df.head()

##### Merged Tweets

In [None]:
# Stack the 2017 and 2018 frames row-wise. Both went through the same cleanup
# steps, so the outer join performed by concat should add no extra columns
# and no NaN values.
yearly_frames = [Tweets_17_df, Tweets_18_df]
Tweets = pd.concat(yearly_frames)

In [None]:
# Total number of rows after stacking both years
len(Tweets)

In [None]:
# Preview the first rows of the combined frame
Tweets.head()

##### Tweets text cleanup, aisle 7

In [None]:
# have an idea for an interactive graph so I'll perform some text cleanup here
def txtClean(text):
    """Strip Twitter markup from a tweet and title-case what is left.

    Removes @mentions, '#' characters (the hashtag word itself is kept) and
    http/https links, converts the result to title case and strips leading
    whitespace.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text.
    """
    # Letters, all ten digits and the underscore. The previous class was
    # written with an en dash ('0–9') instead of a hyphen, so it matched only
    # the digits 0 and 9 (plus the dash itself) and left fragments of handles
    # such as '@user123' or '@some_name' behind in the text.
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)
    text = re.sub(r'#', '', text)
    # Raw string: '\S' inside a plain literal is an invalid escape sequence.
    text = re.sub(r'https?://\S+', '', text)
    text = text.title()  # for graphing time permitting
    # Removing a leading mention or link leaves leading spaces behind.
    return text.lstrip()

# Run the cleanup over every tweet and overwrite the text column
Tweets['text'] = Tweets['text'].map(txtClean)

In [None]:
# Check the first two rows after the text cleanup
Tweets.head(2)

>Tweets are supposed to be limited to 140 characters, but many of these tweets are well over 140 characters.
>This is probably not a factor in what I'm attempting to achieve in this notebook, and it could actually aid
>the sentiment analysis by providing another feature to use in a supervised model.

In [None]:
# Character count of every cleaned tweet
Tweets['length'] = Tweets['text'].str.len()

In [None]:
# Preview the tweets that are longer than 140 characters
over_limit = Tweets['length'] > 140
Tweets[over_limit].head()

>TextBlob is the bomb! It is a knowledge-based (bag-of-words) NLP system created by linguists. I will use the library to streamline the process. 
>**TextBlob: Simplified Text Processing** [textblob](https://textblob.readthedocs.io/en/dev/index.html)

##### Sentiment analysis

In [None]:
import nltk
from textblob import TextBlob

In [None]:
# Example tweet, already lower-cased and stripped of punctuation.
# The text is kept as one long literal; it doesn't like to be reduced to 80 columns.
example_text = 'the democrats have been told and fully understand that there can be no daca without the desperately needed wall at the southern border and an end to the horrible chain migration ridiculous lottery system of immigration etc we must protect our country at all cost'
tweet_example = TextBlob(example_text)

In [None]:
# TextBlob seems to have processed the text enough to obtain sentiment scores
# without additional steps; show the tags it assigns to the example tweet
tweet_example.tags

In [None]:
# Show the words TextBlob extracts from the example tweet
tweet_example.words

This is how I will determine whether a tweet is positive or negative.
With multiple tweets in a given day, I may tally the sentiment among them and
take the average as `daily_presidential_sentiment`.  

In [None]:
# Polarity is on a scale from -1 (neg) to 1 (pos).
# The expression has to be the last line of the cell so that the computed value
# is displayed; the pasted number that used to follow it made the cell always
# show that constant instead. A previous run returned -0.504166666666667.
tweet_example.sentiment.polarity

>**polarity - how positive or negative a word is -1 very neg, +1 very pos** <br> 
>**subjectivity - how opinionated a word is 0 fact, +1 very much an opinion**

In [None]:
# TextBlob test:
# the scores are identical when the text is lower-cased and all punctuation is removed.
plain_text = 'the democrats have been told and fully understand that there can be no daca without the desperately needed wall at the southern border and an end to the horrible chain migration ridiculous lottery system of immigration etc we must protect our country at all cost'
TextBlob(plain_text).sentiment

###### Sentiment | Polarity 2017 tweets

In [None]:
# Sentiment analysis on every cleaned tweet of the combined frame
def polarity(x):
    """Return the TextBlob polarity score of the text ``x``."""
    return TextBlob(x).sentiment.polarity


def subjectivity(x):
    """Return the TextBlob subjectivity score of the text ``x``."""
    return TextBlob(x).sentiment.subjectivity


Tweets['polarity'] = Tweets['text'].map(polarity)
Tweets['subjectivity'] = Tweets['text'].map(subjectivity)

In [None]:
# Confirm that the polarity and subjectivity columns were added
Tweets.columns

In [None]:
# Several tweets can share one date: average their scores per day.
# The columns are selected with a list -- indexing a GroupBy with a bare tuple
# of names was deprecated in pandas 1.0 and raises an error in pandas 2.x.
Tweet_analysis = Tweets.groupby('Date')[['polarity', 'subjectivity']].mean()

In [None]:
# Preview the per-day mean sentiment scores
Tweet_analysis.head()

In [None]:
# (number of distinct tweet dates, number of score columns)
Tweet_analysis.shape

In [None]:
# Merge the daily sentiment scores onto the S&P 500 closing prices; the right
# join keeps every row of SNP.
# The tweet dates were reduced to plain datetime.date objects during cleanup,
# while SNP is presumably indexed by a DatetimeIndex (as returned by the data
# reader -- confirm). Convert the sentiment index to datetimes so both sides of
# the index join use the same key type instead of failing or matching nothing.
daily_sentiment = Tweet_analysis.copy()
daily_sentiment.index = pd.to_datetime(daily_sentiment.index)

analysis_SNP_df = daily_sentiment.merge(SNP, how='right', left_index=True, right_index=True)
analysis_SNP_df.head()

In [None]:
# Discard every row that contains a missing value
analysis_SNP_df = analysis_SNP_df.dropna(axis=0)

In [None]:
# Count the missing values per column
analysis_SNP_df.isnull().sum()

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Scale the Close column with StandardScaler
scaler = StandardScaler()
close_2d = analysis_SNP_df[['Close']].values  # single column as a 2-D array
scaled_close = scaler.fit_transform(close_2d)
scaled_close

In [None]:
# Plot the scaled closing prices against their row position
plt.plot(scaled_close)

##### Modeling

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import BaggingClassifier, RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.pipeline import Pipeline
from sklearn import preprocessing, utils

In [None]:
# Separate the outcome (Close) from the predictor variables (everything else)
data = analysis_SNP_df.drop(columns='Close')
target = analysis_SNP_df['Close']

In [None]:
# Train/test split: 20% of the rows are held out, with a fixed random_state
split_parts = train_test_split(data, target, test_size=0.20, random_state=42)
data_train, data_test, target_train, target_test = split_parts

In [None]:
# ValueError: Unknown label type: 'continuous'
# this is continous not classification 
# forest = RandomForestClassifier(n_estimators=100, max_depth= 5)
# forest.fit(data_train, target_train)

In [None]:
# forest = RandomForestRegressor()
# forest.fit(data_train, target_train)

In [None]:
# instantiate RandomForestRegressor with its default settings
rf_Reg = RandomForestRegressor()
# Disabled cross-validation check, kept for reference:
# mean_rf_reg_score = cross_val_score(rf_Reg, data_train, target_train, cv=3)
# print(f"Mean Cross Validation Score for Random Forest Classifier: {mean_rf_reg_score :.2%}") 

In [None]:
# Hyper-parameter grid for the random forest regressor.
# GridSearchCV requires every value to be a sequence of candidate settings, so
# the fixed settings (random_state, n_jobs) are wrapped in one-element lists;
# bare scalars make the search reject the grid.
rf_param_grid = {
    'n_estimators': [10, 30, 50, 100],  # trees in the forest
    # mean abs error, mean square error
    # NOTE(review): these names (and max_features='auto' below) are only
    # accepted by older scikit-learn releases -- adjust to the installed version
    'criterion': ['mae', 'mse'],
    'max_depth': [None],  # None: nodes are expanded until all leaves are pure ...
    'random_state': [42],
    'max_features': ['auto', 'sqrt', 'log2'],
    'n_jobs': [-1],  # utilize all processors
    'max_leaf_nodes': [None, 5, 10, 15, 25],
}

In [None]:
# instantiate GridSearchCV
# gs_CV = GridSearchCV(rf_Reg, rf_param_grid)

In [None]:
# Exhaustive search over rf_param_grid with 3-fold cross-validation
rf_grid_search = GridSearchCV(rf_Reg, rf_param_grid, cv=3)
rf_grid_search.fit(data_train, target_train)

# best_score_ is the mean cross-validated score of the best candidate. With a
# regressor and no explicit `scoring`, that is R^2 -- not a classification
# accuracy -- and it can be negative, so it is not printed as a percentage.
print(f"Best Cross-Validated R^2: {rf_grid_search.best_score_:.3f}")
print("")
print(f"Optimal Parameters: {rf_grid_search.best_params_}")

##### Tweet Word Cloud

In [None]:
# Join every cleaned tweet into one string and render it as a word cloud
words = ' '.join(Tweets['text'])
wordCloud = WordCloud(
    width=800,
    height=600,
    random_state=21,
    max_font_size=120,
).generate(words)

plt.imshow(wordCloud, interpolation='bilinear')
plt.axis('off')
plt.show()