In [None]:
# initial imports
import pandas as pd 
import pandas_datareader.data as web
from pandas import Series, DataFrame
from wordcloud import WordCloud
import numpy as np
import datetime as dt
import re

import matplotlib.pyplot as plt
%matplotlib inline

from warnings import filterwarnings
filterwarnings("ignore")

<img src="images/new_osemn.png" width=600>

Real-time stock data is imported via `pandas_datareader.data` (aliased as `web`), which offers numerous features. The frequency of this data is daily by default, and the date is automatically a pandas datetime object that is already placed on the x axis, so there's no need to perform a pivot.
[pandas-datareader](https://pandas-datareader.readthedocs.io/en/latest/remote_data.html)

### Obtain:

In [None]:
# import data via web with this implementation there's no need to change date to datetime obj or perform a pivot  
# slicing(subsetting) just change the start and end parameters below:
start = dt.datetime(2017, 1, 1)
end = dt.datetime(2019, 1, 1)

SNP = web.DataReader('^GSPC', 'yahoo', start, end).sort_index(ascending=False) # index

In [None]:
import pandas as pd
from pandas import DataFrame
import json

# Load the condensed tweet archives. The context managers close each file
# as soon as its JSON has been parsed (the original handles were never closed).
with open('data/condensed_2017.json', encoding='utf-8') as tweet:
    tweet17 = json.load(tweet) # 2017 tweets

with open('data/condensed_2018.json', encoding='utf-8') as tweets:
    tweet18 = json.load(tweets) # 2018 tweets

### Scrub:
###### S&P 500

In [None]:
SNP.head()

In [None]:
# We'll use the S&P 500 Index
SNP.index

In [None]:
# only interested in the Closing price of the trading day
# (select the column instead of dropping the others in place, so re-running
# this cell does not raise a KeyError for already-removed columns)
SNP = SNP[['Close']]

In [None]:
# tail is the most current data
# this data is daily frequency
print(SNP.head())
print(SNP.tail())

In [None]:
SNP.info()

In [None]:
# initial shape based on the start and end parameters of the imports
SNP.shape

In [None]:
# annual number of trading days limited (of course)
from IPython.display import Image
Image(filename='images/trading_day.jpg')

In [None]:
# summary statistics; describe must be *called*, the bare attribute only
# displays the bound method object
SNP.describe()

In [None]:
# no missing values, daily returns
SNP.isna().sum()

In [None]:
# line plot
SNP.plot(figsize = (16,6));

In [None]:
# dot plot
# this looks good very similiar to the line plot no outliers just pure data points
SNP.plot(figsize=(20,6), style= '.b');

In [None]:
# Use pandas grouper to group values using annual frequency
year_groups = SNP['Close'].groupby(pd.Grouper(freq ='A'))

# Create a new DataFrame and store yearly values in columns.
# Each year's values are wrapped in a positionally indexed Series so that years
# with a different number of trading days are NaN-padded instead of raising a
# length-mismatch ValueError on column assignment.
SNP_annual = pd.DataFrame({yr.year: pd.Series(group.values) for yr, group in year_groups})

# Plot the yearly groups as subplots
SNP_annual.plot(figsize= (13,8), subplots= True, legend=True);

In [None]:
# histogram
SNP.hist(figsize = (10,6), bins= 7)

In [None]:
# density plot
SNP.plot(kind='kde', figsize = (15,10))

In [None]:
# good for spotting outliers, not that there are any in this data
SNP_annual.boxplot(figsize = (12,7))

In [None]:
# time series heat map
year_matrix = SNP_annual.T
plt.matshow(year_matrix, interpolation=None, aspect='auto', cmap=plt.cm.Spectral_r)

##### Stationarity
The stationarity check was performed much later in this process and the numbers were horrendous. After gathering some domain knowledge on time series data, namely on predicting stock prices, I learned that this type of data is more susceptible to stationarity problems than to any trending or seasonality issues.

In [None]:
# Create a function to check for the stationarity of a given time series using rolling stats and DF test
# Collect and package the code from previous labs
def stationarity_check(TS, column='Close', window=8):
    """Plot rolling statistics and print Dickey-Fuller test results for a time series.

    Parameters
    ----------
    TS : pandas.DataFrame or pandas.Series
        Time series to inspect. For a DataFrame, the test is run on ``column``;
        a Series is tested directly.
    column : str, default 'Close'
        Column passed to ``adfuller`` when ``TS`` is a DataFrame.
    window : int, default 8
        Number of observations used for the rolling mean and rolling std.

    Returns
    -------
    None
        The function only plots and prints.
    """
    # Import adfuller
    from statsmodels.tsa.stattools import adfuller

    # adfuller expects a 1-D series; pick the column only when given a frame
    series = TS[column] if isinstance(TS, pd.DataFrame) else TS

    # Calculate rolling statistics
    roll_mean = TS.rolling(window=window, center=False).mean()
    roll_std = TS.rolling(window=window, center=False).std()

    # Perform the Dickey Fuller Test
    dftest = adfuller(series)

    # Plot rolling statistics:
    fig = plt.figure(figsize=(12,6))
    plt.plot(TS, color='blue',label='Original')
    plt.plot(roll_mean, color='red', label='Rolling Mean')
    plt.plot(roll_std, color='black', label = 'Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show(block=False)

    # Print Dickey-Fuller test results
    print('Results of Dickey-Fuller Test: \n')
    # the first four entries of the adfuller result are labelled below;
    # entry 4 is the mapping of critical values
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic', 'p-value',
                                             '#Lags Used', 'Number of Observations Used'])
    for key,value in dftest[4].items():
        dfoutput['Critical Value (%s)'%key] = value
    print(dfoutput)

    return None

In [None]:
stationarity_check(SNP)

In [None]:
# Plot a log transform
ts_log = np.log(SNP)
fig = plt.figure(figsize=(12,6))
plt.plot(ts_log, color='blue');

In [None]:
# Plot a square root transform
ts_sqrt = np.sqrt(SNP)
fig = plt.figure(figsize=(12,6))
plt.plot(ts_sqrt, color='blue');

In [None]:
# subtracting the rolling mean
roll_mean = np.log(SNP).rolling(window=7).mean()
fig = plt.figure(figsize=(11,7))
plt.plot(np.log(SNP), color='blue', label='Original')
plt.plot(roll_mean, color='red', label='Rolling Mean')
plt.legend(loc='best')
plt.title('Log Transformed Data')
plt.show(block=False) 

In [None]:
# Subtract the moving average from the log transformed data
data_minus_roll_mean = np.log(SNP) - roll_mean
# Print the first 10 rows
data_minus_roll_mean.head(10) 

In [None]:
# Drop the missing values
data_minus_roll_mean.dropna(inplace=True)

In [None]:
fig = plt.figure(figsize=(11,7))
plt.plot(data_minus_roll_mean, color='blue',label='Close - rolling mean')
plt.legend(loc='best')
plt.show(block=False)

In [None]:
stationarity_check(data_minus_roll_mean)

In [None]:
data_minus_roll_mean.head()

In [None]:
SNP.head()

#### Presidential Tweets
##### 2017 Tweets
Here I've gathered President Trump's tweet history, which encompasses the month he took office (January 20, 2017), in the form of a couple of JSON files. 

In [None]:
print(len(tweet17)) # 2605 tweets

In [None]:
# json list to pandas Dataframe obj
Tweets_17_df = DataFrame(tweet17)

In [None]:
# columns check
Tweets_17_df.columns

May end up dropping source as well and just treating every tweet as presidential, not 
just the ones emanating from the iPhone. Will have to clean up the text and use created_at
as a datetime index obj; is_retweet could be used to filter out retweets later.

In [None]:
# dropping unneeded columns
Tweets_17_df.drop(['id_str', 'retweet_count', 
                   'in_reply_to_user_id_str', 'favorite_count', 'source'], axis= 1, inplace= True)

In [None]:
# remove retweets(RT)
# .copy() makes the filtered frame independent of the original, so the later
# in-place drops and column assignments don't trigger SettingWithCopyWarning
Tweets_17_df = Tweets_17_df[Tweets_17_df.is_retweet != True].copy()

In [None]:
# dropping is_retweet column
Tweets_17_df.drop(['is_retweet'], axis= 1, inplace= True)

In [None]:
# change col created_at to Date
Tweets_17_df.rename(columns={'created_at':'Date'}, inplace=True)

In [None]:
# changing created_at to datetime index obj
# dropping the timestamp
Tweets_17_df['Date'] = pd.to_datetime(Tweets_17_df['Date']).dt.date

In [None]:
Tweets_17_df.set_index('Date', inplace=True)

In [None]:
Tweets_17_df.info()

In [None]:
# tweets are already in decending order
Tweets_17_df.head()

##### 2018 Tweets

In [None]:
print(len(tweet18)) # 3510 tweets

In [None]:
# 2018 tweets to DataFrame obj
Tweets_18_df = DataFrame(tweet18)

In [None]:
# dropping unneeded columns
Tweets_18_df.drop(['id_str', 'retweet_count', 
                   'in_reply_to_user_id_str', 'favorite_count', 'source'], axis= 1, inplace= True)

In [None]:
# remove retweets(RT)
# .copy() makes the filtered frame independent of the original, so the later
# in-place drops and column assignments don't trigger SettingWithCopyWarning
Tweets_18_df = Tweets_18_df[Tweets_18_df.is_retweet != True].copy()

In [None]:
# dropping is_retweet column
Tweets_18_df.drop(['is_retweet'], axis= 1, inplace= True)

In [None]:
# change col created_at to Date
Tweets_18_df.rename(columns={'created_at':'Date'}, inplace=True)

In [None]:
# changing created_at to datetime index obj
# dropping the timestamp
Tweets_18_df['Date'] = pd.to_datetime(Tweets_18_df['Date']).dt.date

In [None]:
Tweets_18_df.set_index('Date', inplace=True)

In [None]:
Tweets_18_df.info()

In [None]:
# tweets are already in decending order
Tweets_18_df.head()

### Explore:

##### Merged Tweets

In [None]:
# stacking Tweets no NaN values and other columns being created with outer merge
Tweets = pd.concat([Tweets_17_df, Tweets_18_df], axis= 0)

In [None]:
len(Tweets)

In [None]:
Tweets.head()

##### Tweets text cleanup aisle 7

In [None]:
# have an idea for an interactive graph so I'll perform some text cleanup here
def txtClean(text):
    """Strip Twitter markup from a tweet and title-case what remains.

    Removes @mentions, '#' characters (the hashtag word itself is kept) and
    http/https links, then title-cases the text and strips leading whitespace.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned tweet; trailing whitespace is not removed.
    """
    # The character class previously contained an en dash ("0–9"), which matched
    # only '0', '–' and '9'; handles with other digits or underscores were cut short.
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)
    text = re.sub(r'#', '', text)
    # raw string: '\/' and '\S' are invalid escapes in a normal string literal
    text = re.sub(r'https?://\S+', '', text)
    text = text.title() # for graphing time permitting
    text = text.lstrip() # drop the leading space left behind by a removed mention

    return text

Tweets['text'] = Tweets['text'].apply(txtClean)

In [None]:
Tweets.head(2)

In [None]:
Twts = Tweets_18_df.head(25)

Tweets are supposed to be limited to 140 chars, but many of these tweets are way over 140 chars.
This is probably not a factor in what I'm attempting to achieve in this notebook, and it could actually aid
in sentiment analysis and in the creation of additional features to use in a supervised model. 

In [None]:
# character count of each tweet's text
Tweets['length'] = list(map(len, Tweets['text']))

In [None]:
Tweets[Tweets.length > 140].head()

"TextBlob is a Python (2 and 3) library for processing textual data. It provides a simple API for diving into common natural language processing (NLP) tasks such as part-of-speech tagging, noun phrase extraction, sentiment analysis, classification, translation, and more." [TextBlob](https://textblob.readthedocs.io/en/dev/index.html)

##### Sentiment analysis


In [None]:
import nltk
from textblob import TextBlob

In [None]:
# this line doesn't like to be reduced to 80 length for some reason
tweet_example = TextBlob('the democrats have been told and fully understand that there can be no daca without the desperately needed wall at the southern border and an end to the horrible chain migration ridiculous lottery system of immigration etc we must protect our country at all cost')

In [None]:
# TextBlob seems to have manipulated the text enough to just obtain sentiment scores without additional steps
tweet_example.tags

In [None]:
tweet_example.words

This is how I will determine if a tweet is positive or negative:
with multiple tweets in a given day, I tally the sentiment among them together and just
take the average, much like a daily presidential sentiment.  

In [None]:
# on a scale of 1(pos) and -1(neg)
# previously recorded result for this example: -0.504166666666667
# (kept as a comment so the cell displays the computed value, not a pasted literal)
tweet_example.sentiment.polarity

polarity - how positive or negative a word is -1 very neg, +1 very pos <br> 
subjectivity - how opinionated a word is 0 fact, +1 very much an opinion

In [None]:
# TextBlob test
# values are identical when lowercase and all punctuations removed.
TextBlob('the democrats have been told and fully understand that there can be no daca without the desperately needed wall at the southern border and an end to the horrible chain migration ridiculous lottery system of immigration etc we must protect our country at all cost').sentiment

###### Sentiment | Polarity 2017 tweets

In [None]:
# sentiment analysis on the merged 2017 + 2018 tweets
def polarity(x):
    """Return the TextBlob polarity score of the text x."""
    return TextBlob(x).sentiment.polarity


def subjectivity(x):
    """Return the TextBlob subjectivity score of the text x."""
    return TextBlob(x).sentiment.subjectivity


# Score every tweet once and reuse the result for both columns; applying the
# two helpers separately would build a TextBlob for each tweet twice.
sentiments = [TextBlob(text).sentiment for text in Tweets['text']]

Tweets['polarity'] = [s.polarity for s in sentiments]
Tweets['subjectivity'] = [s.subjectivity for s in sentiments]

In [None]:
Tweets.columns

In [None]:
# dealing with multiple tweets in a single date: average the scores per day
# (columns are selected with a list; selecting several groupby columns with a
# bare tuple of keys is deprecated and rejected by newer pandas releases)
Tweet_analysis = Tweets.groupby('Date')[['polarity', 'subjectivity']].mean()

In [None]:
Tweet_analysis.head()

In [None]:
Tweet_analysis.shape

In [None]:
# merging sentiment data with SNP data
analysis_SNP_df = Tweet_analysis.merge(data_minus_roll_mean, how='right', left_index= True, right_index=True)
analysis_SNP_df.head()

In [None]:
analysis_SNP_df.dropna(axis= 0, inplace= True)

In [None]:
analysis_SNP_df.isna().sum()

### Model:
##### Modeling

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing, utils
from sklearn.metrics import accuracy_score, mean_squared_error 
from sklearn.preprocessing import StandardScaler
from math import sqrt
import math

In [None]:
analysis_SNP_df.head()

In [None]:
# A StandardScaler is instantiated here, but the fit_transform call below is
# commented out, so neither the features nor the target are scaled by this cell.
scaler = StandardScaler()
# analysis_SNP_df = scaler.fit_transform(analysis_SNP_df)

In [None]:
analysis_SNP_df = pd.DataFrame(analysis_SNP_df)

In [None]:
analysis_SNP_df.head()

In [None]:
# Split the outcome and predictor variables
y = analysis_SNP_df['Close']
X = analysis_SNP_df.drop('Close', axis=1) 
X.head()

In [None]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.20, random_state=42) 

##### RandomForestRegressor

In [None]:
# Instantiate and fit a RandomForestRegressor
# utilizing hyperparameters from exhaustive GridSearchCV
# criterion is left at its default (squared error, which the grid search
# selected as 'mse'): the 'mse' spelling was renamed to 'squared_error' and
# later removed from scikit-learn, so omitting it works on old and new versions.
# random_state pins the bootstrap/feature sampling so reruns give the same forest.
rfr = RandomForestRegressor(max_depth = 5,
                            max_features = 'log2',
                            n_estimators = 100,
                            n_jobs = -1,
                            random_state = 42)
rfr.fit(X_train, y_train)

In [None]:
# Feature importance
rfr.feature_importances_

In [None]:
# subjectivity seems like a slightly better feature than polarity
def plot_feature_importances(model, feature_names=None):
    """Draw a horizontal bar chart of a fitted model's feature importances.

    Parameters
    ----------
    model : fitted estimator
        Any object exposing ``feature_importances_``.
    feature_names : sequence of str, optional
        Labels for the bars, in the same order as the importances. When
        omitted, the column names of the notebook-level ``X_train`` are used.
    """
    if feature_names is None:
        # fall back to the training frame defined earlier in the notebook
        feature_names = X_train.columns.values

    importances = model.feature_importances_
    n_features = len(importances)

    plt.figure(figsize=(8,8))
    plt.barh(range(n_features), importances, align='center')
    plt.yticks(np.arange(n_features), feature_names)
    plt.xlabel('Feature importance')
    plt.ylabel('Feature')
    
    
plot_feature_importances(rfr)

In [None]:
# Use the forest's predict method on the test data
predictions = rfr.predict(X_test)

In [None]:
# Calculate the absolute errors
errors = abs(predictions - y_test)

# Print out the mean absolute error (mae)
# The target is the log close minus its rolling mean, so the error has no
# unit (the old 'degrees.' suffix was a leftover); four decimals are shown
# because two would hide most of a value on this scale.
print('Average model error:', round(np.mean(errors), 4))

In [None]:
# Calculate mean absolute percentage error (MAPE)
# The target (log close minus rolling mean) can be negative or zero, so divide
# by its absolute value and skip exact zeros; dividing by the signed value lets
# negative percentages cancel positive ones and zeros produce inf.
nonzero = y_test != 0
mape = 100 * (errors[nonzero] / y_test[nonzero].abs())
# Calculate and display accuracy
# NOTE(review): percentage error is unstable for targets this close to zero;
# treat this figure with caution alongside the MAE / RMSE.
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

In [None]:
# root mean squared error
rsme = math.sqrt(mean_squared_error(y_test, predictions))
rsme

##### Hyperparameter tuning

In [None]:
# # GridSearchCV
# gsc = GridSearchCV(estimator = RandomForestRegressor(),
# param_grid = {
#     'criterion': ('mse', 'mae'),
#     'max_depth': range(3, 7), 
#     'max_features': ('auto', 'sqrt','log2'), 
#     'n_estimators': (10 , 50, 100, 1000),
#     'n_jobs': (None, -1),
#     }, cv = 5, return_train_score= False)

# grid_result = gsc.fit(X, y)
# best_params = grid_result.best_params_

# rfr = RandomForestRegressor(criterion = best_params['criterion'],
#                            max_depth = best_params['max_depth'],
#                            max_features = best_params['max_features'],
#                            n_estimators = best_params['n_estimators'],
#                            n_jobs = best_params['n_jobs'], 
#                             random_state= False, verbose= False)


# gsc.cv_results_

In [None]:
# # results from GridSearchCV saved in pandas DataFrame obj
# df = pd.DataFrame(gsc.cv_results_)
# df.columns

In [None]:
# filtering dataframe results
#df[['param_criterion', 'param_max_depth', 'param_n_estimators', 'params', 'mean_test_score']]

In [None]:
# let's me know what methods and attributes are available for this obj
#dir(gsc)

In [None]:
# # best hyperparameters
# gsc.best_estimator_

# # RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
# #                       max_depth=5, max_features='log2', max_leaf_nodes=None,
# #                       max_samples=None, min_impurity_decrease=0.0,
# #                       min_impurity_split=None, min_samples_leaf=1,
# #                       min_samples_split=2, min_weight_fraction_leaf=0.0,
# #                       n_estimators=100, n_jobs=-1, oob_score=False,
# #                       random_state=None, verbose=0, warm_start=False)

In [None]:
# # best score
# gsc.best_score_

# #-14.72446175347468

In [None]:
# # best parameters
# gsc.best_params_

# # {'criterion': 'mse',
# #  'max_depth': 5,
# #  'max_features': 'log2',
# #  'n_estimators': 100,
# #  'n_jobs': -1}

##### Tweet Word Cloud

In [None]:
# Concatenate the sampled tweets into one string for the word cloud
words = ' '.join(Twts['text'])

cloud_settings = {'width': 800, 'height': 600, 'random_state': 21, 'max_font_size': 120}
wordCloud = WordCloud(**cloud_settings).generate(words)

# Render the cloud as an image with the axes hidden
plt.imshow(wordCloud, interpolation='bilinear')
plt.axis('off')
plt.show()

### Interpret: