# Predicting Stock Movement Using Market Sentiments

## Questions:
### (1) Based on stock price data, can we predict whether tomorrow's closing price will be higher or lower than today's closing price?
### (2) What influence does social media sentiment data have on prediction?
### (3) How far back in time is historical data still relevant to the prediction?

## Stock Chosen: Nike

### (1) Clothing Industry
### (2) Social Media Presence

#### A notebook by Seema, Thanusan, Adit & Sean

### Import Libraries

In [1]:
import os

import quandl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from keras.layers import Dense, LSTM
from keras.models import Sequential
from keras.optimizers import Adam

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


### Import Sentiments Data and Feature Engineering

#### News Sentiment

In [2]:
# Read the Quandl API key from the environment instead of embedding it in the
# notebook, so the credential is never committed or shared with the file.
quandl_api_key = os.environ.get('QUANDL_API_KEY')
if not quandl_api_key:
    raise RuntimeError('Set the QUANDL_API_KEY environment variable before running this notebook.')
quandl.ApiConfig.api_key = quandl_api_key

# News sentiment records for Nike (ticker NKE) from the IFT/NSA table.
news_sentiments_raw = quandl.get_table('IFT/NSA', ticker='NKE')

# Keep only the rows whose exchange code is "US".
mask = news_sentiments_raw["exchange_cd"] == "US"

news_sentiments = news_sentiments_raw[mask]
# Index by date so the frame can later be joined to the stock prices by date.
news_sentiments = news_sentiments.set_index('date')

#Getting rid of irrelevant columns
news_sentiments = news_sentiments.drop(['exchange_cd', 'name'], axis=1)

In [3]:
# Report how many rows and columns the filtered news-sentiment frame has.
no_rows_news, no_columns_news = news_sentiments.shape
print('The dimension of the Nike News Sentiments DataFrame is %s by %s' % (no_rows_news, no_columns_news))

The dimension of the Nike News Sentiments DataFrame is 2119 by 6


In [4]:
# Preview the first five rows (head() defaults to n=5).
news_sentiments.head()

Unnamed: 0_level_0,ticker,sentiment,sentiment_high,sentiment_low,news_volume,news_buzz
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-07-20,NKE,1.444,3.899,-2.01,9,10.0
2018-07-21,NKE,2.0,2.0,2.0,1,6.0
2018-07-22,NKE,2.0,3.0,1.0,2,6.0
2018-07-23,NKE,3.333,3.805,2.862,3,10.0
2018-07-24,NKE,3.333,4.581,2.086,3,5.0


#### Facebook Sentiment

In [5]:
# Facebook post records for Nike from the SMA/FBUP table. paginate=True is
# passed so the download is not limited to a single page of results.
facebook_sentiment = quandl.get_table('SMA/FBUP', brand_ticker='NKE', paginate=True)
facebook_sentiment = facebook_sentiment.set_index('date')

cols_to_drop = ['brand_ticker', 'page_id', 'post_id', 'sector', 'geography', 'type', 'created_time', 'response_time', 'likes', 'comments', 'shares']

# Getting rid of irrelevant columns
facebook_sentiment = facebook_sentiment.drop(cols_to_drop, axis=1)
# Exactly one column is expected to remain after the drop; give it a
# source-specific name. This assignment fails if the column count differs.
facebook_sentiment.columns = ['FB_sentiment']

no_rows_FB = facebook_sentiment.shape[0]
no_columns_FB = facebook_sentiment.shape[1]
# The message names the Facebook frame (it previously said "News Sentiments",
# a copy-paste slip from the news cell).
print('The dimension of the Nike Facebook Sentiments DataFrame is %s by %s' % (no_rows_FB, no_columns_FB))

The dimension of the Nike News Sentiments DataFrame is 126010 by 1


In [6]:
# Put the posts in chronological order (in place), then preview the first five rows.
facebook_sentiment.sort_index(inplace=True)
facebook_sentiment.head()

Unnamed: 0_level_0,FB_sentiment
date,Unnamed: 1_level_1
2014-01-01,NEUTRAL
2014-01-01,NEUTRAL
2014-01-01,NEUTRAL
2014-01-01,NEUTRAL
2014-01-01,NEUTRAL


#### Instagram Sentiment

In [7]:
# Instagram account statistics for the NKE brand ticker from the SMA/INSD table.
insta_sentiment = quandl.get_table('SMA/INSD', brand_ticker='NKE', paginate=True)

# Restrict to the account named 'nike', then index by date in chronological order.
nike_mask = insta_sentiment['ins_account_name'] == 'nike'
insta_sentiment = (
    insta_sentiment[nike_mask]
    .set_index('date')
    .sort_index()
)

In [8]:
# Preview the first twenty rows of the Instagram data.
insta_sentiment.head(20)

Unnamed: 0_level_0,brand_ticker,ins_account_name,brand_name,sector,geography,followers_count,followees_count,posts_count,likes_count,comments_count,total_posts_count,engagement_score
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2014-01-01,NKE,nike,nike,Retail,Worldwide,3055602,69,1,197190,840,648,1000
2014-01-02,NKE,nike,nike,Retail,Worldwide,3061505,69,1,199283,827,648,1000
2014-01-03,NKE,nike,nike,Retail,Worldwide,3070820,70,0,0,0,649,0
2014-01-04,NKE,nike,nike,Retail,Worldwide,3078737,70,1,151875,412,649,999
2014-01-05,NKE,nike,nike,Retail,Worldwide,3086889,70,0,0,0,650,0
2014-01-06,NKE,nike,nike,Retail,Worldwide,3094587,70,2,299853,1491,650,998
2014-01-07,NKE,nike,nike,Retail,Worldwide,3105772,70,0,0,0,652,0
2014-01-08,NKE,nike,nike,Retail,Worldwide,3113450,71,0,0,0,652,0
2014-01-09,NKE,nike,nike,Retail,Worldwide,3121689,71,1,185530,894,652,1000
2014-01-10,NKE,nike,nike,Retail,Worldwide,3132240,71,1,169611,749,653,999


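Creating a new metric to measure market sentiment towards Nike: the daily change in Instagram follower count. (A second metric, the daily change in news sentiment, is added after the datasets are merged.)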

In [9]:
# Change in follower count from the previous row to the current one; the first
# row has no predecessor, so its value is NaN.
insta_sentiment['follower_growth'] = insta_sentiment['followers_count'].diff()

In [10]:
# Getting rid of irrelevant columns
more_cols_to_drop = ['brand_ticker', 'ins_account_name', 'brand_name', 'sector', 'geography', 'followees_count', 'posts_count', 'likes_count', 'comments_count', 'total_posts_count', 'engagement_score', 'followers_count']
insta_sentiment = insta_sentiment.drop(more_cols_to_drop, axis=1)
insta_sentiment.columns = ['insta_follower_growth']

In [11]:
# Preview the first five rows (head() defaults to n=5).
insta_sentiment.head()

Unnamed: 0_level_0,insta_follower_growth
date,Unnamed: 1_level_1
2014-01-01,
2014-01-02,5903.0
2014-01-03,9315.0
2014-01-04,7917.0
2014-01-05,8152.0


## Importing Stock Price Data

In [12]:
# Download the Nike (NKE) price history from Quandl's WIKI dataset.
company_stock = quandl.get("WIKI/NKE")

In [13]:
# Report how many rows and columns the raw stock-price frame has.
no_rows, no_columns = company_stock.shape
print('The dimension of the Stock Price DataFrame is %s by %s' % (no_rows, no_columns))

The dimension of the Stock Price DataFrame is 9410 by 12


In [14]:
# Getting rid of irrelevant columns
even_more_cols_to_drop = ['Open', 'High', 'Low', 'Close', 'Volume', 'Ex-Dividend', 'Split Ratio',
       'Adj. Open', 'Adj. High', 'Adj. Low', 'Adj. Volume']
company_stock = company_stock.drop(even_more_cols_to_drop, axis=1)
company_stock['Adj_Close_Change'] = company_stock['Adj. Close'].diff()
company_stock.columns = ['stock_price', 'stock_price_change']

In [15]:
# Preview the first five rows (head() defaults to n=5).
company_stock.head()

Unnamed: 0_level_0,stock_price,stock_price_change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
1980-12-02,0.061535,
1980-12-03,0.060198,-0.001338
1980-12-04,0.062178,0.00198
1980-12-05,0.05886,-0.003318
1980-12-08,0.055542,-0.003318


In [16]:
# Make sure every frame is in chronological order before they are joined by date.
for frame in (company_stock, news_sentiments, facebook_sentiment, insta_sentiment):
    frame.sort_index(inplace=True)

In [17]:
# Preview the first ten rows of the follower-growth series.
insta_sentiment.head(10)

Unnamed: 0_level_0,insta_follower_growth
date,Unnamed: 1_level_1
2014-01-01,
2014-01-02,5903.0
2014-01-03,9315.0
2014-01-04,7917.0
2014-01-05,8152.0
2014-01-06,7698.0
2014-01-07,11185.0
2014-01-08,7678.0
2014-01-09,8239.0
2014-01-10,10551.0


## Merging the Datasets Together

#### Note: Dataframes are merged by date

In [18]:
# Left join on the index: every stock-price row is kept and the news-sentiment
# columns are attached where the dates match; unmatched dates get NaN there.
merge_1 = company_stock.join(news_sentiments, how = "left")

In [19]:
# Inspect the five most recent rows of the stock + news frame.
merge_1.tail(5)

Unnamed: 0_level_0,stock_price,stock_price_change,ticker,sentiment,sentiment_high,sentiment_low,news_volume,news_buzz
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2018-03-21,66.35,-0.45,NKE,1.794,3.245,0.343,34.0,10.0
2018-03-22,64.42,-1.93,NKE,0.286,2.833,-3.262,56.0,10.0
2018-03-23,64.63,0.21,NKE,0.255,2.423,-2.913,94.0,10.0
2018-03-26,65.9,1.27,NKE,1.313,3.541,-1.916,32.0,7.0
2018-03-27,66.17,0.27,NKE,1.357,3.882,-2.167,14.0,1.0


In [20]:
# Facebook sentiment is not part of the merged data: the join below is disabled,
# so only the Instagram follower-growth series is added to the stock/news frame.
#merge_2 = merge_1.join(facebook_sentiment, how = "left")
merge_3 = merge_1.join(insta_sentiment, how = "left")
# Change in news sentiment between consecutive rows of the merged frame
# (NaN wherever the current or previous row has no sentiment value).
merge_3['sentiment_growth'] = merge_3['sentiment'].diff()

(9410, 8)


NameError: name 'merge_2' is not defined

In [21]:
# Drop, in place, every row that has a missing value in any column; this removes
# the dates for which news or Instagram data was not available after the left joins.
merge_3.dropna(axis = 0, inplace = True)

In [22]:
# Preview the first ten complete rows of the merged dataset.
merge_3.head(10)

Unnamed: 0_level_0,stock_price,stock_price_change,ticker,sentiment,sentiment_high,sentiment_low,news_volume,news_buzz,insta_follower_growth,sentiment_growth
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2014-01-02,37.419008,-0.191304,NKE,0.619,3.029,-2.791,42.0,10.0,5903.0,-0.488
2014-01-03,37.318574,-0.100434,NKE,0.261,2.723,-3.201,46.0,10.0,9315.0,-0.358
2014-01-06,37.031618,-0.286956,NKE,1.516,3.465,-1.433,31.0,10.0,7698.0,1.255
2014-01-07,37.060314,0.028696,NKE,1.313,3.052,-1.427,32.0,6.0,11185.0,-0.203
2014-01-08,36.86901,-0.191304,NKE,0.22,2.51,-3.071,41.0,8.0,7678.0,-1.093
2014-01-09,36.86901,0.0,NKE,1.308,4.027,-2.411,39.0,8.0,8239.0,1.088
2014-01-10,36.787706,-0.081304,NKE,0.588,2.991,-2.814,51.0,10.0,10551.0,-0.72
2014-01-13,35.955535,-0.832171,NKE,0.654,2.554,-2.246,26.0,10.0,8848.0,0.066
2014-01-14,35.950752,-0.004783,NKE,0.457,3.079,-3.165,35.0,6.0,10965.0,-0.197
2014-01-15,36.075099,0.124347,NKE,1.846,3.878,-1.186,26.0,5.0,10943.0,1.389


### Modelling

In [23]:
def make_windows(values, closes, length):
    """Build sliding-window LSTM inputs and next-day direction labels.

    Parameters
    ----------
    values : 2-D numpy array with one row per day and one column per feature.
    closes : 1-D numpy array of closing prices, aligned row-for-row with
        ``values``.
    length : number of consecutive days in each input window.

    Returns
    -------
    (features, labels)
        ``features`` has shape ``(n, length, values.shape[1])``; window ``k``
        holds rows ``k .. k + length - 1`` of ``values``.
        ``labels`` has shape ``(n, 1)``; ``labels[k]`` is 1.0 when the close on
        the day after window ``k``'s last day is strictly higher than the close
        on that last day, otherwise 0.0.
        ``n`` equals ``len(closes) - length``.

    Raises
    ------
    ValueError
        If there are not more rows than ``length``, so no window has a
        following day to label.
    """
    n_samples = len(closes) - length
    if n_samples <= 0:
        raise ValueError(
            "Need more than %d rows to build windows of length %d" % (length, length)
        )

    features = np.stack([values[start:start + length] for start in range(n_samples)])

    # np.diff(closes)[i] is closes[i + 1] - closes[i]; the first labelled day is
    # the last day of the first window, i.e. position length - 1.
    next_day_change = np.diff(closes)[length - 1:]
    labels = (next_day_change > 0).astype(float).reshape(-1, 1)

    return features, labels


data = merge_3.drop(["ticker"], axis = 1)

price = data["stock_price"]

# Plain ndarray of closes: integer positions are unambiguous here, whereas
# price[i] on the date-indexed Series relies on positional fallback.
closes = price.values

# Length Hyperparameters: window sizes in trading days
# (reported as Week / Month / 6-Month in the results table).
l_check = [5,20,120]

accuracies = []

# Choose Number of Features (leading columns of `data`):
#   1 -> stock price only
#   8 -> price, price change, news sentiment columns and Instagram follower growth
#   9 -> all of the above plus sentiment_growth
cases = [1,8,9]

case = cases[2]

data = np.array(data.iloc[:,:case])

# Fraction of windows, taken from the start of the series, used for training;
# the remaining (most recent) windows form the test set.
TRAIN_FRACTION = 0.7

# Iterate Through Each Length
for length in l_check:

    features, labels = make_windows(data, closes, length)

    # Chronological split: no shuffling between train and test.
    split = int(features.shape[0] * TRAIN_FRACTION)

    train_X, test_X = features[:split], features[split:]
    train_Y, test_Y = labels[:split], labels[split:]

    # A single LSTM layer followed by a sigmoid unit that outputs P(price goes up).
    model = Sequential()
    model.add(LSTM(100, input_shape = (features.shape[1],case)))
    model.add(Dense(1, activation = "sigmoid"))

    # NOTE(review): newer Keras releases name the first argument `learning_rate`
    # rather than `lr` - confirm against the installed version.
    opt = Adam(lr=0.02, beta_1=0.9, beta_2=0.999, decay=0.01)

    model.compile(loss='binary_crossentropy',
                optimizer=opt,
                metrics=['accuracy'])

    model.fit(train_X, train_Y, epochs = 15)

    pred = model.predict(test_X)

    # Threshold the predicted probabilities at 0.5 to get class labels.
    preds = (pred > 0.5).astype(float)

    accuracy = np.mean(preds == test_Y)

    accuracies.append(accuracy)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [24]:
# Test-set accuracy for each window length in l_check, in the same order.
accuracies

[0.46540880503144655, 0.535031847133758, 0.5633802816901409]

## Accuracy Results

In [28]:
# Test-set accuracies recorded from separate runs of the modelling cell, one
# list per feature set. Within each list the values are ordered by window
# length: Week, Month, 6-Month.
Price = [0.525, 0.525, 0.465]
Price_Sentiment = [0.465, 0.497, 0.528]
Price_Sentiment_Growth = [0.465, 0.535, 0.563]

# One row per feature set, one column per window length.
df = pd.DataFrame(
    [Price, Price_Sentiment, Price_Sentiment_Growth],
    index=["Price", "Price_Sentiment", "Price_Sentiment_Growth"],
    columns=["Week", "Month", "6-Month"],
)

df.head()

Unnamed: 0,Week,Month,6-Month
Price,0.525,0.525,0.465
Price_Sentiment,0.465,0.497,0.528
Price_Sentiment_Growth,0.465,0.535,0.563
