In [1]:
import pandas_datareader as pdr
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from importlib import reload
from features_engineering import ma7, ma21, rsi, macd, bollinger_bands, momentum, get_tesla_headlines

from bs4 import BeautifulSoup
import requests
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Notebook-wide display configuration.
# Every warning category is silenced so library warnings do not clutter cell output.
warnings.filterwarnings(action='ignore')
# 227 dpi is the native screen resolution of the author's machine; lower it if figures render too large.
plt.rcParams.update({'figure.dpi': 227})

## Original Data

In [2]:
# Pull TSLA daily prices from Yahoo Finance, requesting everything from 1980 onward,
# and keep an on-disk copy of the raw download.
raw_prices_path = 'data/raw_stocks.csv'
tsla_df = pdr.get_data_yahoo('tsla', '1980')
tsla_df.to_csv(raw_prices_path)

In [3]:
# Preview the first five rows of the downloaded price history.
tsla_df.head(n=5)

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-06-29,5.0,3.508,3.8,4.778,93831500.0,4.778
2010-06-30,6.084,4.66,5.158,4.766,85935500.0,4.766
2010-07-01,5.184,4.054,5.0,4.392,41094000.0,4.392
2010-07-02,4.62,3.742,4.6,3.84,25699000.0,3.84
2010-07-06,4.0,3.166,4.0,3.222,34334500.0,3.222


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
tsla_df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,High,Low,Open,Close,Volume,Adj Close
count,2633.0,2633.0,2633.0,2633.0,2633.0,2633.0
mean,58.906934,56.294153,57.622325,57.676983,30549040.0,57.676983
std,85.129214,80.315422,82.767426,82.990392,28292880.0,82.990392
min,3.326,2.996,3.228,3.16,592500.0,3.16
25%,7.576,7.334,7.372,7.398,10674000.0,7.398
50%,45.080002,43.652,44.397999,44.306,24033500.0,44.306
75%,61.478001,59.243999,60.382,60.501999,39597000.0,60.501999
max,654.320007,618.5,653.690002,649.880005,304694000.0,649.880005


### Checking Missing Values

In [5]:
# Count NaNs per column once. The conditional expression is kept as the cell's last
# expression on purpose: when something is missing, the per-column counts become the cell output.
missing_per_column = tsla_df.isna().sum()
print('No Missing Values') if missing_per_column.sum() == 0 else missing_per_column

No Missing Values


### Generating Features

In [6]:
# Build technical-indicator features for every raw price file in data/raw_stocks
# and save each enriched frame to data/stocks/<name>.csv.
# `stocks` maps the lower-cased file stem (e.g. 'tsla') to its feature frame.

RAW_STOCKS_DIR = 'data/raw_stocks'
STOCKS_DIR = 'data/stocks'


def add_features(prices):
    """Return a Date-indexed copy of `prices` with engineered feature columns.

    `prices` must contain the columns 'Date', 'Open' and 'Close'. The helpers
    imported from features_engineering (ma7, ma21, rsi, macd, bollinger_bands)
    receive the whole frame and may read further columns -- check that module.

    Added columns: Return, Change, Volatility, MA7, MA21, Momentum, RSI, MACD,
    Signal, Upper_band, Lower_band. Rows with a NaN in any column are dropped
    at the end (presumably the warm-up period of the windowed indicators).
    """
    features = prices.copy()

    # Intraday return: close relative to the same day's open, rounded to 3 decimals.
    features['Return'] = (features['Close'] / features['Open'] - 1).round(3)

    # Signed day-over-day change of the close (NOT an absolute value); first row is 0.
    features['Change'] = features['Close'].diff().fillna(0)

    # Use the trading date as the index.
    features['Date'] = pd.to_datetime(features['Date'])
    features = features.set_index('Date')

    # Exponentially weighted std of the close. The first positional argument of
    # `ewm` is `com` (centre of mass), so this is com=21 rather than a 21-day span.
    features['Volatility'] = features['Close'].ewm(com=21).std()

    # Moving averages over 7 and 21 days.
    features['MA7'] = ma7(features)
    features['MA21'] = ma21(features)

    # Momentum of the close; the meaning of the second argument (3) is defined
    # in features_engineering.momentum.
    features['Momentum'] = momentum(features['Close'], 3)

    # RSI (Relative Strength Index).
    features['RSI'] = rsi(features)

    # MACD (Moving Average Convergence/Divergence) and its signal line.
    features['MACD'], features['Signal'] = macd(features)

    # Upper and lower Bollinger Bands.
    features['Upper_band'], features['Lower_band'] = bollinger_bands(features)

    return features.dropna()


# Make sure the output directory exists so a fresh checkout can run this cell.
os.makedirs(STOCKS_DIR, exist_ok=True)

# Sorted for a deterministic processing order across platforms.
files = sorted(os.listdir(RAW_STOCKS_DIR))
stocks = {}
for file in files:
    raw_path = os.path.join(RAW_STOCKS_DIR, file)
    # Skip sub-directories and hidden entries (e.g. .DS_Store, .ipynb_checkpoints)
    # that are not price files and would make read_csv fail.
    if file.startswith('.') or not os.path.isfile(raw_path):
        continue

    name = file.lower().split('.')[0]
    stocks[name] = add_features(pd.read_csv(raw_path))

    # Saving
    stocks[name].to_csv(os.path.join(STOCKS_DIR, name + '.csv'))

stocks['tsla'].head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Return,Change,Volatility,MA7,MA21,Momentum,RSI,MACD,Signal,Upper_band,Lower_band
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2010-07-28,20.549999,20.9,20.51,20.719999,20.719999,467200,0.008,0.17,1.979836,20.718571,19.911904,18.290001,41.752948,-0.350607,-0.658177,24.403824,15.419985
2010-07-29,20.77,20.879999,20.0,20.35,20.35,616000,-0.02,-0.369999,1.908327,20.725714,19.743333,17.950001,40.449222,-0.337599,-0.594062,23.858103,15.628563
2010-07-30,20.200001,20.440001,19.549999,19.940001,19.940001,426900,-0.013,-0.409999,1.839567,20.685714,19.558095,17.549999,38.996148,-0.356267,-0.546503,23.226171,15.890019
2010-08-02,20.5,20.969999,20.33,20.92,20.92,718100,0.02,0.979999,1.789358,20.674286,19.508571,17.719999,44.159747,-0.288656,-0.494933,23.066887,15.950255
2010-08-03,21.0,21.950001,20.82,21.950001,21.950001,1230500,0.045,1.030001,1.791139,20.768572,19.639524,17.35,49.041837,-0.15023,-0.425993,23.34933,15.929718


We will rely mostly on historical price data and technical indicators. Additionally, we will use Tesla news headlines to test the hypothesis that news affects price movement.

## Tesla News Headlines

As the news source we will use the <a href="https://www.nasdaq.com/">NASDAQ</a> website.
At the time of scraping there were 120 pages of news, dated from `2019-01-10` to `2019-09-05`.

In [7]:
import nltk

# SentimentIntensityAnalyzer loads the VADER lexicon from the local NLTK data
# directory and raises LookupError if it is missing. Fetch it once so the
# notebook also runs on a machine where the resource was never downloaded.
try:
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    nltk.download('vader_lexicon')

# VADER sentiment scorer used for the news headlines.
sid = SentimentIntensityAnalyzer()

In [8]:
# Load the previously scraped Tesla headlines together with their sentiment scores.
# NOTE(review): the file appears to have been saved with its index, which shows up as
# an extra 'Unnamed: 0' column -- confirm before relying on column positions.
headlines_path = 'data/tesla_headlines.csv'
tsla_headline = pd.read_csv(headlines_path)
tsla_headline.head()

Unnamed: 0.1,Unnamed: 0,Title,Date,Sentiment
0,0,Tesla's use of individual driver data for insu...,2019-09-05,0.0
1,1,U.S. safety agency cites Tesla Autopilot desig...,2019-09-04,0.0258
2,2,"U.S. safety agency cites driver error, Tesla A...",2019-09-04,-0.3818
3,3,"U.S. safety regulator cites driver error, Tesl...",2019-09-04,-0.3818
4,4,"U.S. NTSB cites driver error, Tesla Autopilot ...",2019-09-04,-0.6597
