# Scrape data from Investing.com

In [None]:
from selenium import webdriver
from time import sleep
import errno    
import os
import os.path
import datetime
import sys
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from functools import reduce

xpath_nextpage = '//div[@id="paginationWrap"]//div[last()]/a'
xpath_msg = '//div[contains(@class,"mainComment")]//div[contains(@class,"commentText")]//span[@class="js-text"]'
xpath_date = '//div[contains(@class,"mainComment")]//div[contains(@class,"commentBody")]//span[@class="js-date"]'

comments_list = []

def build_chrome_options():
    chrome_options = webdriver.ChromeOptions()
    chrome_options.accept_untrusted_certs = True
    chrome_options.assume_untrusted_cert_issuer = True
    # chrome configuration
    # More: https://github.com/SeleniumHQ/docker-selenium/issues/89
    # And: https://github.com/SeleniumHQ/docker-selenium/issues/87
    chrome_options.add_argument("incognito")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--window-size=1024,800")
    chrome_options.add_argument("disable-extensions")
    chrome_options.add_argument("--start-maximized")
    chrome_options.add_argument("--test-type=browser")
    chrome_options.add_argument("--disable-impl-side-painting")
    chrome_options.add_argument("--disable-setuid-sandbox")
    chrome_options.add_argument("--disable-seccomp-filter-sandbox")
    chrome_options.add_argument("--disable-breakpad")
    chrome_options.add_argument("--disable-client-side-phishing-detection")
    chrome_options.add_argument("--disable-cast")
    chrome_options.add_argument("--disable-cast-streaming-hw-encoding")
    chrome_options.add_argument("--disable-cloud-import")
    chrome_options.add_argument("--disable-popup-blocking")
    chrome_options.add_argument("--ignore-certificate-errors")
    chrome_options.add_argument("--disable-session-crashed-bubble")
    chrome_options.add_argument("--disable-ipv6")
    chrome_options.add_argument("--allow-http-screen-capture")
    return chrome_options 

def init():
    global driver
    driver = webdriver.Chrome('./chromedriver', options=build_chrome_options())

def connect(page):
    driver.get('https://www.investing.com/equities/tesla-motors-commentary/' + str(page))

def scrap():
    msgs = driver.find_elements_by_xpath(xpath_msg)
    dates = driver.find_elements_by_xpath(xpath_date)
    
    try:
        for i in range(len(msgs)):
            soup = BeautifulSoup(msgs[i].text, 'html.parser').decode("utf-8")
            time = dates[i].text
            
            comment_dict = {
                "date": time,
                "comment": soup
            }
            
            print(comment_dict)
            
            comments_list.append(comment_dict)

    finally:
        print("done scraping")


def main(argv):
    #angiv aktier man vil scrape i listen
    init()
    max_pages = 5

    page = 1
    while page < max_pages:
        connect(page)
        sleep(3)
        scrap()
        print("success")
        page += 1

    df = pd.DataFrame(comments_list)
    df.to_csv('test.csv')
    comments_list.clear()


if __name__ == "__main__":
    main(sys.argv)

# Read and format data

In [154]:
df = pd.read_csv('test.csv')

In [161]:
df.head()

Unnamed: 0,comment,date
0,It seems that no ship arrived in April to EU b...,30 minutes ago
1,250 premarket open...\n,1 hour ago
2,Clean cars. For just that worth buy . Because ...,4 hours ago
3,And liquidation of his shares would create dow...,6 hours ago
4,Tesla CEO Elon Musk is more than doubling the ...,6 hours ago


In [156]:
df.drop(['Unnamed: 0'], axis=1, inplace=True)

In [172]:
from datetime import datetime

def convert_date(date_string):
    
    if 'last' in date_string:
        return
    
    if 'months' in date_string:
        return
    
    if 'minutes' in date_string:
        return
    
    if 'hours' in date_string:
        return

    if 'hour' in date_string:
        return
    
    dt = datetime.strptime(date_string, '%Y-%m-%d %H:%M:%S')

In [173]:
df['date'] = df['date'].apply(lambda x: convert_date(x))

ValueError: time data 'May 04, 2019 10:42PM ET' does not match format '%Y-%m-%d %H:%M:%S'

In [68]:
df.head()

Unnamed: 0,comment,date,user
0,"RBC initiates with ""sector perform"". 04/30/19\n",2019-04-30,Tpljmpr5
1,great company cash cow buying more\n,2019-04-30,Larry
2,I liked the price recovery late Friday. There ...,2019-04-28,Joe
3,"Everyone, stop making money right now!!! Zack'...",2019-04-27,Omar
4,Guys!!! What are you doing??? Everyone says ch...,2019-04-27,Omar


# Sentiment Analysis

In [69]:
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/mathiaslund/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [70]:
sid = SentimentIntensityAnalyzer()
for comment in df.comment:
    print(comment)
    ss = sid.polarity_scores(comment)
    for k in sorted(ss):
        print('{0}: {1}, '.format(k, ss[k]), end='')
        
    print()
    print()
    print()

RBC initiates with "sector perform". 04/30/19

compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, 


great company cash cow buying more

compound: 0.6249, neg: 0.0, neu: 0.549, pos: 0.451, 


I liked the price recovery late Friday. There is some big money that likes this company. Great cashflow. If it pushes through 119 next week, I like a big/quick move to 130 soon after. IMHO GO TXN GO! Great company.

compound: 0.9485, neg: 0.0, neu: 0.659, pos: 0.341, 


Everyone, stop making money right now!!! Zack's (super genius experts) says to sell!!!

compound: -0.5216, neg: 0.219, neu: 0.781, pos: 0.0, 


Guys!!! What are you doing??? Everyone says chips died last year, and are still dead this year!!! Stop making money and sell!!! I'm up about 30% in a short time, but what's wrong with me!!! Chips are dead !!!!!

compound: -0.9604, neg: 0.357, neu: 0.643, pos: 0.0, 


Chips and semi conductor Bubble about to pop #QCOM #NVDA #MU #TXI #AMD #INTC #SOXL

compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, 



Chasers will BURN!

compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, 


TXN's earnings report was horrible. They missed on revenue, and warned on their guidance. And yet, the stock is up 7%. This is comical. This is obviously short covering, because who would buy this with all the uncertainty out there?

compound: -0.8481, neg: 0.221, neu: 0.779, pos: 0.0, 


Analyst cutting price and yet stock gone up. so it's all driven by big powerful people. They decide what happen to stock price and we all puppet here.

compound: 0.3182, neg: 0.05, neu: 0.858, pos: 0.092, 


someone pumping to dump...never buy call or put options..if you cant move the stock..90 soon

compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, 


BUY all stocks that miss earnings - seems to be the mantra on the street now for weeks

compound: -0.1531, neg: 0.091, neu: 0.909, pos: 0.0, 


Revenue missed = tank tomorrow.

compound: -0.296, neg: 0.423, neu: 0.577, pos: 0.0, 


wow, that report exceeded my expectations significantly!!!



In [71]:
def get_compound_score(comment):
    ss = sid.polarity_scores(comment)
    return ss['compound']

def get_sentiment(compound_score):
    if compound_score > 0.05:
        return 1
    elif compound_score > -0.05 and compound_score < 0.05:
        return 0
    else:
        return -1

In [72]:
df['compound_score'] = df['comment'].apply(lambda x: get_compound_score(x))
df['sentiment'] = df['compound_score'].apply(lambda x: get_sentiment(x))

Positive sentiment: compound >= 0.05

Neutral sentiment: compound > -0.05 < 0.05

Negative sentiment: compound < -0.05

In [73]:
df.head()

Unnamed: 0,comment,date,user,compound_score,sentiment
0,"RBC initiates with ""sector perform"". 04/30/19\n",2019-04-30,Tpljmpr5,0.0,0
1,great company cash cow buying more\n,2019-04-30,Larry,0.6249,1
2,I liked the price recovery late Friday. There ...,2019-04-28,Joe,0.9485,1
3,"Everyone, stop making money right now!!! Zack'...",2019-04-27,Omar,-0.5216,-1
4,Guys!!! What are you doing??? Everyone says ch...,2019-04-27,Omar,-0.9604,-1


# Prepare data for ML

In [129]:
new_df = pd.DataFrame([], columns=['avg_sentiment', 'avg_compound_score', 'comments', 'pos_comments', 'neg_comments'])

In [136]:
df['date'].unique

<bound method Series.unique of 0     2019-04-30
1     2019-04-30
2     2019-04-28
3     2019-04-27
4     2019-04-27
5     2019-04-25
6     2019-04-24
7     2019-04-24
8     2019-04-24
9     2019-04-24
10    2019-04-24
11    2019-04-24
12    2019-04-24
13    2019-04-24
14    2019-04-24
15    2019-04-24
16    2019-04-24
17    2019-04-24
18    2019-04-24
19    2019-04-24
20    2019-04-24
21    2019-04-24
22    2019-04-24
23    2019-04-24
24    2019-04-24
25    2019-04-24
26    2019-04-24
27    2019-04-24
28    2019-04-24
29    2019-04-24
         ...    
50          None
51          None
52          None
53          None
54          None
55          None
56          None
57          None
58          None
59          None
60          None
61          None
62          None
63          None
64          None
65          None
66          None
67          None
68          None
69          None
70          None
71          None
72          None
73          None
74          None
75          None


In [125]:
df[df['sentiment'] == -1].groupby(['date']).count()

Unnamed: 0_level_0,comment,user,compound_score,sentiment
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-04-18,1,1,1,1
2019-04-24,9,9,9,9
2019-04-27,2,2,2,2


In [106]:
new_df['df[df['sentiment'] == 1].count()

comment           28
date              12
user              28
compound_score    28
sentiment         28
dtype: int64

In [97]:
new_df = df.groupby(['date']).count()

In [98]:
new_df

Unnamed: 0_level_0,comment,user,compound_score,sentiment
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-04-12,1,1,1,1
2019-04-18,1,1,1,1
2019-04-24,33,33,33,33
2019-04-25,1,1,1,1
2019-04-27,2,2,2,2
2019-04-28,1,1,1,1
2019-04-30,2,2,2,2


In [90]:
df.groupby(['date']).mean()

Unnamed: 0_level_0,compound_score,sentiment
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-04-12,0.0,0.0
2019-04-18,-0.2481,-1.0
2019-04-24,-0.023673,0.030303
2019-04-25,0.0,0.0
2019-04-27,-0.741,-1.0
2019-04-28,0.9485,1.0
2019-04-30,0.31245,0.5


In [91]:
df.groupby(['date']).count()

Unnamed: 0_level_0,comment,user,compound_score,sentiment
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-04-12,1,1,1,1
2019-04-18,1,1,1,1
2019-04-24,33,33,33,33
2019-04-25,1,1,1,1
2019-04-27,2,2,2,2
2019-04-28,1,1,1,1
2019-04-30,2,2,2,2
