In [1]:
import os
from bs4 import BeautifulSoup
import sys
import json
import re
import time
import csv
import datetime

from getStringWithOnlyUtf8Symbols import getStringWithOnlyUtf8Symbols
from removeDuplicatesFromList import removeDuplicatesFromList

from makeDirectoryIfNotExist import makeDirectoryIfNotExist

from supervisedRequests.supervisedRequests import SupervisedRequests

In [2]:
# Shared request client: the scraping functions in this notebook issue their HTTP GETs through it.
# NOTE(review): callers treat a None return from .get() as "request failed" - confirm against SupervisedRequests.
supervisedRequests = SupervisedRequests()

In [3]:
def get_tweet_text(tweet):
    """Extract the English text of a tweet element.

    Looks first for the fully-classed tweet paragraph and falls back to any
    ``<p lang="en">``. The text of hidden timeline links found inside the
    paragraph is removed from the result.

    Returns the cleaned text, or None when no English paragraph is found.
    """
    styled_paragraph_attrs = {"class": "TweetTextSize TweetTextSize--normal js-tweet-text tweet-text", "lang" : "en"}
    paragraph = tweet.find("p", styled_paragraph_attrs)
    if paragraph is None:
        # Fall back to any English paragraph, whatever its CSS classes are.
        paragraph = tweet.find("p", {"lang" : "en"})
        if paragraph is None:
            return None

    hidden_links = paragraph.find_all("a", {"class": "twitter-timeline-link u-hidden"})
    cleaned = paragraph.text
    for hidden_link in hidden_links:
        cleaned = cleaned.replace(hidden_link.text, '')
    return cleaned

def getTimestamp(content):
    """Return the raw ``data-time`` attribute of the first <span> carrying one.

    The value is returned exactly as stored in the attribute (no conversion).
    """
    any_value = re.compile(r".*")
    time_span = content.find("span", {"data-time" : any_value})
    return time_span["data-time"]

def getUserId(content):
    """Return the ``data-user-id`` attribute of the tweet's profile link."""
    profile_link_attrs = {"class" : "account-group js-account-group js-action-profile js-user-profile-link js-nav"}
    return content.find("a", profile_link_attrs)["data-user-id"]

def getUserName(content):
    """Return the profile link's ``href`` with its first character removed."""
    profile_link_attrs = {"class" : "account-group js-account-group js-action-profile js-user-profile-link js-nav"}
    profile_link = content.find("a", profile_link_attrs)
    href = profile_link["href"]
    # Drop the first character of the href (the leading "/" in a link such as "/alice").
    return href[1:]

def getTweetData(tweetData):
    """Build a record describing the tweet nested inside ``tweetData``.

    Parameters:
        tweetData: a parsed element exposing ``find``; the tweet is expected
            in a descendant ``<li data-item-type="tweet" data-item-id=...>``.

    Returns:
        A dict with keys ``statusId``, ``timestamp``, ``userId``, ``userName``
        and ``text``, or None when the ``<li>`` item or its ``content`` div
        cannot be found.
    """
    tweetItem = tweetData.find("li", {"data-item-id" : re.compile(r".*"), "data-item-type" : "tweet"})
    if tweetItem is None:
        # Without this guard the subscript below raises TypeError on None and
        # aborts the whole scrape, because no caller catches it.
        print("Could not find tweet list item")
        return None

    statusId = tweetItem["data-item-id"]
    content = tweetItem.find("div", {"class" : "content"})
    if content is None:
        print("Could not find content class")
        return None

    return {
        "statusId" : statusId,
        "timestamp" : getTimestamp(content),
        "userId" : getUserId(content),
        "userName" : getUserName(content),
        "text" : get_tweet_text(content),
    }
    
    
def get_this_page_tweets(soup):
    """Collect the non-empty tweet texts found on one parsed page.

    Every ``<li data-item-type="tweet">`` is passed to ``get_tweet_text``;
    items whose extraction raises, or yields nothing, are skipped. A dot is
    printed (and flushed) for each tweet kept, as a progress indicator.
    """
    collected = []
    for item in soup.find_all("li", {"data-item-type": "tweet"}):
        try:
            text = get_tweet_text(item)
        except Exception:
            # ignore if there is any loading or tweet error
            continue

        if not text:
            continue

        collected.append(text)
        print(".", end="")
        sys.stdout.flush()

    return collected

def get_this_page_replies(soup):
    """Return the text of every reply on the page, in document order.

    Each element with class ``js-tweet-text-container`` is run through
    ``get_tweet_text``; texts that come back as None are dropped and the rest
    are passed through ``getStringWithOnlyUtf8Symbols``.
    """
    containers = soup.findAll(None, {"class" : "js-tweet-text-container"})
    # Lazy generator keeps the original per-element call order.
    extracted = (get_tweet_text(container) for container in containers)
    return [getStringWithOnlyUtf8Symbols(text) for text in extracted if text is not None]

def get_this_page_replies_data(soup):
    classType = "ThreadedConversation-tweet"
    replies = []
    elements = soup.findAll(None, {"class" : classType})
    for el in elements:
        data = getTweetData(el)
        if data is not None:
            replies.append(data)
    return replies

def getStatusIdsAndScreenName(soup):
    """List the distinct (conversation id, screen name) pairs found on a page.

    Parameters:
        soup: parsed page exposing ``findAll``.

    Returns:
        A list of ``{"status": <data-conversation-id>, "name": <data-screen-name>}``
        dicts in first-seen order, with repeated pairs removed.
    """
    anyValue = re.compile(r".*")
    elements = soup.findAll("div", {"data-conversation-id" : anyValue, "data-screen-name" : anyValue})

    # The previous check compared the element itself against the stored dicts,
    # which never matches, so duplicates were never filtered out.
    seen = set()
    result = []
    for el in elements:
        pair = (el["data-conversation-id"], el["data-screen-name"])
        if pair in seen:
            continue
        seen.add(pair)
        result.append({"status" : pair[0], "name" : pair[1]})
    return result
    
def get_statuses_ids(soup):
    """List the distinct ``data-conversation-id`` values found on a page.

    Parameters:
        soup: parsed page exposing ``findAll``.

    Returns:
        The ids in first-seen order, each appearing once.
    """
    elements = soup.findAll("div", {"data-conversation-id" : re.compile(r".*")})

    # The previous check compared the element itself against the stored id
    # strings, which never matches, so duplicates were never filtered out.
    seen = set()
    result = []
    for el in elements:
        conversationId = el["data-conversation-id"]
        if conversationId in seen:
            continue
        seen.add(conversationId)
        result.append(conversationId)
    return result

def scrap_replies_of_status(username, statusId):
    """Collect reply records for one status by walking its conversation pages.

    The status page is fetched and parsed first; further pages are requested
    from the conversation endpoint using the ``min_position`` cursor until no
    usable cursor is returned.

    Parameters:
        username: screen name used to build the status and conversation URLs.
        statusId: id of the status, as a string (it is concatenated into URLs).

    Returns:
        A list of the dicts produced by ``get_this_page_replies_data``. The
        list holds whatever was gathered so far when a request yields None,
        the first page has no descendants container, or a response is not
        the expected JSON object.
    """
    replies = []
    url = "https://twitter.com/" + username + "/status/" + statusId
    response = supervisedRequests.get(url)
    if response is None:
        return replies

    soup = BeautifulSoup(response.text, 'lxml')

    try:
        divOfStreamContainerForComments = soup.find("div", {"class" : "ThreadedDescendants", "id" : "descendants"})
        next_pointer = divOfStreamContainerForComments.find("div", {"class": "stream-container"})["data-min-position"]
    except Exception as e:
        # Missing container or attribute: there is nothing to paginate over.
        print(e)
        return replies

    while True:
        replies.extend(get_this_page_replies_data(soup))

        next_url = "https://twitter.com/i/" + username + \
                   "/conversation/" + statusId + \
                   "?include_available_features=1&" \
                   "include_entities=1&max_position=" + next_pointer + "&reset_error_state=false"

        next_response = supervisedRequests.get(next_url)
        if next_response is None:
            print("url: " + url)
            print("next url: " + next_url)
            return replies

        try:
            replies_obj = json.loads(next_response.text)
        except ValueError:
            # Body is not JSON (json.JSONDecodeError subclasses ValueError).
            # A bare except here would also swallow KeyboardInterrupt.
            break
        if not isinstance(replies_obj, dict):
            break

        # has_more_items is deliberately not consulted: in one observed case
        # it was false while more items existed, so the cursor alone decides.
        # A missing cursor used to crash the URL concatenation with TypeError,
        # and an unchanged cursor would refetch the same page forever.
        new_pointer = replies_obj.get("min_position")
        if not new_pointer or new_pointer == next_pointer:
            # NOTE(review): items_html of this final response is not parsed,
            # same as before - confirm the last page never carries replies.
            break

        next_pointer = new_pointer
        soup = BeautifulSoup(replies_obj.get("items_html") or "", 'lxml')

    return replies
    

def get_replies_data(username, pagesAmount=None):
    """Collect reply records for the statuses on an account's timeline.

    For each timeline page, every (status, screen name) pair reported by
    ``getStatusIdsAndScreenName`` is handed to ``scrap_replies_of_status``.
    Further pages are requested using the ``min_position`` cursor.

    Parameters:
        username: account screen name whose timeline is walked.
        pagesAmount: optional cap on the number of timeline pages processed;
            None means continue until no usable cursor is returned.

    Returns:
        A list of reply dicts. The list holds whatever was gathered so far
        when a request yields None, the first page has no stream container,
        or a response is not the expected JSON object.
    """
    replies = []

    # NOTE(review): this first request uses plain http, unlike the https URLs
    # used elsewhere in this notebook - confirm whether that is intentional.
    url = "http://www.twitter.com/" + username
    print("\n\nDownloading tweets for " + username)
    response = supervisedRequests.get(url)
    if response is None:
        return replies

    soup = BeautifulSoup(response.text, 'lxml')

    try:
        next_pointer = soup.find("div", {"class": "stream-container"})["data-min-position"]
    except Exception as e:
        # Missing container or attribute: there is nothing to paginate over.
        print(e)
        return replies

    pagesFetched = 0
    while True:
        for sn in getStatusIdsAndScreenName(soup):
            replies.extend(scrap_replies_of_status(sn["name"], sn["status"]))

        next_url = "https://twitter.com/i/profiles/show/" + username + \
                   "/timeline/tweets?include_available_features=1&" \
                   "include_entities=1&max_position=" + next_pointer + "&reset_error_state=false"

        next_response = supervisedRequests.get(next_url)
        if next_response is None:
            return replies

        try:
            tweets_obj = json.loads(next_response.text)
        except ValueError:
            # Body is not JSON (json.JSONDecodeError subclasses ValueError).
            # A bare except here would also swallow KeyboardInterrupt.
            break
        if not isinstance(tweets_obj, dict):
            break

        # has_more_items is deliberately not consulted: in one observed case
        # it was false while more items existed, so the cursor alone decides.
        # A missing cursor used to crash the URL concatenation with TypeError,
        # and an unchanged cursor would refetch the same page forever.
        new_pointer = tweets_obj.get("min_position")
        if not new_pointer or new_pointer == next_pointer:
            # NOTE(review): items_html of this final response is not parsed,
            # same as before - confirm the last page never carries tweets.
            print("\nNo more tweets returned")
            break

        next_pointer = new_pointer
        soup = BeautifulSoup(tweets_obj.get("items_html") or "", 'lxml')

        print("Next Page")
        pagesFetched += 1

        if pagesAmount is not None and pagesFetched >= pagesAmount:
            return replies

    return replies

In [81]:
# Load the account handles to scrape, one per line. Blank lines are skipped
# and surrounding whitespace (including a stray "\r") is stripped, so the
# last handle is kept whether or not the file ends with a newline.
with open("stockTwitters", "r") as fp:
    account_list = [line.strip() for line in fp if line.strip()]

In [None]:
# For every account: scrape its reply data, dump the raw records to
# <directory>/<account>.json, and append one CSV row per reply with text.
csvFile = "twitterCommentsWithServiceInfo.csv"
directory = "twitterCommentsWithServiceInfo"
makeDirectoryIfNotExist(directory)

# Running row index across all accounts. It restarts at 0 on each run while
# the CSV is opened in append mode, so re-running adds rows with repeated indices.
idx = 0
for stock in account_list:
    commentData = get_replies_data(stock)

    with open(os.path.join(directory, stock + ".json"), "w") as fp:
        json.dump(commentData, fp)

    # newline="" is what the csv module requires to avoid blank rows on
    # Windows; utf-8 keeps non-ASCII tweet text from failing under a narrower
    # platform default encoding.
    with open(csvFile, "a", newline="", encoding="utf-8") as fp:
        writer = csv.writer(fp)
        for cd in commentData:
            if cd["text"] is None:
                continue
            # Timestamp is rendered in the machine's local timezone.
            dateTime = datetime.datetime.fromtimestamp(int(cd["timestamp"])).strftime("%d-%m-%Y %H:%M:%S")
            # NOTE(review): the link uses the scraped account's handle, not the
            # reply author's userName - confirm this is the intended URL.
            link = "https://twitter.com/" + stock + "/status/" + str(cd["statusId"])
            record = [idx, dateTime, cd["timestamp"], cd["userId"], cd["userName"], cd["text"], link, stock]
            writer.writerow(record)
            idx += 1

    # Release the per-account records before the next (possibly large) scrape.
    del commentData



Downloading tweets for OKEx_
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page

No more tweets returned


Downloading tweets for HuobiGlobal
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
response.status_code == 404
url: https://twitter.com/ichbinrobotech/status/997846210982567936
next url: https://twitter.com/i/ichbinrobotech/conversation/997846210982567936?include_availa

Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page

No more tweets returned


Downloading tweets for digifinex
Next Page
response.status_code == 404
response.status_code == 503
url: https://twitter.com/digifinex/status/1014427028752293888
next url: https://twitter.com/i/digifinex/conversation/1014427028752293888?include_available_features=1&include_entities=1&max_position=DAACDwABCgAAAB0OFDGOaJXgBQ4UBD4UlTABDhP9vAjWkAAOMTcCfxbQAA4kYoGjlQAADicL1MAVAAAOFAJRcNTAAA4UAQ-ZV-ABDhQAH52UEAAOE_7iWldQAA4T_kXllTAADhQI2jIUwAAOF18Q3dTwAQ4UPGb_FPABDjFP5udXgAAOLUC3jpRgAA4nCnqA1TAADhzZfxzV4AEOGygKaRTQAA4bD9Z9lBAADhiey3lU8AAOGJ5-hZfgAA4YQoiXFQAADhgIkWKVAAAOF-vBzhawAA4X633zV1ACDhfY31GXcAAOF7jOOZaQAQ4Xi9nx1tACCAADAAAAAQIABAAAAA&reset_error_state=false
response.status_code == 404
Next Page
Next Page

No more tweets returned


Downloading tweets for bitinka
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Next Page
Nex