In [1]:
from msedge.selenium_tools import Edge, EdgeOptions
from selenium.webdriver.common.keys import Keys
from time import sleep
import sys
import re
from collections import defaultdict
import pandas as pd
import datetime as dt
from datetime import timedelta
from hashlib import sha256
import os.path
from src import DATA_DIR
from hazm import Normalizer, sent_tokenize, word_tokenize
import plotly.express as px
from collections import Counter

import warnings
warnings.filterwarnings("ignore")

In [22]:
class Twitter_scraper:
    """Scrape tweets from Twitter's web search through a Selenium driver.

    The scraper types a search query into the search box, opens the 'Latest'
    tab, scrolls the result feed and appends every tweet it has not seen yet
    to a CSV file under ``DATA_DIR / 'data'``.
    """

    # Column order of the exported CSV files.
    COLUMNS = ['Tweet_ID', 'Username',
               'User_ID', 'DateTime',
               'Text', 'Reply_to',
               'Hashtag', 'Reply',
               'Retweet', 'Like',
               ]

    # Fixed shift added to the UTC timestamps read from the page (+04:30).
    # NOTE(review): presumably Tehran daylight time -- confirm it is still wanted.
    UTC_OFFSET = timedelta(hours=4.5)

    SAVE_EVERY = 100          # flush the in-memory rows to disk every N tweets
    REFRESH_EVERY = 500       # re-run the search every N tweets
    MAX_SCROLL_ATTEMPTS = 5   # give up after this many scrolls without movement
    TWEET_WINDOW = 24         # only the newest N rendered articles are parsed

    SEARCH_BOX_XPATH = '//label[@data-testid="SearchBox_Search_Input_label"]'
    REPLY_TO_XPATH = ('.//div[@class="css-901oao r-1bwzh9t r-37j5jr r-a023e6 '
                      'r-16dba41 r-rjixqe r-bcqeeo r-qvutc0"]')

    def __init__(self, driver):
        """Keep a reference to an already logged-in/ready Selenium ``driver``."""
        self.driver = driver

    def save_file(self, list_, exprot_file_name):
        """Append scraped rows to ``DATA_DIR/data/<exprot_file_name>.csv``.

        :param list_: list of rows, each ordered like ``COLUMNS``.
        :param exprot_file_name: file name without the ``.csv`` extension.

        The header is written only when the file does not exist yet; later
        calls append rows without repeating it.
        """
        path = DATA_DIR / f'data/{exprot_file_name}.csv'
        is_new_file = not os.path.isfile(path)

        df = pd.DataFrame(list_, columns=self.COLUMNS)
        df.to_csv(path, encoding="utf-8-sig",
                  mode='w' if is_new_file else 'a',
                  header=is_new_file, index=False)

    def create_search_query(self, search_query:dict):
        '''
        *** Create Search Query ***
        :param search_query: (dict) Every key is optional; missing or empty
            keys are skipped:
            - include: (list of lists) Words that must be in the tweet text.
              The words of one element are combined with 'OR', the elements
              themselves with 'AND'.

            - not_include: (list of lists) Words that must not be in the tweet
              text. The words of one element are combined with 'AND', the
              elements themselves with 'OR', and the whole group is negated.

            - from: (list) Only tweets sent from these accounts

            - to: (list) Only tweets sent in reply to these accounts

            - lang: (str, e.g. 'fa') Language filter

            - since: (str, e.g. "2015-12-21") Tweets sent since this date

            - until: (str, e.g. "2022-12-21") Tweets sent until this date

            - mentioning: (list or str) Accounts (without '@') mentioned in
              the tweet

            The 'question' key described in earlier versions of this docstring
            is not read by this method.

        :return: (str) the query text to type into the Twitter search box
        '''
        parts = []

        include = search_query.get('include') or []
        if include:
            groups = ["((" + ") OR (".join(words) + "))" for words in include]
            parts.append("((" + ") AND (".join(groups) + "))")

        not_include = search_query.get('not_include') or []
        if not_include:
            groups = ["((" + ") AND (".join(words) + "))" for words in not_include]
            parts.append("-((" + ") OR (".join(groups) + "))")

        if search_query.get('from'):
            parts.append('(from:' + ' OR from:'.join(search_query['from']) + ')')

        if search_query.get('to'):
            parts.append('(to:' + ' OR to:'.join(search_query['to']) + ')')

        if search_query.get('lang'):
            parts.append('lang:' + search_query['lang'])

        if search_query.get('since'):
            parts.append(f"since:{search_query['since']}")

        if search_query.get('until'):
            parts.append(f"until:{search_query['until']}")

        mentioning = search_query.get('mentioning')
        if mentioning:
            # A single account may be given as a plain string; the documented
            # form is a list. Joining the list itself (not a list wrapping it)
            # is what makes the list form work.
            if isinstance(mentioning, str):
                mentioning = [mentioning]
            parts.append('(@' + ' @'.join(mentioning) + ')')

        return " ".join(parts)

    def _load_seen_ids(self, exprot_file_name, n_queries):
        """Collect the Tweet_IDs already stored by earlier runs.

        Looks at ``<name>.csv`` and at the per-query files ``<name>_<i>.csv``
        that ``get_tweet_data`` writes, so a re-run does not store duplicates.
        """
        names = [exprot_file_name]
        names += [f'{exprot_file_name}_{i}' for i in range(n_queries)]

        seen = set()
        for name in names:
            path = DATA_DIR / f'data/{name}.csv'
            if os.path.isfile(path):
                df = pd.read_csv(path, encoding="utf-8-sig", usecols=['Tweet_ID'])
                seen.update(df.Tweet_ID)
        return seen

    def _run_search(self, search_text):
        """Type ``search_text`` into the search box and open the 'Latest' tab."""
        search = self.driver.find_element('xpath', self.SEARCH_BOX_XPATH)
        search.send_keys(Keys.CONTROL + "a")
        search.send_keys(Keys.DELETE)
        search.send_keys(search_text)
        search.send_keys(Keys.RETURN)

        # Go to 'Latest' tweets
        sleep(10)
        self.driver.find_element('link text', 'Latest').click()

    def _resume_query(self, search_text, last_seen):
        """Return ``search_text`` limited to tweets up to the day of ``last_seen``.

        ``last_seen`` carries ``UTC_OFFSET``; it is shifted back before the day
        is taken. One day is added because ``until:`` is understood to exclude
        the given day -- NOTE(review): confirm against the search operator docs.
        """
        resume_day = (last_seen - self.UTC_OFFSET + timedelta(days=1)).date()
        base = re.sub(r'\s*until:\d{4}-\d{2}-\d{2}', '', search_text).strip()
        return f"{base} until:{resume_day}"

    def _parse_tweet(self, tweet):
        """Turn one tweet ``<article>`` element into a row ordered like ``COLUMNS``.

        Raises whatever Selenium or the parsing raises when the article does
        not have the expected structure; the caller skips such articles.
        """
        reply_to, hashtag = str(), str()
        text = tweet.find_element('xpath', './/div[@data-testid="tweetText"]').text

        if 'Replying to \n@' in tweet.text:
            # The class list is copied from the rendered page and is brittle.
            handles = tweet.find_element('xpath', self.REPLY_TO_XPATH).text.split()
            reply_to = [id_ for id_ in handles if id_[0] == '@']

        if '#' in text:
            hashtag = [word for word in text.split() if word[0] == '#']

        reply = tweet.find_element('xpath', './/div[@data-testid="reply"]').text
        retweet = tweet.find_element('xpath', './/div[@data-testid="retweet"]').text
        like = tweet.find_element('xpath', './/div[@data-testid="like"]').text

        header = tweet.find_element('xpath', './/div[@data-testid="User-Names"]')
        stamp = header.find_element('xpath', './/time').get_attribute('datetime')
        posted_at = dt.datetime.strptime(stamp, '%Y-%m-%dT%H:%M:%S.000Z') + self.UTC_OFFSET

        info = header.text.split('@')
        username = info[0].strip('\n')
        user_id = '@' + info[1].split()[0]

        # No native tweet id is read from the page, so a digest of
        # author + time + text serves as the de-duplication key.
        tweet_id = sha256("".join([user_id, str(posted_at), text]).encode()).hexdigest()

        return [tweet_id, username, user_id, posted_at, text,
                reply_to, hashtag, reply, retweet, like]

    def get_tweet_data(self, exprot_file_name, search_query, max_tweet=None):
        '''
        *** Extract tweets ***
        :param exprot_file_name: (str) Base name of the output files. Rows of
            the i-th query are written to 'data/<exprot_file_name>_<i>.csv'.

        :param search_query: (dict or path)
            - If search_query is a path, it should point to a .txt file with
              one search query per line (blank lines are ignored).
            - If search_query is a dict, it should be defined like in the
              'create_search_query' method.

        :param max_tweet: (int or None) Total number of new tweets to collect
            over all queries. None means no limit.
        '''
        if max_tweet is not None and max_tweet <= 0:
            return

        sleep(1)
        self.driver.get('https://twitter.com/explore')
        sleep(4)

        # Create Search Query
        if isinstance(search_query, dict):
            search_list = [self.create_search_query(search_query)]
        else:
            # utf-8-sig reads UTF-8 with or without a BOM.
            with open(search_query, encoding='utf-8-sig') as fp:
                search_list = [line.strip() for line in fp.read().splitlines()
                               if line.strip()]

        # Read Unique IDs
        unique_tweet_ids = self._load_seen_ids(exprot_file_name, len(search_list))

        progress = 1
        for exp_ind, search_text in enumerate(search_list):
            target = f'{exprot_file_name}_{exp_ind}'

            # Enter Search Query in Twitter search box
            sleep(4)
            self._run_search(search_text)

            # Scraping
            last_positon = self.driver.execute_script('return window.pageYOffset;')
            scrolling = True
            list_ = []

            while scrolling:
                sleep(2)
                tweets = self.driver.find_elements('xpath', '//article[@data-testid="tweet"]')

                for tweet in tweets[-self.TWEET_WINDOW:]:
                    try:
                        row = self._parse_tweet(tweet)
                    except Exception:
                        # Best effort: ads, stale or half-rendered articles are
                        # skipped. Only parsing is guarded, so file errors and
                        # Ctrl-C are no longer swallowed.
                        continue

                    tweet_id, posted_at = row[0], row[3]
                    if tweet_id in unique_tweet_ids:
                        continue

                    sys.stdout.write('\r')
                    sys.stdout.write(f"Saved: {progress - progress % 100}       " +
                                     f"Current_State: [ Search_Query: {exp_ind + 1} from {len(search_list)}    " +
                                     f"Tweet: {progress}    " +
                                     f"Date: {str(posted_at)[:7]} ] {' '*20}‌")
                    sys.stdout.flush()
                    progress += 1

                    list_.append(row)
                    unique_tweet_ids.add(tweet_id)

                    # Comparing against None raises TypeError, so the limit is
                    # only evaluated when one was given.
                    limit_reached = max_tweet is not None and progress > max_tweet

                    if len(list_) >= self.SAVE_EVERY or limit_reached:
                        self.save_file(list_, target)
                        list_ = []

                    if limit_reached:
                        return

                    if progress % self.REFRESH_EVERY == 0:
                        try:
                            self._run_search(self._resume_query(search_text, posted_at))
                            last_positon = self.driver.execute_script('return window.pageYOffset;')
                        except Exception:
                            # Refresh is optional; keep scrolling the current feed.
                            pass
                        # Elements collected before the refresh are stale now.
                        break

                # Scrolling
                scroll_attemp = 0

                while True:
                    self.driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
                    sleep(4)
                    current_positon = self.driver.execute_script('return window.pageYOffset;')

                    if current_positon != last_positon:
                        last_positon = current_positon
                        break

                    scroll_attemp += 1
                    if scroll_attemp >= self.MAX_SCROLL_ATTEMPTS:
                        # The feed stopped growing: flush what is left and
                        # finish this query.
                        if len(list_) > 0:
                            self.save_file(list_, target)
                            list_ = []

                        scrolling = False
                        break

                    sleep(5)

## Scraping Twitter

In [3]:
# Machine-specific locations of the Edge browser and its WebDriver executable.
EDGE_BINARY_PATH = r"C:\Program Files (x86)\Microsoft\Edge\Application\msedge.exe"
EDGE_DRIVER_PATH = '/mnt/f/msedgedriver.exe'

# Configure a Chromium-based Edge session that starts with the "-inprivate" flag.
options = EdgeOptions()
options.use_chromium = True
options.binary_location = EDGE_BINARY_PATH
options.add_argument("-inprivate")

# Start the browser; `driver` is handed to the scraper in the next cell.
driver = Edge(EDGE_DRIVER_PATH, options=options)

In [23]:
# Wrap the running Edge session in the scraper. Despite its name, `tweet` is the
# scraper instance (not a tweet); the scraping cell below calls it.
tweet = Twitter_scraper(driver)

##### 01

In [24]:
# Persian-language ('lang': 'fa') tweets that contain at least one word of each
# 'include' group; every 'not_include' group lists words to exclude.
# NOTE(review): 'فرابوس' looks like a misspelling of 'فرابورس' -- confirm before re-running.
query_01 = {
    'include': [
        ['بورس', 'شاخص کل', 'فرابوس'],
        ['شاخص کل', 'شاخص قیمت'],
    ],
    'not_include': [
        ['ایران'],
        ['آخوند'],
        ['جمهوری'],
    ],
    'lang': 'fa',
}

# max_tweet=None: no tweet limit is passed.
tweet.get_tweet_data(
    exprot_file_name='All_In_One',
    search_query=query_01,
    max_tweet=None,
)


# Alternative: read the search queries from a text file instead of a dict.
# tweet.get_tweet_data(
#     search_query=DATA_DIR / 'data/Input/search_query.txt',
#     max_tweet=None,
#     exprot_file_name='All_In_One')

Saved: 300       Current_State: [ Search_Query: 1 from 1    Tweet: 318    Date: 2022-07 ]                     ‌