In [4]:
from msedge.selenium_tools import Edge, EdgeOptions
from selenium.webdriver.common.keys import Keys
from time import sleep
import sys
import re
from collections import defaultdict
import pandas as pd
import datetime as dt
from datetime import timedelta
from hashlib import sha256
import os.path
from src import DATA_DIR
from hazm import Normalizer, sent_tokenize, word_tokenize
import plotly.express as px
from collections import Counter
from itertools import groupby
import copy

import warnings
warnings.filterwarnings("ignore")

In [5]:
class Twitter_scraper:
    def __init__(self, driver):
        """
        Store the browser driver used for scraping.

        :param driver: driver object the other methods call `get`, `find_element`,
            `find_elements`, `execute_script` and `close` on.
        """
        self.driver = driver
        

    def save_file(self, list_, exprot_file_name):
        """
        Write a batch of scraped rows to ``data/<exprot_file_name>.csv`` under DATA_DIR.

        The file is created with a header row when it does not exist yet; otherwise the
        rows are appended without a header.

        :param list_: (list) Rows to write, each holding the eleven values named below.
        :param exprot_file_name: (str) CSV file name without the ``.csv`` extension.
        """
        column_names = [
            'Search_Query', 'Tweet_ID',
            'Username', 'User_ID',
            'DateTime', 'Text',
            'Reply_to', 'Hashtag',
            'Reply', 'Retweet',
            'Like',
        ]
        frame = pd.DataFrame(list_, columns=column_names)

        csv_path = DATA_DIR / f'data/{exprot_file_name}.csv'
        file_exists = os.path.isfile(csv_path)

        # New file: write mode with header. Existing file: append mode, no header.
        frame.to_csv(
            csv_path,
            encoding="utf-8-sig",
            mode='a' if file_exists else 'w',
            header=not file_exists,
            index=False,
        )
            
            
    
    
    def create_search_query(self, search_query, exclude_replies=False):
        '''
        *** Create Search Query ***
        Build a Twitter search string from a dictionary of filters.

        :param search_query: (dict) May contain the following keys:
            - include: (list) Words that must be in the tweet text. Each element is a list of words.
              The words inside one element are combined with 'AND' and the different elements are
              combined with 'OR', e.g. [['a', 'b'], ['c', 'd']] -> (((a) AND (b)) OR ((c) AND (d))).

            - not_include: (list) Same structure as 'include'; the resulting expression is negated with '-'.

            - from: (list) Filters tweets that were sent from specific Twitter accounts

            - not_from: (list) Excludes tweets that were sent from specific Twitter accounts

            - to: (list) Filters tweets that were sent in reply to specific Twitter accounts

            - lang: (str, samp = "fa") Language code of the tweets

            - since: (str, samp = "2015-12-21") Tweets sent since a specific date

            - until: (str, samp = "2022-12-21") Tweets sent until a specific date

            - mentioning: (list) Tweets that mention specific Twitter accounts

            - question: not implemented; the key is ignored.

        :param exclude_replies: (bool) If True, 'exclude:replies' is appended to the query.
        :return: (str) The search text; the individual filters are separated by a single space.
        '''

        def as_list(value):
            # A bare string counts as one item, so it is never split into characters.
            if value is None:
                return []
            if isinstance(value, str):
                return [value]
            return list(value)

        def accounts(key):
            # Account names without a leading '@' and without empty entries.
            names = [str(name).lstrip('@') for name in as_list(search_query.get(key))]
            return [name for name in names if name]

        def word_groups(key):
            # One parenthesised expression per non-empty element of search_query[key].
            groups = []
            for elem in as_list(search_query.get(key)):
                words = [str(word) for word in as_list(elem) if str(word)]
                if len(words) > 1:
                    groups.append("((" + ") AND (".join(words) + "))")
                elif len(words) == 1:
                    groups.append("(" + words[0] + ")")
            return groups

        parts = []

        # Decide on the number of groups actually built, so empty elements cannot cause an IndexError.
        include_groups = word_groups('include')
        if len(include_groups) > 1:
            parts.append("(" + " OR ".join(include_groups) + ")")
        elif include_groups:
            parts.append(include_groups[0])

        exclude_groups = word_groups('not_include')
        if len(exclude_groups) > 1:
            parts.append("-(" + " OR ".join(exclude_groups) + ")")
        elif exclude_groups:
            parts.append("-" + exclude_groups[0])

        senders = accounts('from')
        if senders:
            parts.append('(from:' + ' OR from:'.join(senders) + ')')

        blocked_senders = accounts('not_from')
        if blocked_senders:
            parts.append(' '.join('-from:' + name for name in blocked_senders))

        recipients = accounts('to')
        if recipients:
            parts.append('(to:' + ' OR to:'.join(recipients) + ')')

        if search_query.get('lang'):
            parts.append('lang:' + str(search_query['lang']))

        if search_query.get('since'):
            parts.append(f"since:{search_query['since']}")

        if search_query.get('until'):
            parts.append(f"until:{search_query['until']}")

        mentioned = accounts('mentioning')
        if mentioned:
            parts.append('(@' + ' @'.join(mentioned) + ')')

        if exclude_replies:
            parts.append('exclude:replies')

        # Joining once guarantees exactly one space between filters and none at the start.
        return ' '.join(parts)



    def get_tweet_data(self, exprot_file_name, search_query, exclude_replies=False, max_tweet=None, error=0):
        # try:
        # NOTE: the try/except wrapper is commented out (see the end of the method); the body
        # keeps the extra indentation level it had inside the `try`.
            '''
            *** Extract tweets ***
            Run one or more searches in the browser and append the tweets found to a CSV file.

            The driver is sent to https://twitter.com/explore. Every query is typed into the
            search box, the 'Latest' link is clicked and the page is scrolled step by step
            while the tweet articles currently in the page are parsed. Rows are buffered and
            written through `save_file` in batches of 100. `self.driver.close()` is called
            once all queries have been processed.

            :param exprot_file_name: (str) Passed to `save_file` as the CSV base name. The progress
                line prints the slice [-24:-4] of it.

            :param search_query: (dict or path)
                - If search_query is a dict, one query is built from it with the 'create_search_query' method.
                - Otherwise it is opened as a text file and each line is used as one query.

            :param exclude_replies: (bool) Forwarded to 'create_search_query'; only used when
                search_query is a dict.

            :param max_tweet: (int) When the running tweet counter exceeds this number, scraping of the
                current query stops. None (or 0) disables the limit. The counter is shared by all
                queries of one call and is never reset.

            :param error: Only echoed in the progress line.
            '''

            # Ids of every tweet stored so far in this call (shared by all queries).
            unique_tweet_ids = set()
            
            sleep(1)
            self.driver.get('https://twitter.com/explore')
            sleep(4)
            
            
            # Create Search Query
            search_list = []
            if isinstance(search_query, dict):
                search_list.append(self.create_search_query(search_query, exclude_replies))
            else:
                # Anything that is not a dict is treated as a path: one query per line.
                with open(search_query) as fp:
                    search_list = fp.read().split('\n')
            
            # 1-based counter of the next tweet to store; also compared against max_tweet.
            progress = 1
            for exp_ind, search_text in enumerate(search_list):
                # Enter Search Query in Twitter search box
                search = self.driver.find_element('xpath', '//label[@data-testid="SearchBox_Search_Input_label"]')
                # Select all + delete clears whatever the box still holds before typing.
                search.send_keys(Keys.CONTROL + "a")
                search.send_keys(Keys.DELETE)
                search.send_keys(search_text)
                search.send_keys(Keys.RETURN)

                # Go to 'Latest' tweets
                sleep(2)
                self.driver.find_element('link text', 'Latest').click()

                # Scraping
                scrolling = True
                # Rows waiting to be written by save_file.
                list_ = []
                # NOTE(review): j and flag are never read, and k is only incremented.
                j = 1
                k = 0
                flag = 0

                # Page zoom is set to 50% before parsing starts.
                self.driver.execute_script("document.body.style.zoom='50%'")
                sleep(2)

                while scrolling:
                    sleep(0.5)
                    tweets = self.driver.find_elements('xpath', '//article[@data-testid="tweet"]')
                    sleep(0.2)
                    k += 1

                    # Only the last (at most) 10 articles found are inspected on each pass.
                    for tweet in tweets[-min(10, len(tweets)):]:
                        header = tweet.find_element('xpath', './/div[@data-testid="User-Names"]')
                        _ = header.find_element('xpath', './/time').get_attribute('datetime')
                        # The parsed timestamp is shifted by +4.5 hours.
                        # NOTE(review): presumably a conversion to local time — confirm the offset.
                        datetime = dt.datetime.strptime(_, '%Y-%m-%dT%H:%M:%S.000Z') + timedelta(hours=4.5)

                        # NOTE(review): the class string looks auto-generated and may change — confirm.
                        info = header.find_elements('xpath', './/div[@class="css-1dbjc4n r-1wbh5a2 r-dnmrzs"]')
                        username = info[0].text
                        user_id = info[1].text

                        # Skip articles whose user id text is empty.
                        if not len(user_id):
                            continue

                        # Tweet_ID is a local key: sha256 of user id + shifted timestamp.
                        tweet_id = sha256("".join([user_id, str(datetime)]).encode()).hexdigest()

                        if tweet_id in unique_tweet_ids:
                            continue
                        
                        unique_tweet_ids.add(tweet_id)


                        # Both start as empty strings and are replaced by lists when matches are found.
                        reply_to, hashtag = str(), str()
                        text = tweet.find_element('xpath', './/div[@data-testid="tweetText"]').text

                        reply_to = ''
                        if 'Replying to \n@' in tweet.text:
                            _ = tweet.find_elements('xpath', './/div[@class="css-901oao r-1bwzh9t r-37j5jr r-a023e6 r-16dba41 r-rjixqe r-bcqeeo r-qvutc0"]')
                            
                            if len(_):
                                # Keep only the whitespace-separated tokens that start with '@'.
                                _ = _[0].text.split()
                                reply_to = [id_ for id_ in _ if id_[0] == '@']

                        if '#' in text:
                            # Keep only the whitespace-separated tokens that start with '#'.
                            words = text.split()
                            hashtag = [word for word in words if word[0] == '#']

                        # The counters are stored as the raw text of the three action elements.
                        reply = tweet.find_element('xpath', './/div[@data-testid="reply"]').text
                        retweet = tweet.find_element('xpath', './/div[@data-testid="retweet"]').text
                        like = tweet.find_element('xpath', './/div[@data-testid="like"]').text


                        # '\r' returns to the line start so the progress line is overwritten in place.
                        sys.stdout.write('\r')
                        sys.stdout.write(f"Error: {error}    " +
                                         f"File: {exprot_file_name[-24:-4]}    " + 
                                         f"Tweet: {progress}    " +
                                         f"Date: {str(datetime)} {' '*10}‌")
                        
                        sys.stdout.flush()
                        progress += 1

                        # The first column (Search_Query) holds the index of the query in search_list.
                        list_.append([exp_ind, tweet_id, username, user_id, datetime, text, reply_to, hashtag, reply, retweet, like])

                        # Flush the buffer every 100 rows.
                        if len(list_) >= 100:
                            self.save_file(list_, exprot_file_name)
                            list_ = []

                        if max_tweet:
                            if progress > max_tweet:
                                # Limit reached: write what is buffered and leave the outer while loop.
                                self.save_file(list_, exprot_file_name)
                                list_ = []
                                scrolling = False
                                break


                    # Scrolling
                    scroll_attemp = 0
                    
                    # Position before scrolling; it is not updated inside the loop below.
                    last_positon = self.driver.execute_script('return window.pageYOffset;')

                    while True:
                        # Move 450 px down from the current offset.
                        self.driver.execute_script(f"window.scrollTo(0, {self.driver.execute_script('return window.pageYOffset;')+450})")
                        sleep(1)
                        current_positon = self.driver.execute_script('return window.pageYOffset;')

                        if current_positon == last_positon:
                            # The page did not move: count the failed attempt.
                            scroll_attemp += 1
                            
                            if scroll_attemp == 2:
                                # Second failure: scroll 1800 px up, then jump to the bottom of the page.
                                self.driver.execute_script(f"window.scrollTo(0, {self.driver.execute_script('return window.pageYOffset;')-1800})")
                                sleep(2)
                                self.driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
                                sleep(2)
                            
                            if scroll_attemp >= 3:
                                # Third failure: write the remaining rows and stop this query.
                                if len(list_) > 0:
                                    self.save_file(list_, exprot_file_name)
                                    list_ = []

                                scrolling = False
                                break
                            else:
                                sleep(2)
                        else:
                            # The page moved: go back to parsing.
                            break

            # Runs once, after the last query.
            self.driver.close()

        # except Exception as e:
        #     print(f"error: {e}")

In [6]:
class Gephi_Chart:
    def __init__(self):
        """Nothing to initialise: the instance holds no state."""
        pass


    def replace_compound_words(self, input_, compound_words_path, mode=1):
        """
        Join (or split again) the multi-word expressions listed in a text file.

        :param input_: (pandas.Series of str) Texts to transform.
        :param compound_words_path: Path of a UTF-8 text file with one compound expression per line.
        :param mode: 1 replaces every expression with its underscore-joined form
            ('new york' -> 'new_york'); any other value performs the reverse replacement.
        :return: (pandas.Series) The transformed series.
        """
        # Read explicitly as UTF-8 ('utf-8-sig' also drops a leading BOM) instead of the locale encoding.
        with open(compound_words_path, encoding='utf-8-sig') as fp:
            lines = fp.read().split('\n')

        pairs = []
        for phrase in lines:
            joined = '_'.join(phrase.split())
            # Blank lines and single words would only produce no-op replacements.
            if joined and joined != phrase:
                pairs.append((phrase, joined))

        # Longest pattern first, so a short expression cannot pre-empt a longer one containing it.
        # regex=False: the expressions are literal text, whatever the pandas default is.
        if mode == 1:
            for phrase, joined in sorted(pairs, key=lambda pair: len(pair[0]), reverse=True):
                input_ = input_.str.replace(phrase, joined, regex=False)

        else:
            for phrase, joined in sorted(pairs, key=lambda pair: len(pair[1]), reverse=True):
                input_ = input_.str.replace(joined, phrase, regex=False)

        return input_


    def create_node_edge(self, tweet_data_dir, theme_dir, output_dir, file_name_, remove_word):
        """
        Build the node, edge and event-log CSV files of a theme co-occurrence graph.

        Every keyword of the theme table found in a tweet is replaced with its theme.
        Nodes are the themes, weighted by their number of occurrences. For every tweet,
        each ordered pair of different themes (first one appearing earlier in the tweet)
        adds 1 to the weight of the edge between them, at most once per tweet.

        :param tweet_data_dir: Path of the tweets CSV; its 'Text' column is used.
        :param theme_dir: Path of the themes CSV; its first three columns are read by position as
            keyword, theme name and class.
        :param output_dir: Directory that receives '<file_name_>_nodes.csv',
            '<file_name_>_edges.csv' and 'event_log.csv'.
        :param file_name_: (str) Prefix of the node and edge files.
        :param remove_word: Path of a UTF-8 text file with one pattern per line; tweets whose text
            contains a pattern are discarded.
        """
        # Marker placed between tweets so the joined text can be split per tweet again.
        separator = 'Oadf#dsaf#dfa'

        df = pd.read_csv(tweet_data_dir)
        # df = df.drop_duplicates(subset="Tweet_ID")

        # Rows without text carry no theme and would break both the filter and the join below.
        texts = df['Text'].dropna().astype(str)

        with open(remove_word, encoding='utf-8-sig') as fp:
            # A blank line would be an empty pattern, which matches (and removes) every tweet.
            banned = [line for line in fp.read().split('\n') if line.strip()]

        for word in banned:
            texts = texts[~texts.str.contains(word)]

        tweet_texts = f' {separator} '.join(texts)

        # keyword -> (placeholder, node id). The placeholder is a short hash of the theme name;
        # the node id is the theme name with spaces replaced by underscores.
        theme_table = pd.read_csv(theme_dir)
        themes = {}
        node_info = {}  # node id -> (label, class)
        for keyword, theme_name, theme_class in theme_table.iloc[:, :3].itertuples(index=False, name=None):
            node_id = theme_name.replace(" ", "_")
            themes[keyword] = (sha256(theme_name.encode()).hexdigest()[:18], node_id)
            node_info[node_id] = (theme_name, theme_class)

        # Detect Nodes
        # Pass 1: keyword -> placeholder, longest keyword first so that a keyword contained
        # in a longer one does not pre-empt it.
        for keyword in sorted(themes, key=len, reverse=True):
            tweet_texts = tweet_texts.replace(keyword, themes[keyword][0])

        # Pass 2: placeholder -> node id (several keywords may share one placeholder).
        for placeholder, node_id in dict(themes.values()).items():
            tweet_texts = tweet_texts.replace(placeholder, node_id)

        wanted = {node_id for _, node_id in themes.values()}
        wanted.add(separator)

        # Keep only theme tokens and separators, then cut the stream into one list per tweet.
        # Tweets without any theme produce no list.
        tokens = [token for token in tweet_texts.split() if token in wanted]
        tweet_list = [list(group)
                      for is_separator, group in groupby(tokens, lambda token: token == separator)
                      if not is_separator]

        node_rows = [(node_id, node_info[node_id][0], weight, node_info[node_id][1])
                     for node_id, weight in Counter(tokens).most_common()
                     if node_id != separator]
        df_output_nodes = pd.DataFrame(node_rows, columns=['Id', 'Label', 'Weight', 'Class'])
        df_output_nodes.to_csv(os.path.join(output_dir, f'{file_name_}_nodes.csv'), index=False, encoding="utf-8-sig")

        # Event log: one row per theme occurrence, only for tweets holding at least 4 themes.
        # 'Id' is the position of the tweet among the tweets that contain a theme.
        event_log = [[ind, theme]
                     for ind, tweet in enumerate(tweet_list) if len(tweet) >= 4
                     for theme in tweet]
        df_event_log = pd.DataFrame(event_log, columns=['Id', 'Theme'])
        df_event_log.to_csv(os.path.join(output_dir, 'event_log.csv'), index=False, encoding="utf-8-sig")

        # Detect Edges
        edges = defaultdict(int)

        for tweet in tweet_list:
            counted = set()  # pairs already counted for this tweet
            for i, source in enumerate(tweet):
                for target in tweet[i + 1:]:
                    pair = (source, target)
                    if source != target and pair not in counted:
                        edges[pair] += 1
                        counted.add(pair)

        edge_rows = [(source, target, weight) for (source, target), weight in edges.items()]
        df_output_edges = pd.DataFrame(edge_rows, columns=['Source', 'Target', 'Weight'])
        df_output_edges.to_csv(os.path.join(output_dir, f'{file_name_}_edges.csv'), index=False, encoding="utf-8-sig")
        
    
    def remove_stopwords(self, text: str, stopwords: list):
        """
        Tokenize `text` with `word_tokenize` and drop every token listed in `stopwords`.

        :param text: text you want to delete stopwords from
        :param stopwords: list of stopwords
        :return: the remaining tokens joined with single spaces
        """
        # A set gives constant-time membership tests; the list was scanned once per token.
        stopword_set = set(stopwords)
        return " ".join(token for token in word_tokenize(text) if token not in stopword_set)


    def most_common_words(self, tweet_data_dir, stopwords_dir, output_dir, remove_word):
        df = pd.read_csv(tweet_data_dir)
        df = df.drop_duplicates(subset = "Tweet_ID")
        
        df.Text = self.replace_compound_words(df.Text, DATA_DIR / 'data/Input/compound_words.txt')
        
        with open(remove_word) as fp:
            remove_ = fp.read()
        remove_ = remove_.split('\n')
        
        for word in remove_:
            df = df[~df['Text'].str.contains(word)]
        
        with open(stopwords_dir) as fp:
            stopwords = fp.read()
        stopwords = stopwords.split('\n')
        
        list_ = list(df.Text)
        list_ = [str(text) for text in list_]
        text = ' '.join(list_)
        text = self.remove_stopwords(text, stopwords)
        text = text.split()
        df = pd.DataFrame(Counter(text).most_common(), columns=['word', 'count'])
        df.word = self.replace_compound_words(df.word, DATA_DIR / 'data/Input/compound_words.txt', mode=2)
        df.to_csv(output_dir, index=False, encoding="utf-8-sig")

### 
## Scraping Twitter

In [50]:
# Start a Chromium-based Edge session with the "-inprivate" argument,
# using the driver binary stored in DATA_DIR.
options = EdgeOptions()
options.use_chromium = True
options.add_argument("-inprivate")
driver = Edge(DATA_DIR / 'msedgedriver.exe', options=options)

tweet = Twitter_scraper(driver)


# Scrape every tweet matching the query below; rows are written by
# Twitter_scraper.save_file to DATA_DIR / 'data/Herbal_Tea.csv'.
# NOTE(review): confirm that create_search_query handles every key used here
# (in particular 'not_from').
tweet.get_tweet_data(
    search_query={
        'include': [
            ['مهرگیاه', 'دمنوش'],
            ['نیوشا' , 'دمنوش'],
        ],
        
        'lang': 'fa',
        'not_from': ['jahan_n_'],
        'since': '2020-01-01',
        'until': '2022-05-01'
    },

    # None: no limit on the number of tweets.
    max_tweet=None,
    exprot_file_name='Herbal_Tea')


KeyboardInterrupt



### 
## Output Analysis

In [9]:
# Most Common Words
# Writes the word-frequency table of the Kitchen tweets to data/Themes/Kitchen_Most_common.csv.

# The class is named Gephi_Chart; the previous spelling 'Gethi_Chart' raised a NameError.
chart = Gephi_Chart()

chart.most_common_words(tweet_data_dir = DATA_DIR / 'data/Kitchen.csv',
                        stopwords_dir = DATA_DIR / 'data/Input/persian_stop_words.txt',
                        output_dir = DATA_DIR / 'data/Themes/Kitchen_Most_common.csv',
                        remove_word = DATA_DIR / 'data/Remove.txt'
                       )

##### Nodes and Edges

In [22]:
# Build the node, edge and event-log CSV files for the 'Tornado Pos' tweets.
# create_node_edge writes them into output_dir, using file_name_ as the prefix
# of the node and edge files.
chart = Gephi_Chart()

chart.create_node_edge(tweet_data_dir = DATA_DIR / 'data/Gephi/Pack/Tornado Pos.csv',
                       theme_dir = DATA_DIR / 'data/Gephi/Pack/Tornado.csv',
                       output_dir = DATA_DIR / 'data/Gephi/Pack/',
                       file_name_ = 'Tornado_Pos',
                       remove_word = DATA_DIR / 'data/Remove.txt',
                       
                      )