In [9]:
from flask import Flask
import requests
import os
import re
import json
import csv
from datetime import datetime, timedelta
import dateutil.parser
import unicodedata
import time

import snscrape.modules.twitter as sntwitter
import atexit
from apscheduler.schedulers.background import BackgroundScheduler

In [7]:
import pandas as pd
import re
from textblob import TextBlob
import csv
import numpy as np
import nltk
import matplotlib.pyplot as plt 
import seaborn as sns
from wordcloud import WordCloud

In [14]:
from dash import Dash
from dash import dcc
from dash import html


In [3]:
from tqdm import tqdm

def deEmojify(text):
    """Return ``text`` with every run of characters from four emoji ranges removed.

    Only the code-point ranges listed below are stripped; any other symbol
    (including emoji outside these ranges) is left untouched.
    """
    emoji_ranges = "".join((
        u"\U0001F600-\U0001F64F",  # emoticons
        u"\U0001F300-\U0001F5FF",  # symbols & pictographs
        u"\U0001F680-\U0001F6FF",  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF",  # flags (iOS)
    ))
    emoji_re = re.compile("[" + emoji_ranges + "]+", flags=re.UNICODE)
    return emoji_re.sub('', text)

def filter_punc(text):
    """Drop non-ASCII characters, then delete a fixed set of punctuation marks.

    The removed set is ``"$%&'()*+-/:<=>?@[\\]^_`{|}~``; characters such as
    ``!``, ``#``, ``,``, ``.`` and ``;`` are not in it and therefore survive.
    """
    # Mapping a code point to None tells str.translate to delete it.
    removal_table = dict.fromkeys(map(ord, '"$%&\'()*+-/:<=>?@[\\]^_`{|}~'))
    ascii_only = ''.join(ch for ch in text if ord(ch) <= 127)
    return ascii_only.translate(removal_table)


In [4]:
def run_processing_steps(tweets_df):
    """Clean the ``content`` column of a tweets frame and extract hashtags.

    The frame is modified in place and the same object is returned.

    Steps, in order:
      1. drop exact duplicate rows and rows whose ``content`` is missing;
      2. delete newline characters, URLs (``https*\\S+``) and ``@mentions``,
         and turn ``&amp;`` into ``and``;
      3. store the list of ``#hashtags`` found in each tweet in a new
         ``tags`` column;
      4. replace each ``#`` with a space (the tagged word stays in the text),
         strip emoji via ``deEmojify``, collapse runs of whitespace, and
         remove non-ASCII characters / punctuation via ``filter_punc``.

    Parameters
    ----------
    tweets_df : pandas.DataFrame
        Must contain a ``content`` column of tweet text.

    Returns
    -------
    pandas.DataFrame
        ``tweets_df`` itself, with ``content`` cleaned and ``tags`` added.
    """
    tweets_df.drop_duplicates(inplace=True)
    # Drop rows without text first so deEmojify / filter_punc never receive NaN.
    tweets_df.dropna(subset=['content'], inplace=True)

    content = tweets_df['content']

    # Literal replacements pass regex=False explicitly so the result does not
    # depend on the pandas version's default for `regex`.
    content = content.str.replace("\n", "", regex=False)
    # Remove links.
    content = content.str.replace(r"https*\S+", "", regex=True)
    # HTML-escaped ampersand -> "and".
    content = content.str.replace("&amp;", "and", regex=False)
    # Remove @mentions.
    content = content.str.replace(r"@\w+", "", regex=True)

    # 1. Extract hashtags into their own column (one list per tweet).
    #    Raw string: "\w" in a normal string literal is an invalid escape.
    tweets_df['tags'] = content.str.findall(r"#\w+")

    # 2. Remove only the '#' sign and keep the word, because many hashtags
    #    are part of the sentence itself.
    content = content.str.replace("#", " ", regex=False)

    content = content.apply(deEmojify)
    content = content.str.replace(r'\s{2,}', " ", regex=True)
    content = content.apply(filter_punc)
    tweets_df['content'] = content

    # After all processing, drop any row whose content ended up missing.
    tweets_df.dropna(subset=['content'], inplace=True)
    return tweets_df

In [5]:
def get_tweets(keyword, start_time, end_time, limit):
    """Scrape up to ``limit`` English tweets matching ``keyword`` in a time window.

    Parameters
    ----------
    keyword : str
        Search term placed at the start of the query.
    start_time, end_time
        Values interpolated into the query's ``since_time:`` / ``until_time:``
        operators (``twitter_bot`` passes integer Unix timestamps).
    limit : int
        Maximum number of tweets to collect; a value <= 0 yields an empty frame.

    Returns
    -------
    pandas.DataFrame
        One row per tweet with columns ``datetime``, ``id``, ``content``,
        ``username``, ``language``, ``reply_count``, ``retweet_count``,
        ``like_count`` and ``quote_count``.
    """
    columns = ['datetime', 'id', 'content', 'username', 'language',
               'reply_count', 'retweet_count', 'like_count', 'quote_count']
    query = f'{keyword} since_time:{start_time} until_time:{end_time} lang:en'

    tweets_list = []
    for i, tweet in enumerate(sntwitter.TwitterSearchScraper(query).get_items()):
        # `>=` keeps exactly `limit` tweets; the previous `>` let limit + 1 through.
        if i >= limit:
            break
        tweets_list.append([
            tweet.date, tweet.id, tweet.content, tweet.user.username, tweet.lang,
            tweet.replyCount, tweet.retweetCount, tweet.likeCount, tweet.quoteCount,
        ])
    return pd.DataFrame(tweets_list, columns=columns)

In [6]:
def sentiment_analysis(df):
    """Add a ``label`` column holding the sentiment sign of each tweet.

    Each row's ``content`` is scored with TextBlob's polarity; the label is
    ``1`` for a positive score, ``-1`` for a negative score and ``0`` when the
    score is exactly zero.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``content`` column of strings.

    Returns
    -------
    pandas.DataFrame
        A new frame equal to ``df`` plus the ``label`` column; ``df`` itself
        is not modified.
    """
    def _polarity_sign(text):
        # Map the polarity score onto {-1, 0, 1}.
        score = TextBlob(text).sentiment.polarity
        if score > 0:
            return 1
        if score < 0:
            return -1
        return 0

    # One label per row, in row order. Labels used to be stored in a dict keyed
    # by tweet text, so tweets with identical text collapsed into one entry,
    # and they were joined through a fresh 0..n-1 index, which did not line up
    # with a frame whose index has gaps after dropping rows.
    labels = [_polarity_sign(text) for text in df['content']]

    # A plain list is assigned by position, so no index alignment takes place.
    return df.assign(label=labels)

In [20]:
def twitter_bot(limit, duration_in_min):
    """Scrape, clean and sentiment-label recent 'Ukraine' and 'Russia' tweets.

    The time window ends now and starts ``duration_in_min`` minutes earlier.

    Parameters
    ----------
    limit
        Forwarded to ``get_tweets`` for each keyword.
    duration_in_min
        Length of the look-back window, in minutes.

    Returns
    -------
    tuple or None
        ``(window_start, window_end)`` as ``datetime`` objects when every step
        succeeds. If any step raises, the error is printed and ``None`` is
        returned.
    """
    window_end = datetime.now()
    window_start = window_end - timedelta(minutes=duration_in_min)
    end_time = int(window_end.timestamp())
    start_time = int(window_start.timestamp())

    print("running")

    # Base name for the CSV export; nothing is written to disk yet.
    filename = "labeled_tweets_" + window_end.strftime("%Y-%m-%d %H:%M")

    try:
        raw_ukraine = get_tweets('Ukraine', start_time, end_time, limit)
        raw_russia = get_tweets('Russia', start_time, end_time, limit)
        clean_ukraine = run_processing_steps(raw_ukraine)
        clean_russia = run_processing_steps(raw_russia)
        df_russia = sentiment_analysis(clean_russia)
        df_ukraine = sentiment_analysis(clean_ukraine)

        # TODO: display / persist the labelled frames, e.g.
        # df_ukraine.to_csv(f"../scraped_tweets/{filename}_Ukraine.csv")
        # df_russia.to_csv(f"../scraped_tweets/{filename}_Russia.csv")
    except Exception as e:
        print('It is not working...')
        print(e)
        return None

    return window_start, window_end
    

In [21]:
application = Flask(__name__)


@application.route("/")
def job():
    """Run one scrape/label cycle; used both as the ``/`` view and as the scheduled job.

    Calls ``twitter_bot(100, 1)``.

    Returns
    -------
    str or tuple
        A status message on success, or ``(message, 500)`` when
        ``twitter_bot`` reports a failure by returning ``None``. A Flask view
        must return a response value; the scheduler ignores the return value.
    """
    result = twitter_bot(100, 1)
    if result is None:
        # twitter_bot has already printed the underlying error; unpacking
        # None here would raise a TypeError on top of it.
        failure = "Scraping failed; no tweets were processed."
        print(failure)
        return failure, 500

    new_dt, current_dt = result
    print("Successfully scraped and processed tweets for time between ", new_dt.strftime("%Y-%m-%d %H:%M"), ' and ',
             current_dt.strftime("%Y-%m-%d %H:%M"))
    return "Successfully scraped and processed tweets for time between {} and {}".format(
        new_dt.strftime("%Y-%m-%d %H:%M"), current_dt.strftime("%Y-%m-%d %H:%M"))


# NOTE(review): every execution of this cell builds and starts another
# BackgroundScheduler, so re-running it without restarting the kernel
# presumably leaves several schedulers firing `job` - confirm before re-running.
scheduler = BackgroundScheduler()
scheduler.add_job(func=job, trigger="interval", minutes = 5)
scheduler.start()

# Stop the scheduler when the interpreter exits.
atexit.register(lambda: scheduler.shutdown())
if __name__ == "__main__":
    application.run(port=5000)

 * Serving Flask app '__main__' (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on http://127.0.0.1:5000 (Press CTRL+C to quit)


running
Successfully scraped and processed tweets for time between  2022-04-30 22:27  and  2022-04-30 22:28
running
Successfully scraped and processed tweets for time between  2022-04-30 22:27  and  2022-04-30 22:28
running
Successfully scraped and processed tweets for time between  2022-04-30 22:28  and  2022-04-30 22:29
running
Successfully scraped and processed tweets for time between  2022-04-30 22:28  and  2022-04-30 22:29
running
Successfully scraped and processed tweets for time between  2022-04-30 22:29  and  2022-04-30 22:30
running
Successfully scraped and processed tweets for time between  2022-04-30 22:29  and  2022-04-30 22:30
running
Successfully scraped and processed tweets for time between  2022-04-30 22:30  and  2022-04-30 22:31
running
Successfully scraped and processed tweets for time between  2022-04-30 22:30  and  2022-04-30 22:31
