In [1]:
"""
Full scan notebook:

"""

'\nFull scan notebook:\n\n'

In [2]:
# Must have imports
import helper
from analyzer import Analyzer
from crawler import Crawler
from plotter import Plotter
from model import TextBlob, TrainedSentimentModel, Vader
import tensorflow

In [3]:
# Extra imports
import datetime
import time
import numpy as np
import tweepy

In [4]:
# Sample Config
config = {
	"user_auth": False, # autheticate as user or application
    "search": {
        "location": "darmstadt", # based on helper.GEOCODES dictionary
        "radius": 100, # optional default to 100
        "query": ['corona'], # query for searching (str array), either query or location has to be not empty
        "max_searches": 5000, # Default: 1000 max amount of searches 
        "num_results": 5000, # number of results with defined filter options
        "rate_limit": True,  # Default True: to turn off rate limit prints
        "filter": { # Filter applies to search
            "not_reply": True, # Filters for not replies when true, does nothing when false
            "not_retweet": True, # Filters for not retweets when true, does nothing when false
            "until": datetime.datetime(2020, 3, 1), # None or datetime (e.g. datetime.datetime(2020, 5, 20))
        }
    },
	"get_user": { # Optional, only when querying for users
		"good_user": True, # mandatory
		"search_type": "recent_user", # 'recent_user', 'recent_retweeted_user'
		"num_users": 30, # mandatory
        "unique_ids": True, # If true will remember user ids in session
	},
	"plot": {
		"title": "Testing",
	},
    # Full search not tested and should only be used with caution!
    "full_search": {
        "query": "#Corona lang:de", # the query used for full search
        "env_name": "dev", # your premium environment name
        "fromDate": "2020" + "01" + "15" + "1200", # Format: YYYYMMDDHHmm
        "toDate": "2020" + "06" + "01" + "1200"
    },
    # A full scan over 3 areas each hour, should run continuous
    "full_scan": {
        "active": True,
        "path": "saved_data/full_scan/",
        "locations": ["scan_1", "scan_2", "scan_3"], # All locations used by the scan
    }
}


In [5]:
# Load configs and init models
config = helper.init_config(config)

modl = TextBlob()
modlUs = TrainedSentimentModel()
vader_model = Vader()

craw = Crawler(config)
anal = Analyzer(config, vader_model)
plot = Plotter(config)
sentiment_pos_limit = 0.7
sentiment_neg_limit = -0.7

In [6]:
now = datetime.datetime.now()
goal = datetime.datetime(2020, 6, 29, 10, 0)
print(goal)
dif = goal - now
print(f"Waiting {dif.seconds} seconds")
# time.sleep(dif.seconds)

2020-06-29 10:00:00
Waiting 85006 seconds


In [None]:
# Analyze user:

craw.full_scan()
# users3 = craw.get_users()
# print(len(users))
# user = craw.get_user_from_id("ZDFheute")
# user = craw.get_user_from_id("BarackObama")
# tweets = craw.get_tweets()
# tweets = craw.get_timeline(users[0])
# len(tweets)

Scanning for Users...
remaining requests: 1286
tweets remaining: ~ 19290
reset at: 2020-06-29 10:38:14
reset in: 14:11
remaining requests: 1021
tweets remaining: ~ 15315
reset at: 2020-06-29 10:38:14
reset in: 13:11
remaining requests: 826
tweets remaining: ~ 12390
reset at: 2020-06-29 10:38:14
reset in: 12:29
(3, 30)
Saving new User List
Scanning and saving tweets: 0
Search reached end point
Checked Tweets: 2915
remaining requests: 668
tweets remaining: ~ 10020
reset at: 2020-06-29 10:38:14
reset in: 11:48
Search reached end point
Checked Tweets: 1001
remaining requests: 617
tweets remaining: ~ 9255
reset at: 2020-06-29 10:38:14
reset in: 11:37
Search reached end point
Checked Tweets: 357
remaining requests: 599
tweets remaining: ~ 8985
reset at: 2020-06-29 10:38:14
reset in: 11:33
Search reached end point
Checked Tweets: 1891
remaining requests: 504
tweets remaining: ~ 7560
reset at: 2020-06-29 10:38:14
reset in: 11:09
Search reached end point
Checked Tweets: 1068
remaining requests:

Rate limit reached. Sleeping for: 557


Search reached end point
Checked Tweets: 532
remaining requests: 1483
tweets remaining: ~ 22245
reset at: 2020-06-29 10:53:19
reset in: 14:55
Search reached end point
Checked Tweets: 2409
remaining requests: 1362
tweets remaining: ~ 20430
reset at: 2020-06-29 10:53:19
reset in: 14:26
Search reached end point
Checked Tweets: 1051
remaining requests: 1309
tweets remaining: ~ 19635
reset at: 2020-06-29 10:53:19
reset in: 14:13
Search reached end point
Checked Tweets: 828
remaining requests: 1267
tweets remaining: ~ 19005
reset at: 2020-06-29 10:53:19
reset in: 14:03
Search reached end point
Checked Tweets: 353
remaining requests: 1249
tweets remaining: ~ 18735
reset at: 2020-06-29 10:53:19
reset in: 14:00
Search reached end point
Checked Tweets: 552
remaining requests: 1221
tweets remaining: ~ 18315
reset at: 2020-06-29 10:53:19
reset in: 13:53
Checked Tweets: 269
remaining requests: 1206
tweets remaining: ~ 18090
reset at: 2020-06-29 10:53:19
reset in: 13:49
Search reached end point
Chec

Rate limit reached. Sleeping for: 547


Checked Tweets: 3167
remaining requests: 1382
tweets remaining: ~ 20730
reset at: 2020-06-29 11:08:25
reset in: 14:30
Search reached end point
Checked Tweets: 407
remaining requests: 1361
tweets remaining: ~ 20415
reset at: 2020-06-29 11:08:25
reset in: 14:25
Search reached end point
Checked Tweets: 477
remaining requests: 1337
tweets remaining: ~ 20055
reset at: 2020-06-29 11:08:25
reset in: 14:20
Search reached end point
Checked Tweets: 266
remaining requests: 1323
tweets remaining: ~ 19845
reset at: 2020-06-29 11:08:25
reset in: 14:17
Checked Tweets: 3205
remaining requests: 1161
tweets remaining: ~ 17415
reset at: 2020-06-29 11:08:25
reset in: 13:40
Search reached end point
Checked Tweets: 2013
remaining requests: 1058
tweets remaining: ~ 15870
reset at: 2020-06-29 11:08:25
reset in: 13:15
Checked Tweets: 3167
remaining requests: 894
tweets remaining: ~ 13410
reset at: 2020-06-29 11:08:25
reset in: 12:34
Search reached end point
Checked Tweets: 624
remaining requests: 862
tweets re

Rate limit reached. Sleeping for: 551


Search reached end point
Checked Tweets: 180
remaining requests: 1498
tweets remaining: ~ 22470
reset at: 2020-06-29 11:23:31
reset in: 14:59
Search reached end point
Checked Tweets: 351
remaining requests: 1480
tweets remaining: ~ 22200
reset at: 2020-06-29 11:23:31
reset in: 14:55
Search reached end point
Checked Tweets: 1938
remaining requests: 1383
tweets remaining: ~ 20745
reset at: 2020-06-29 11:23:31
reset in: 14:34
Search reached end point
Checked Tweets: 219
remaining requests: 1371
tweets remaining: ~ 20565
reset at: 2020-06-29 11:23:31
reset in: 14:31
Search reached end point
Checked Tweets: 161
remaining requests: 1362
tweets remaining: ~ 20430
reset at: 2020-06-29 11:23:31
reset in: 14:29
Search reached end point
Checked Tweets: 2761
remaining requests: 1223
tweets remaining: ~ 18345
reset at: 2020-06-29 11:23:31
reset in: 13:58
Search reached end point
Checked Tweets: 365
remaining requests: 1204
tweets remaining: ~ 18060
reset at: 2020-06-29 11:23:31
reset in: 13:53
Sear

In [None]:
assert False

In [None]:
print(len(users))
print(len(users2))
print(len(users3))

In [None]:
x = craw.load_user_list()
print(len(x[0]))

In [None]:
craw.save_user_list(x)

In [None]:
# tweets = craw.get_full_timeline_until(users[0])
all_tweets = []
number_of_fails = 0
for user in users:    
    full_timeline = craw.get_full_timeline_until(user)
    if full_timeline == []:
        number_of_fails += 1
    else:
        all_tweets.append(full_timeline)
print(number_of_fails)

In [None]:
a = datetime.datetime.now()
b = datetime.datetime(2020,6,22, 12, 00)
print((a-b).seconds/3600 + (a-b).days * 24)
helper.hours_until(b)

In [None]:
flattened = [tweet for tweets in all_tweets for tweet in tweets]
print(len(flattened))
# anal.analyze_and_plot_sentiment_per_week(flattened, 0.7, -0.7)

In [None]:
anal.analyze_and_plot_sentiment_per_week(flattened, 0.8, -0.8)

In [None]:
anal.analyze_and_plot_sentiment_per_week(all_data, 0.5, -0.5)

In [None]:
filename = "saved_data/recent_timeline_tweets_1000.json"
tweets = craw.load_tweet(filename)
# craw.save_tweets(tweets, filename)
print(tweets[0].text)
print(tweets[1].text)
print(tweets[2].text)
print(tweets[3].text)

In [None]:
filename = "saved_data/full_timeline/2_31k.json"
tweets = craw.save_tweets(flattened, filename)

In [None]:
res = anal.analyze_sentiment(tweets)
res2 = anal.analyze_timeline(tweets)
# plot.simple_hist(res["ages"])

In [None]:
len(res2["ages"]["tweets_age"])

In [None]:
dataBlob = []
fullSenti = []
for tweet in tweets:
    senti = modl.get_polarity_without_preprocessing(tweet.text)
    fullSenti.append(senti)
    # TODO kann man auch auslagern
    # Beste Text Blob sentis: -0.7, 0.7
    if senti > 0.7:
        dataBlob.append('pos')
    elif senti < -0.7:
        dataBlob.append('neg')
    else:
        dataBlob.append('neut')

In [None]:
dataUs = modlUs.get_sentiment_labels_batch([tweet.text for tweet in tweets])
dataUs2 = modlUs.get_sentiment_label_without_preprocessing([tweet.text for tweet in tweets])

In [None]:
plot.simple_hist(dataUs)

In [None]:
plot.simple_hist(dataUs2)