### REQUIRED PACKAGES ###
- numpy
- pandas
- matplotlib
- scipy
- pytrends
`pip install pytrends`
- twarc 
`pip install --upgrade twarc`

*Note: twarc requires the credentials of a Twitter developer account. Follow the setup instructions [here](https://twarc-project.readthedocs.io/en/latest/twarc2_en_us/).*

*For ease of reproduction, the data is provided with the code.*

In [None]:
import os
import numpy as np
import pandas as pd
import scipy
from scipy import stats, interpolate
import matplotlib.pyplot as plt
from pytrends.request import TrendReq
import datetime
from datetime import date
import matplotlib.dates as mdates
import matplotlib
import matplotlib.cbook as cbook
import random

In [None]:
### pulls twitter data and saves to .csv file ###
def pull_twitter_data(keyword, granularity):
    """Fetch tweet counts for ``keyword`` with ``twarc2`` and save them as CSV.

    Runs ``twarc2 counts <keyword> --csv --granularity <granularity>`` and
    writes its standard output to ``tweet_data/<keyword>.csv`` (spaces in the
    keyword are replaced by underscores in the file name). An existing file
    for the same keyword is overwritten.

    Args:
        keyword: Search query handed to ``twarc2 counts`` as one argument.
        granularity: Value for the ``--granularity`` option, e.g. ``"hour"``.

    Raises:
        FileNotFoundError: If the ``twarc2`` executable cannot be found.
        subprocess.CalledProcessError: If ``twarc2`` exits with a non-zero
            status.
    """
    # Imported locally so the function does not depend on a file-level import.
    import subprocess

    os.makedirs("tweet_data", exist_ok=True)
    output_path = "tweet_data/{}.csv".format(keyword.replace(" ", "_"))

    # An argument list (no shell) keeps a multi-word keyword together as one
    # query and prevents shell metacharacters in it from being interpreted.
    command = ["twarc2", "counts", keyword, "--csv", "--granularity", granularity]
    with open(output_path, "wb") as csv_file:
        # check=True surfaces a failed download here instead of leaving an
        # empty CSV that only breaks later when it is loaded.
        subprocess.run(command, stdout=csv_file, check=True)

In [None]:
### get twitter data from saved files ###
def load_twitter_data(keyword, granularity):
    """Return the saved tweet counts for ``keyword`` ordered by start time.

    Reads ``tweet_data/<keyword>.csv`` (spaces in the keyword replaced by
    underscores), sorts the rows by their ``start`` column and returns the
    ``<granularity>_count`` column as a NumPy array.
    """
    csv_path = 'tweet_data/{}.csv'.format(keyword.replace(" ", "_"))
    count_column = "{}_count".format(granularity)

    frame = pd.read_csv(csv_path, parse_dates=['start', 'end'])
    chronological = frame.sort_values('start')
    return chronological[count_column].to_numpy()

In [None]:
### pulls google data and saves to file ###
def pull_google_data(keyword):
    """Download Google Trends interest for ``keyword`` and save it as CSV.

    Builds a pytrends payload for the single keyword with the ``'now 7-d'``
    timeframe and writes the resulting interest-over-time table to
    ``google_data/<keyword>.csv`` (spaces in the keyword are replaced by
    underscores in the file name). An existing file for the same keyword is
    overwritten.

    Args:
        keyword: Search term to query.
    """
    # exist_ok avoids the separate exists() check and its race window.
    os.makedirs("google_data", exist_ok=True)

    pytrends = TrendReq(hl='en-US', tz=0)
    pytrends.build_payload([keyword], timeframe='now 7-d', geo='')
    dfg = pytrends.interest_over_time()
    dfg.to_csv("google_data/{}.csv".format(keyword.replace(" ", "_")))

In [None]:
### loads google data from saved file and returns as numpy array
def load_google_data(keyword):
    """Return the saved Google interest values for ``keyword`` as a NumPy array.

    Reads ``google_data/<keyword>.csv`` (spaces in the keyword replaced by
    underscores) and extracts the column named exactly like the keyword.
    """
    file_stem = keyword.replace(" ", "_")
    frame = pd.read_csv("google_data/{}.csv".format(file_stem))
    interest = frame[keyword]
    return interest.to_numpy()

In [None]:
# Preset German keywords taken from Twitter trends. Repeated entries
# ("wochenstart", "panzer") are kept exactly as in the original list.
keywords_ger = [
    "pistorius", "verteidigungsminister", "parität", "lambrecht", "bargeld",
    "ryerson", "thelastofus", "davos", "wochenstart", "kompetenz",
    "leopard2", "rblfcb", "panzer", "ramstein", "schnee",
    "scholz", "putin", "livche", "russland", "veganer",
    "bvbfca", "wochenstart", "schönensonntag", "übertragungsfehler", "annewill",
    "ukraine", "panzer", "polen", "arsmun", "impfung",
]

# Bucket size used for the tweet counts.
granularity = "hour"

# Reference timestamp passed to plot_data for the preset keywords.
creation_date_ger = datetime.datetime(year=2023, month=1, day=23, hour=18)


In [None]:
### pull fresh Twitter and Google data for the keywords
### !!! overwrites the current data files
### The loop is wrapped in a string literal so that it does not run by default.
### NOTE(review): the loop iterates `keywords`, while the preset list defined
### above is named `keywords_ger` -- confirm which list is meant before
### re-enabling it.
'''for keyword in keywords:
    pull_twitter_data(keyword, granularity)
    pull_google_data(keyword)'''

In [None]:
### calculates the Pearson r between Google and Twitter interest per keyword
def _min_max_scale(values):
    """Rescale ``values`` linearly onto [0, 1]; return None if they are constant."""
    lowest = np.amin(values)
    span = np.amax(values) - lowest
    if span == 0:
        # A constant series cannot be rescaled (division by zero).
        return None
    return (values - lowest) / span


def pearson(keywords):
    """Return the Pearson correlation coefficient for every keyword.

    For each keyword the saved Google and hourly Twitter series are loaded,
    min-max scaled, cut to the length of the shorter one (counted from the
    start of each series) and correlated.

    Args:
        keywords: Iterable of keywords whose data files already exist.

    Returns:
        1-D float array with one coefficient per keyword, in input order.
        The entry is NaN when either series of a keyword is constant, and the
        array is empty when ``keywords`` is empty.
    """
    p_coeffs = []
    for keyword in keywords:
        google_data = _min_max_scale(load_google_data(keyword))
        twitter_data = _min_max_scale(load_twitter_data(keyword, granularity="hour"))
        if google_data is None or twitter_data is None:
            # Correlation is undefined for a constant series.
            p_coeffs.append(np.nan)
            continue
        length = min(len(twitter_data), len(google_data))
        # Index 0 of the pearsonr result is the coefficient (index 1 is the p-value).
        p_coeffs.append(scipy.stats.pearsonr(twitter_data[:length], google_data[:length])[0])
    # Explicit dtype keeps the empty case a well-formed 1-D float array.
    return np.asarray(p_coeffs, dtype=float)

In [None]:
### plots keyword usage over time in google and twitter
def plot_data(keywords, creation_date):
    """Plot scaled hourly Google and Twitter interest for every keyword.

    One figure per keyword is saved to ``images/timeseries_<keyword>.pgf``.
    Both series are min-max scaled and drawn against an hourly time axis that
    starts seven days before ``creation_date``; the title shows the Pearson
    coefficient of the two series.

    Side effect: switches matplotlib to the ``pgf`` backend and updates the
    global rcParams to LaTeX text rendering.

    Args:
        keywords: Sequence of keywords whose data files already exist.
        creation_date: ``datetime.datetime`` marking the end of the data
            window; minutes and smaller units are ignored.

    Returns:
        None. Does nothing when ``keywords`` is empty.
    """
    if len(keywords) == 0:
        # Nothing to plot; also avoids indexing keywords[0] below.
        return

    os.makedirs("images", exist_ok=True)

    # timedelta arithmetic handles month and year boundaries, so the axis is
    # valid for any creation_date rather than only for late January 2023. It
    # also makes the window a full seven days when the creation hour is 0.
    start = creation_date.replace(minute=0, second=0, microsecond=0) - datetime.timedelta(days=7)

    def hourly_axis(n_hours):
        """Return ``n_hours`` consecutive hourly timestamps beginning at ``start``."""
        return [start + datetime.timedelta(hours=hh) for hh in range(n_hours)]

    # The x-limits are shared by all figures and derived from the first keyword.
    reference_axis = hourly_axis(max(
        len(load_google_data(keywords[0])),
        len(load_twitter_data(keywords[0], granularity="hour")),
    ))
    rs = pearson(keywords)

    # Configure the backend once, before any figure is created, instead of
    # once per keyword after the figure already exists.
    matplotlib.use("pgf")
    matplotlib.rcParams.update({
        "pgf.texsystem": "pdflatex",
        'font.family': 'serif',
        'text.usetex': True,
        'pgf.rcfonts': False,
    })

    for ii, keyword in enumerate(keywords):
        google_data = load_google_data(keyword)
        twitter_data = load_twitter_data(keyword, granularity="hour")
        twitter_data = (twitter_data-np.amin(twitter_data))/(np.amax(twitter_data)-np.amin(twitter_data))
        google_data = (google_data-np.amin(google_data))/(np.amax(google_data)-np.amin(google_data))

        fig, ax = plt.subplots(figsize=(2, 2))
        # Each series gets an axis of exactly its own length, so a keyword
        # with more samples than the first one no longer breaks the plot.
        ax.plot(hourly_axis(len(google_data)), google_data, label="google", color="orangered", linewidth=1.0)
        ax.plot(hourly_axis(len(twitter_data)), twitter_data, label="twitter", color="deepskyblue", linewidth=1.0)
        ax.xaxis.set_major_locator(mdates.DayLocator())
        ax.xaxis.set_minor_locator(mdates.HourLocator(byhour=(12)))
        ax.tick_params(axis='both', which='major', labelsize=5)
        ax.set_xlabel("dates",fontsize = 8)
        ax.set_ylabel("keyword per h (scaled)",fontsize = 8)
        ax.set_xlim(xmin=reference_axis[0], xmax=reference_axis[-1])
        ax.set_ylim(ymin=0)
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
        ax.set_title('"'+keyword+'"'+"   PCC: {}".format(round(rs[ii], 3)),fontsize = 8)
        ax.legend(fontsize = 8)
        ax.xaxis.set_major_formatter(mdates.DateFormatter('%d.%m'))
        fig.autofmt_xdate()
        plt.savefig('images/timeseries_{}.pgf'.format(keyword), dpi=500,  bbox_inches="tight")
        plt.show()
        # Close this figure explicitly so figures do not accumulate.
        plt.close(fig)

In [None]:
### plotting linear correlation between two set of data points and returning m
def correlation_lg(word,x,y):
    """Scatter ``y`` against ``x``, draw the least-squares line, return its slope.

    Both inputs are cut to the length of the shorter one before fitting. The
    figure title shows ``word`` and the fitted slope ``m``.

    Args:
        word: Label shown in the figure title.
        x: Sequence of numeric values for the horizontal axis.
        y: Sequence of numeric values for the vertical axis.

    Returns:
        The slope ``m`` of the fitted regression line.
    """
    length = min(len(x), len(y))
    # Arrays allow the fitted line to be evaluated in one vectorised step.
    x = np.asarray(x[:length])
    y = np.asarray(y[:length])

    fit = stats.linregress(x, y)
    slope = fit.slope
    fitted_line = slope * x + fit.intercept

    fig, ax = plt.subplots(figsize=(2,2))
    ax.scatter(x, y, color="black", s=2)
    ax.set_title('"'+word+'"'+"   m: {}".format(slope), fontsize = 8)
    ax.plot(x, fitted_line, color="lime")
    plt.show()
    plt.close(fig)
    # The cell header promises the slope as the result.
    return slope

In [None]:
### plot twitter trends
# Print the reference timestamp, then draw one time-series figure per preset keyword.
print(creation_date_ger)
plot_data(keywords_ger, creation_date_ger)
# Scatter the Google values (x) against the hourly tweet counts (y) for each
# keyword, together with the fitted regression line.
for word in keywords_ger:
    correlation_lg(word, load_google_data(word), load_twitter_data(word, granularity="hour"))

In [None]:
### calculate and print PCC
# One Pearson coefficient per preset keyword, in list order.
rs = pearson(keywords_ger)
# Largest hourly tweet count of each keyword.
# NOTE(review): collected here but not read again in the visible code.
t_maxes = []
print("Keywords from trends:\nMean: ", np.mean(rs),"  std:", np.std(rs))
for ii, word in enumerate(keywords_ger):
    twitter_data = load_twitter_data(word, granularity="hour")
    t_maxes.append(np.max(twitter_data))
    print(word, rs[ii])

In [None]:
### testing for random word out of most common nouns
# NOTE(review): the column is addressed as "ATM"; presumably nounlist.csv has
# no header row and its first noun became the column name -- confirm.
dfw = pd.read_csv("nounlist.csv")
words = dfw["ATM"].to_list()
keywords = []

# Draw 30 words and download both data sources for each of them.
# random.choice draws independently each time, so a word can repeat.
for i in range(30):
    word = random.choice(words)
    keywords.append(word)
    pull_twitter_data(word, granularity="hour")
    pull_google_data(word)
    
# Timestamp taken after the downloads; used as the reference date for plotting.
creation_date = datetime.datetime.now() 
rs = pearson(keywords)
print("Random keywords:\nMean: ", np.mean(rs),"  std:", np.std(rs))
for ii, word in enumerate(keywords):
    print(word, rs[ii])

In [None]:
### plot random words
# Time-series figure per random keyword, using the timestamp taken after the downloads.
plot_data(keywords, creation_date)
# Scatter the Google values (x) against the hourly tweet counts (y) with the
# fitted regression line for each random keyword.
for ii, word in enumerate(keywords):
    correlation_lg(word, load_google_data(word), load_twitter_data(word, granularity="hour"))