In [None]:
import json
import re
import time

import pandas as pd
from googleapiclient import discovery
from googleapiclient.errors import HttpError

### Define functions

In [None]:
# define function for text preprocessing
def prepare_text(text):
    """Reduce a raw tweet to alphanumeric words separated by single spaces.

    Steps, applied in order:
      1. ``@mentions`` are replaced by a space.
      2. ``http://`` / ``https://`` links are replaced by a space. The link is
         matched up to the next whitespace so that URLs containing ``-``, ``_``,
         ``?``, ``=`` or ``%`` are removed completely instead of leaving
         fragments of the address in the text.
      3. Every remaining run of non-alphanumeric characters (emoji, punctuation,
         ``#``, tabs, newlines, repeated blanks) is collapsed into one space.

    A leading or trailing space produced by these substitutions is kept.

    Args:
        text (str): the raw tweet text.

    Returns:
        str: the cleaned text, containing only ``A-Z``, ``a-z``, ``0-9`` and
        single spaces.
    """
    text = re.sub(r"@[A-Za-z0-9_]+", ' ', text)  # remove @user
    text = re.sub(r"https?://\S+", ' ', text)  # remove links (whole URL, up to the next whitespace)
    # One pass covers emoji, punctuation, hash signs, tabs and repeated blanks:
    # each run of non-alphanumeric characters becomes a single space.
    text = re.sub(r"[^A-Za-z0-9]+", ' ', text)
    return text

### Set up API

In [None]:
# The API key lives in a private Kaggle dataset; it is taken from the first row
# of the 'api_key' column.
creds = pd.read_csv('../input/creds-perspective-api/CredentialsPerspectiveAPI.csv')
api_key = creds['api_key'].iat[0]

# Keyword options for the Perspective ("commentanalyzer") client.
client_options = {
    'developerKey': api_key,
    'discoveryServiceUrl': "https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",
}
client = discovery.build("commentanalyzer", "v1alpha1", **client_options)

### Prepare data

In [None]:
# read data:
# every identifier / text column is read as a string, the two scores as floats
string_columns = ['text', 'id', 'tweet_id', 'title', 'outlet', 'twitter_handle', 'article_url', 'adfontes_url']
dtype = {**dict.fromkeys(string_columns, str), 'bias_score': float, 'reliability_score': float}

# note: this script was implemented using Kaggle's GPU. The dataset 'all_tweets_final' was loaded into the Kaggle repo and accessed from there
all_tweets = pd.read_csv(
    '../input/all-tweets-final/all_tweets_final.csv',
    dtype=dtype,
)

In [None]:
# split data in sub-sessions as max Kernel runtime is 9 hours; Perspective API 1 request per second
# Positional row bounds [start, stop) of each sub-session; None means "up to the last row".
SUBSESSION_BOUNDS = {
    1: (0, 25115),        # rows 0 - 25114
    2: (25115, 50230),    # rows 25115 - 50229
    3: (50230, 75345),    # rows 50230 - 75344
    4: (75345, 100460),   # rows 75345 - 100459
    5: (100460, 125575),  # rows 100460 - 125574
    6: (125575, 150690),  # rows 125575 - 150689
    7: (150690, None),    # rows 150690 - 175806
}

# Sub-session scored in this run. Change only this number (and keep the output
# filename in the save cell at the end of the notebook in sync with it).
SUBSESSION = 7

# NOTE: this rebinds `all_tweets` to the selected rows, so run this cell once
# per fresh load of the data; running it twice slices the already-sliced frame.
first_row, stop_row = SUBSESSION_BOUNDS[SUBSESSION]
all_tweets = all_tweets.iloc[first_row:stop_row]

In [None]:
# apply text preprocessing to tweets:
# `all_tweets` is a slice of the full data set at this point. `.assign` builds a
# new frame carrying the extra 'text_prepared' column instead of writing into
# that slice, which avoids pandas' SettingWithCopyWarning for column assignment
# on a sliced frame.
all_tweets = all_tweets.assign(text_prepared=all_tweets['text'].apply(prepare_text))

In [None]:
# Pull the cleaned texts and their ids out of the frame as two parallel Python lists.
tweets, tweet_ids = (all_tweets[column].tolist() for column in ('text_prepared', 'id'))

In [None]:
# note: for more information about the PerspectiveAPI attributes see https://developers.perspectiveapi.com/s/about-the-api-attributes-and-languages

# Attributes requested for every tweet. The set is identical for all requests,
# so it is built once instead of once per loop iteration.
REQUESTED_ATTRIBUTES = {
    attribute: {}
    for attribute in (
        'TOXICITY',
        'SEVERE_TOXICITY',
        'IDENTITY_ATTACK',
        'INSULT',
        'PROFANITY',
        'THREAT',
        'SEXUALLY_EXPLICIT',
        'FLIRTATION',
        'ATTACK_ON_AUTHOR',
        'ATTACK_ON_COMMENTER',
        'INCOHERENT',
        'INFLAMMATORY',
        'LIKELY_TO_REJECT',
        'OBSCENE',
        'SPAM',
        'UNSUBSTANTIAL',
    )
}

responses = {}        # tweet id -> raw API response
failed_requests = {}  # tweet id -> error message of requests answered with HTTP 400

for tweet, tweet_id in zip(tweets, tweet_ids):
    analyze_request = {
        'comment': {'text': tweet},
        'requestedAttributes': REQUESTED_ATTRIBUTES,
        'languages': ['en'],  # the API documents this field as a list of language codes
    }
    try:
        # num_retries makes the client retry transient failures with exponential
        # backoff instead of aborting a run that takes hours.
        responses[tweet_id] = client.comments().analyze(body=analyze_request).execute(num_retries=3)
    except HttpError as err:
        # HTTP 400 is treated as a problem with this one comment (e.g. no text
        # left after preprocessing): record it and carry on with the next tweet.
        # Any other status (invalid key, exhausted quota, ...) would affect
        # every following request as well, so it is re-raised.
        if err.resp.status != 400:
            raise
        failed_requests[tweet_id] = str(err)
    time.sleep(1.1) # one query per 1.1 seconds to not exceed API limit

print(len(responses), 'tweets scored;', len(failed_requests), 'rejected by the API')

In [None]:
# save responses as json
# note: this script was implemented using Kaggle's GPU. The classified datasets were saved into the Kaggle repo and downloaded manually
# One output file per sub-session: 'responses_sub1.json' ... 'responses_sub7.json'.
# Set the number to the sub-session that was selected when slicing the data.
filename = 'responses_sub7.json' # sub7

# Serialise the whole response dictionary and write it out in one go.
with open(filename, 'w') as out_file:
    out_file.write(json.dumps(responses))