**INSTALLING DEPENDENCIES**

In [3]:
! pip install praw tqdm transformers accelerate > /dev/null 2>&1

In [4]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import sys
import csv
import torch
import time
import praw
from scipy.special import softmax
from transformers import AutoModelForSequenceClassification, AutoModelForCausalLM, AutoTokenizer, pipeline

In [5]:
import warnings
warnings.filterwarnings('ignore')

**REDDIT API CREDENTIALS**

Provide the Reddit API client ID and client secret either in a CSV/TXT file (client ID in the first column, client secret in the second) or by assigning them directly in the cell below.

In [6]:
## Access the reddit_api_credentials ##
# Each usable row must look like: client_id,client_secret
# As before, the last usable row wins when the file holds several rows.
client_id = client_secret = None
with open('reddit_api_key.csv', 'r', newline='') as file:
  for row in csv.reader(file):
    # csv.reader yields [] for blank lines; skip those and any row that
    # lacks the second column instead of failing with an IndexError.
    if len(row) < 2:
      continue
    client_id = row[0].strip()
    client_secret = row[1].strip()

if not client_id or not client_secret:
  raise ValueError("reddit_api_key.csv must contain a row of the form: client_id,client_secret")

## Create a reddit object ##
reddit = praw.Reddit(client_id= client_id,
                     client_secret= client_secret,
                     user_agent='YOUR_USER_AGENT')

# FUNCTIONS

**LOADING THE MODELS**

In [7]:
torch.random.manual_seed(0)

def load_model_roberta():
    """Load the sentiment classifier and its tokenizer.

    Both are loaded from the
    "cardiffnlp/twitter-roberta-base-sentiment-latest" checkpoint; the model
    is placed on the CUDA device.

    Returns:
        tuple: ``(tokenizer, model)``.
    """
    checkpoint = "cardiffnlp/twitter-roberta-base-sentiment-latest"
    roberta_tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    load_options = {"device_map": "cuda", "torch_dtype": "auto"}
    roberta_model = AutoModelForSequenceClassification.from_pretrained(checkpoint, **load_options)

    return roberta_tokenizer, roberta_model

def load_model_phi3():
    """Load Phi-3 mini and wrap it in a text-generation pipeline.

    Returns:
        tuple: ``(tokenizer, pipe, generation_args)`` where ``pipe`` is the
        "text-generation" pipeline and ``generation_args`` is the keyword
        dictionary the callers pass to ``pipe`` (greedy decoding, at most
        500 new tokens, prompt not echoed back).
    """
    checkpoint = "microsoft/Phi-3-mini-4k-instruct"

    # The language model is loaded first, then its tokenizer.
    llm = AutoModelForCausalLM.from_pretrained(
        checkpoint,
        device_map="cuda",
        torch_dtype="auto",
        trust_remote_code=True,
    )
    llm_tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    text_generator = pipeline("text-generation", model=llm, tokenizer=llm_tokenizer)

    decoding_options = dict(
        max_new_tokens=500,
        return_full_text=False,
        temperature=0.0,
        do_sample=False,
    )

    return llm_tokenizer, text_generator, decoding_options

**'Phi-3' Model 1 (retrieves event type, countries involved, broad risk dimension and event name)**

In [8]:
def run_model_phi3_1(tokenizer, pipe, generation_args, post):
    """Ask the Phi-3 pipeline the four event questions about a post title.

    The prompt asks whether the title describes a political/economic risk
    (A1), which countries are at risk (A2), which aspect is affected (A3)
    and for a short event name (A4).

    Args:
        tokenizer: Unused; kept so the signature stays the same for callers.
        pipe: Callable invoked as ``pipe(messages, **generation_args)``.
        generation_args: Keyword arguments forwarded to ``pipe``.
        post: Object whose ``title`` attribute is inserted into the prompt.

    Returns:
        Whatever ``pipe`` returns; ``get_eventdata`` reads
        ``result[0]['generated_text']`` from it.
    """
    # The prompt text (including its indentation) is part of the model input
    # and is intentionally left exactly as it was.
    messages = [
    {
        "role": "user",
        "content": f"""

        Read the following post title and then answer the questions below.

        Post title: "{post.title}"

        Q1. Is the above event related to any political or economical risk to a country?
        i. Yes
        ii. No

        Q2. Which COUNTRY or COUNTRIES are at Political or Economical risk from the news above? If you can not infer any country then return 'Not mentioned'.

        Q3. Which aspect is affected of the country or countries from the news mentioned above?(Select best one)
        i. Political aspect
        ii. Economical aspect

        Q4. Give a suitable event name for the post tile? Consider examples:
            post title: "Peter Dutton took $23k private jet to News Corp event where he spoke on cost-of-living crisis | Australia news" event name:"Cost of living crisis"
            post title: "Trump threatens to cut off aid to Ukraine 'prior to taking the White House as president-elect'" event name:"Trump's election campaign"
            post title: "Carbon dioxide levels in the atmosphere are surging "faster than ever" to beyond anything humans ever experienced, officials say " event name:"climate crisis/global warming"
            post title: "Russia Bombs Ukraine Superstore With Hundreds Inside " event name:"Russia Ukraine War"
            post title: "“Now we make Russia pay”: EU to transfer first $ 1.6 bn in frozen Russian assets to Ukraine " event name:"Russia Ukraine Conflict escalation"

        Your answer should be in the following format in single line (with in brackets):
        A1[option number], A2[country1, country2, country3, country4, ...], A3[option number], A4[event name]

        """
    },
    ]

    return pipe(messages, **generation_args)

In [9]:
def _bracketed_answer(text, label, start=0):
    """Return ``(content, end)`` for the first ``label[...]`` at or after ``start``.

    ``content`` is the text between the brackets (up to the end of ``text``
    when the closing bracket is missing) and ``end`` is the index where the
    content stops. Returns ``(None, start)`` when the label is not present.
    """
    marker = label + "["
    begin = text.find(marker, start)
    if begin == -1:
        return None, start
    begin += len(marker)
    close = text.find("]", begin)
    if close == -1:
        close = len(text)
    return text[begin:close], close


def _leading_option(content):
    """Return the leading alphabetic token of an answer, lower-cased.

    ``'ii'`` -> ``'ii'``, ``' i. Yes'`` -> ``'i'``, ``'2'`` -> ``''``.
    """
    token = ''
    for ch in content.strip():
        if not ch.isalpha():
            break
        token += ch
    return token.lower()


def get_eventdata(output_1):
    """Parse the ``A1[..], A2[..], A3[..], A4[..]`` answer of the first Phi-3 prompt.

    Args:
        output_1: Pipeline result; ``output_1[0]['generated_text']`` is parsed.

    Returns:
        The string ``'Not relevent'`` when A1 is missing or is not option
        ``i`` (Yes). Otherwise a dict with:
          * ``'Event'``: the A4 text reduced to letters and whitespace,
          * ``'Class'``: ``'p'`` when A3 is option ``i``, ``'e'`` for any other
            A3 answer, ``''`` when A3 is missing,
          * ``'Country'``: list of the comma-separated A2 entries (empty list
            when A2 is missing).
    """
    text = output_1[0]['generated_text']

    # Read the whole bracket content: looking at a single character would
    # treat "A1[ii]" (No) as "i" (Yes).
    relevance, cursor = _bracketed_answer(text, "A1")
    if relevance is None or _leading_option(relevance) != 'i':
        return 'Not relevent'

    countries = []
    raw_countries, _ = _bracketed_answer(text, "A2", cursor)
    if raw_countries is not None:
        countries = [name.strip() for name in raw_countries.split(',') if name.strip()]

    broad_risk_dimension = ''
    raw_class, _ = _bracketed_answer(text, "A3", cursor)
    if raw_class is not None:
        # Option i is the political aspect; everything else maps to economic.
        broad_risk_dimension = 'p' if _leading_option(raw_class) == 'i' else 'e'

    raw_event, _ = _bracketed_answer(text, "A4", cursor)
    # Only letters and whitespace are kept in the event name.
    event_name = ''.join(ch for ch in (raw_event or '') if ch.isalpha() or ch.isspace()).strip()

    return {'Event': event_name, 'Class': broad_risk_dimension, 'Country': countries}



**'Phi-3' Model 2 (retrieves narrow risk dimension or subclass)**

In [10]:
# Broad risk dimension code -> (word used in the prompt, answer options in order).
_PHI3_SUBCLASS_QUESTIONS = {
    'p': ('political', ('War', 'Political dividedness', 'Scandals',
                        'Cultural Diplomacy', 'None of the above')),
    'e': ('economic', ('Economic instability', 'Government Policy',
                       'Economic Stagnation', 'None of the above')),
}


def _build_subclass_prompt(title, country, landscape, options):
    """Build the narrow-risk-dimension prompt for one country.

    The layout (blank lines and the 16-space indentation of every line)
    reproduces the original hand-written prompt.
    """
    pad = ' ' * 16
    numerals = ('i', 'ii', 'iii', 'iv', 'v')
    option_lines = '\n'.join(
        f"{pad}{numeral}. {label}" for numeral, label in zip(numerals, options)
    )
    return (
        "\n\n"
        f"{pad}Read the following subreddit post title and then answer the questions below.\n\n"
        f'{pad}Post title: "{title}"\n\n'
        f"{pad}Q1. How will this event affect the {landscape} landscape of {country}? (only choose one of the option)\n"
        f"{option_lines}\n\n"
        f"{pad}Answer in the following format:\n"
        f"{pad}A1[option number]\n\n"
        f"{pad}"
    )


def run_model_phi3_2(tokenizer, pipe, generation_args, post, countries, broad_risk_dimension):
    """Ask the Phi-3 pipeline for the narrow risk dimension, once per country.

    Args:
        tokenizer: Unused; kept so the signature stays the same for callers.
        pipe: Callable invoked as ``pipe(messages, **generation_args)``.
        generation_args: Keyword arguments forwarded to ``pipe``.
        post: Object whose ``title`` attribute is inserted into the prompt.
        countries: Sequence of country names; one prompt is sent per entry.
        broad_risk_dimension: ``'p'`` (political options) or ``'e'``
            (economic options). Any other value sends no prompt.

    Returns:
        list: The ``pipe`` results, in the order of ``countries``; empty when
        ``broad_risk_dimension`` is neither ``'p'`` nor ``'e'``.
    """
    question = _PHI3_SUBCLASS_QUESTIONS.get(broad_risk_dimension)
    if question is None:
        return []

    # Spell the dimension out: the prompt used to interpolate the bare code
    # letter and read "affect the p landscape of ...".
    landscape, options = question

    outputs_2 = []
    for country in countries:
        messages = [
            {
                "role": "user",
                "content": _build_subclass_prompt(post.title, country, landscape, options),
            },
        ]
        outputs_2.append(pipe(messages, **generation_args))

    return outputs_2

In [11]:
def get_subclass(outputs_2, broad_risk_dimension):
    """Convert the per-country ``A1[option]`` answers into numeric risk codes.

    Args:
        outputs_2: List of pipeline results; for each entry
            ``entry[0]['generated_text']`` is parsed.
        broad_risk_dimension: ``'p'`` or ``'e'``; selects the code table.

    Returns:
        list[int]: One code per entry of ``outputs_2``.
          * ``'p'``: i War=1, ii Political dividedness=2, iii Scandals=3,
            iv Cultural Diplomacy=4.
          * ``'e'``: i Economic Instability=1, ii Government Policy=2,
            iii Economic Stagnation=3.
        Every other answer (including "None of the above", a missing ``A1[``
        or an unknown dimension) gives 0.
    """
    codes_by_dimension = {
        'p': {'i': 1, 'ii': 2, 'iii': 3, 'iv': 4},
        'e': {'i': 1, 'ii': 2, 'iii': 3},
    }
    option_codes = codes_by_dimension.get(broad_risk_dimension, {})

    narrow_risk_dimensions = []
    for output in outputs_2:
        text = output[0]['generated_text']

        # Parsed afresh for every output so one country's answer can never
        # leak into the next one.
        option = ''
        begin = text.find("A1[")
        if begin != -1:
            # Keep the leading letters only, so "iv" and "iv. Cultural
            # Diplomacy" both resolve to "iv"; slicing cannot run past the
            # end of the text the way fixed look-ahead indexing could.
            for ch in text[begin + 3:].lstrip():
                if not ch.isalpha():
                    break
                option += ch

        narrow_risk_dimensions.append(option_codes.get(option.lower(), 0))

    return narrow_risk_dimensions

**PREPROCESS DATA**

The function below looks for mentions (@...) and links (http...) in the text and replaces them with the placeholders `@user` and `http`.

In [12]:
## Preprocess text (username and link placeholders) ##
def preprocess(text):
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

**SENTIMENT ANALYSIS FOR COMMENT**

The following function runs the model on the post title combined with the comment body, computes the positive (pos), neutral (neu) and negative (neg) scores, and turns them into a sentiment label as follows.

If abs(pos_score - neg_score) <= neu_score: then it classifies it as neutral sentiment and returns 0.

Else if pos_score - neg_score >0: then it classifies it as positive and returns 1.

Else it classifies it as negative and returns -1

In [13]:
## Sentiment Analysis function for Comment ##
def analyse_sentiment_comment(tokenizer, model, post, comment, max_length=512):
  """Classify the sentiment of a comment in the context of its post title.

  Args:
    tokenizer: Callable tokenizer returning an object with ``.to(device)``.
    model: Classifier called as ``model(**encoded_input)``; ``output[0][0]``
      is read as the (negative, neutral, positive) logits.
    post: Object with a ``title`` attribute.
    comment: Object with a ``body`` attribute.
    max_length: Maximum number of tokens fed to the model; longer inputs are
      truncated instead of being rejected by the model.

  Returns:
    ``0`` when ``abs(pos - neg) <= neu``, ``1`` when ``pos > neg``, ``-1``
    otherwise, or the string ``"Couldn't Analyze"`` when tokenization or the
    forward pass fails.
  """
  text = preprocess(post.title + '.' + comment.body)
  try:
    # Follow the model's own device instead of assuming CUDA; fall back to
    # 'cuda' for models that do not expose a ``device`` attribute.
    device = getattr(model, 'device', 'cuda')
    encoded_input = tokenizer(
        text, return_tensors='pt', truncation=True, max_length=max_length
    ).to(device)
    # Inference only: no gradient bookkeeping is needed.
    with torch.no_grad():
      output = model(**encoded_input)
  except Exception:
    # Best effort: a comment that cannot be scored is reported, not fatal.
    return "Couldn't Analyze"

  # ``.float()`` keeps the numpy conversion valid for half-precision models.
  scores = softmax(output[0][0].detach().float().cpu().numpy())
  neg_score, neu_score, pos_score = scores[0], scores[1], scores[2]

  polarity = pos_score - neg_score
  if abs(polarity) <= neu_score:
    return 0
  return 1 if polarity > 0 else -1

**SENTIMENT ANALYSIS FOR POST**

For a given post, the analyse_sentiment_comment() function analyzes the top_level_comments of that post and returns either -1, 0 or 1 for each one, based on the above logic. The score (upvotes) of that comment is then added to the dictionary entry corresponding to the returned value. For example, if comment1 is analyzed, the function returns -1 and the comment has 1000 upvotes, then 1000 is added to the entry for '-1' in the dictionary.

After analysing all the top_level_comments for that post, we obtain the summed scores corresponding to 1, 0 and -1. We then calculate the total score (pos_score + neu_score + neg_score) and find the fraction of positive score (pos_score / total score) and the fraction of negative score (neg_score / total score).

To get values from -10 to 10, we multiply the fractions by 10.

The final score is the negation of (overall_pos_score - overall_neg_score), where overall_pos_score = pos_score / total score * 10 and overall_neg_score = neg_score / total score * 10. The sign is flipped because the score expresses the severity of the event (i.e., how negative its impact is).

In [14]:
def get_riskscore(tokenizer, model, post, max_comments=500):
  """Compute an upvote-weighted risk score for a post from its comments.

  Each analysed comment adds its (non-negative) score to the bucket of the
  label returned by ``analyse_sentiment_comment``. The result is
  ``-(pos - neg)`` where ``pos`` and ``neg`` are the positive and negative
  shares of the total weight, scaled by 10.

  Args:
    tokenizer, model: Forwarded to ``analyse_sentiment_comment``.
    post: Submission whose comments are analysed (sorted by 'best').
    max_comments: Maximum number of non-moderator comments to analyse.

  Returns:
    float: Risk score in ``[-10, 10]``; higher means more negative
    sentiment. ``0.0`` when there is no comment weight to evaluate.
  """
  sentiment = {1: 0, 0: 0, -1: 0, "Couldn't Analyze": 0}
  post.comment_sort = 'best'
  post.comments.replace_more(limit=0)

  analysed = 0
  for comment in post.comments.list():
    # Enforce the cap: the former while/for/break construct always walked
    # through every comment.
    if analysed >= max_comments:
      break
    if comment.distinguished == 'moderator':
      continue
    label = analyse_sentiment_comment(tokenizer, model, post, comment)
    # Down-voted comments contribute no weight; a negative weight would
    # push the result outside the -10..10 range.
    sentiment[label] += max(comment.score, 0)
    analysed += 1

  # Comments that could not be analysed are left out of the total.
  total_sentiments = sentiment[1] + sentiment[0] + sentiment[-1]
  if total_sentiments <= 0:
    # No usable comments: report a neutral score instead of dividing by zero.
    return 0.0

  pos_score = (sentiment[1] / total_sentiments) * 10
  neg_score = (sentiment[-1] / total_sentiments) * 10
  # Negated so that predominantly negative sentiment gives a high risk score.
  return -(pos_score - neg_score)



**GET DATE AND URL OF POST**

In [15]:
def get_metadata(post):
  """Return the full URL and the creation date of a post.

  Returns:
    dict: ``'URL'`` is the post permalink prefixed with the reddit.com host;
    ``'date'`` is ``post.created_utc`` (seconds since the epoch) formatted as
    ``YYYY-MM-DD``.
  """
  post_url = "https://www.reddit.com{}".format(post.permalink)
  created_at = pd.to_datetime(post.created_utc, unit='s')
  return dict(URL=post_url, date=created_at.strftime('%Y-%m-%d'))


**FILTER DATAFRAME**

In [42]:
def aggregate_urls_and_risk_score(group):
    """Aggregate one group of rows: average the risk score and dump the URLs to a file.

    Args:
        group: DataFrame slice with the columns 'URL', 'RiskScore', 'Score'
            and 'Date'.

    Returns:
        dict: ``'Date'`` (the group's date when it has exactly one non-null
        date, otherwise ``None``), ``'RiskScore'`` (average of 'RiskScore'
        weighted by 'Score') and ``'URL'`` (path of the text file that lists
        the group's URLs, one per line).
    """
    urls = group['URL'].tolist()

    # The columns may be object-typed; cast so the arithmetic is numeric.
    risk_scores = group['RiskScore'].astype(float)
    weights = group['Score'].astype(float)
    if weights.sum() != 0:
        risk_score = float(np.average(risk_scores, weights=weights))
    else:
        # np.average raises ZeroDivisionError when the weights sum to zero;
        # fall back to the plain mean in that case.
        risk_score = float(risk_scores.mean())

    # Create a file name based on the group index
    file_name = 'reddit_'+'_'.join(map(str, group.index)) + '.txt'

    # Define the file path
    # NOTE(review): this uses Windows separators and depends on sys.path[0]
    # lying inside '...\Gollum\reddit-data-pipeline' — confirm before running
    # on another OS or after sys.path has been modified.
    path = sys.path[0]
    split_path = path.split('\\Gollum\\reddit-data-pipeline')
    result = split_path[0] + '\\'
    dirPath = result + "DataStore\\"
    file_path = os.path.join(dirPath, file_name)

    # Write the URLs to the file
    with open(file_path, 'w') as f:
        f.write('\n'.join(urls))

    return {
        'Date': group['Date'].iloc[0] if group['Date'].count() == 1 else None,
        'RiskScore': risk_score,
        'URL': file_path,
    }

def filter_dataframe(df):
  """Collapse ``df`` to one row per (Event, Class, SubClass, Country) group.

  Every group is reduced with ``aggregate_urls_and_risk_score``; the returned
  dictionaries are expanded into columns and the group keys are restored as
  regular columns.
  """
  group_keys = ['Event', 'Class', 'SubClass', 'Country']
  per_group = df.groupby(group_keys).apply(aggregate_urls_and_risk_score)
  expanded = per_group.apply(pd.Series)
  return expanded.reset_index()

**PROCESS ALL POSTS AND SAVE TO DATAFRAME**

In [17]:
## Collect all the top posts of the subreddit and save it to a dataframe ##
columns = ['Event' , 'Class','SubClass' , 'RiskScore' , 'URL' , 'Date' , 'Country','Score']
df_main = pd.DataFrame(columns=columns)

def _rows_for_post(post, roberta_tokenizer, roberta_model, phi3_tokenizer, phi3_pipe, phi3_generation_args):
  """Return the ``df_main`` rows for one post (one per country), or ``[]`` when it is filtered out."""
  output_1 = run_model_phi3_1(phi3_tokenizer, phi3_pipe, phi3_generation_args, post)
  event_info = get_eventdata(output_1)
  if event_info == 'Not relevent' or event_info['Country'] == ['Not mentioned']:
    return []

  metadata = get_metadata(post)
  outputs_2 = run_model_phi3_2(phi3_tokenizer, phi3_pipe, phi3_generation_args, post, event_info['Country'], event_info['Class'])
  sub_class = get_subclass(outputs_2, event_info['Class'])
  if not sub_class:
    # Nothing to report, so skip the comment download and sentiment pass.
    return []

  risk_score = get_riskscore(roberta_tokenizer, roberta_model, post)
  return [
      [event_info['Event'], event_info['Class'], code, risk_score,
       metadata['URL'], metadata['date'], country, post.score]
      for code, country in zip(sub_class, event_info['Country'])
  ]

def fetch_posts(subreddit_name, roberta_tokenizer, roberta_model, phi3_tokenizer, phi3_pipe, phi3_generation_args):
  """Analyse the top posts of the month of a subreddit and append the rows to ``df_main``.

  A post is processed only when it has more than 40 comments, the first
  Phi-3 prompt marks it as relevant and a country could be inferred. One row
  is produced per country.

  Args:
    subreddit_name: Name passed to ``reddit.subreddit``.
    roberta_tokenizer, roberta_model: Sentiment model used for the risk score.
    phi3_tokenizer, phi3_pipe, phi3_generation_args: Phi-3 pipeline objects.

  Side effects:
    Rebinds the global ``df_main`` to itself concatenated with the new rows.
    Rows gathered before a failure are still appended.
  """
  global df_main
  subreddit = reddit.subreddit(subreddit_name)
  data = []
  try:
    for post in subreddit.top(time_filter='month' , limit=1000):
      # Cheap filter first, so posts that are dropped anyway never reach the LLM.
      if post.num_comments <= 40:
        continue
      try:
        data.extend(_rows_for_post(post, roberta_tokenizer, roberta_model,
                                   phi3_tokenizer, phi3_pipe, phi3_generation_args))
      except Exception as exc:
        # One unparseable post must not abort the rest of the subreddit.
        print(f"Skipping post {getattr(post, 'id', '?')} in {subreddit_name}: {exc!r}")
  except Exception as exc:
    # Raised while listing the subreddit itself (bad name, network, auth, ...).
    print(f"Could not fetch from given subreddit {subreddit_name}: {exc!r}")

  df = pd.DataFrame(data, columns=columns)
  df_main = pd.concat([df_main, df], ignore_index=True)

  # df.to_csv(f'{subreddit}.csv', index=False)

**READ THE SUBREDDIT NAMES FROM A FILE**

In [18]:
## Getting all subreddit community names from a csv file ##
# The subreddit name is read from the first column of every row.
file_name='subreddits.csv'
subreddit_names = []
with open(file_name, 'r', newline='') as file:
  for row in csv.reader(file):
    # csv.reader yields [] for blank lines; skip them (and empty names)
    # instead of failing with an IndexError on row[0].
    if row and row[0].strip():
      subreddit_names.append(row[0].strip())

# MAIN

**CALL FUNCTIONS TO LOAD THE MODELS**

In [None]:
# Load both models once; the returned objects are passed to fetch_posts for every subreddit.
roberta_tokenizer,roberta_model = load_model_roberta()
phi3_tokenizer, phi3_pipe, phi3_generation_args = load_model_phi3()

In [None]:
# Process the subreddits one by one; fetch_posts appends its rows to the global df_main.
for subreddit_name in tqdm(subreddit_names , desc = 'fetching posts from subreddit'):
  fetch_posts(subreddit_name, roberta_tokenizer, roberta_model, phi3_tokenizer, phi3_pipe, phi3_generation_args)
# df_main.to_csv('test2.csv', index=False)

In [22]:
# display(df_main)

Unnamed: 0,Event,Class,SubClass,RiskScore,URL,Date,Country,Score
0,Russia Ukraine Conflict escalation,p,1,5.832826,https://www.reddit.com/r/worldnews/comments/1d...,2024-06-11,Russia,36308
1,Barcelonas tourist apartment regulation change,e,2,9.618131,https://www.reddit.com/r/worldnews/comments/1d...,2024-06-21,Spain,36013
2,RussiaUkraine Conflict escalation,p,1,5.907073,https://www.reddit.com/r/worldnews/comments/1d...,2024-06-26,Ukraine,35608
3,Russia Ukraine Conflict escalation,p,3,9.934957,https://www.reddit.com/r/worldnews/comments/1d...,2024-06-25,Russia,34721
4,Bird flu outbreak in Mexico,p,0,8.28486,https://www.reddit.com/r/worldnews/comments/1d...,2024-06-05,Mexico,33518
5,Russia Ukraine Conflict escalation,p,1,8.820848,https://www.reddit.com/r/worldnews/comments/1d...,2024-06-24,Ukraine,30814
6,Russia Ukraine Conflict escalation,p,1,8.884397,https://www.reddit.com/r/worldnews/comments/1d...,2024-06-08,Russia,30304
7,Russia Ukraine Conflict escalation,p,2,6.652243,https://www.reddit.com/r/worldnews/comments/1d...,2024-06-27,Russia,28218
8,Hajj Pilgrim Deaths in Mecca,p,4,9.242165,https://www.reddit.com/r/worldnews/comments/1d...,2024-06-18,Saudi Arabia,26722


In [43]:
# Collapse df_main to one row per (Event, Class, SubClass, Country) group.
# Side effect: aggregate_urls_and_risk_score writes one URL list file per group.
filtered_df = filter_dataframe(df_main)
# display(filtered_df)

Unnamed: 0,Event,Class,SubClass,Country,Date,RiskScore,URL
0,Barcelonas tourist apartment regulation change,e,2,Spain,2024-06-21,9.618131,reddit_1.txt
1,Bird flu outbreak in Mexico,p,0,Mexico,2024-06-05,8.28486,reddit_4.txt
2,Hajj Pilgrim Deaths in Mecca,p,4,Saudi Arabia,2024-06-18,9.242165,reddit_8.txt
3,Russia Ukraine Conflict escalation,p,1,Russia,,7.221086,reddit_0_6.txt
4,Russia Ukraine Conflict escalation,p,1,Ukraine,2024-06-24,8.820848,reddit_5.txt
5,Russia Ukraine Conflict escalation,p,2,Russia,2024-06-27,6.652243,reddit_7.txt
6,Russia Ukraine Conflict escalation,p,3,Russia,2024-06-25,9.934957,reddit_3.txt
7,RussiaUkraine Conflict escalation,p,1,Ukraine,2024-06-26,5.907073,reddit_2.txt


**SAVE THE DATA TO DATABASE**

In [None]:
# Put the parent directory first on the module search path so that `writer` can be imported
# (presumably a project-local module — verify its location).
# NOTE(review): this changes sys.path[0], which aggregate_urls_and_risk_score reads to build
# its output directory; re-running filter_dataframe after this cell will use '../' as base.
import sys
sys.path.insert(0, '../')
from writer import add_event, default_table_name
table_name = default_table_name
# Pass every aggregated row to add_event.
# NOTE(review): groups merged from several posts carry Date=None — confirm add_event accepts that.
for _, row in filtered_df.iterrows():
  add_event(Event=row['Event'], Class=row['Class'], SubClass=row['SubClass'], RiskScore=row['RiskScore'], URL=row['URL'], Date=row['Date'], Country=row['Country'], table_name=table_name)