## Scraping from xlsx

In [1]:
import pandas as pd
import re
import os

In [2]:
# Number of the source workbook to process. The input files appear to follow the
# pattern "Reddit Eng-Vie-<n>.xlsx", so changing this one value selects another file.
# The export filename in the last cell is hard-coded separately and must be kept in sync.
FILE_INDEX = 51

my_df = pd.read_excel(f"Reddit Eng-Vie-{FILE_INDEX}.xlsx")

In [3]:
def extract_reddit_link(content):
    """Return the Reddit URL that follows a ``Link Reddit:`` label in *content*.

    The extracted link is what the notebook later uses to fetch the thread
    through the Reddit API.

    Args:
        content: Text of one spreadsheet cell. Non-string values (for example
            the NaN pandas produces for an empty cell, or None) are treated as
            "no link".

    Returns:
        The first whitespace-delimited token after ``Link Reddit:`` that
        contains ``redd`` (so both ``reddit.com`` and ``redd.it`` URLs match),
        or ``None`` when no such token exists.
    """
    # Empty Excel cells arrive as float NaN; re.search would raise TypeError on them.
    if not isinstance(content, str):
        return None
    match = re.search(r'Link Reddit:\s*(\S*redd\S*)', content)
    return match.group(1) if match else None

def extract_content_and_username(content):
    """Split a conversation dump into ``(username, comment_text)`` pairs.

    Each comment is expected to start with a header of the form
    ``u/<name> (<anything>)`` followed by a newline. The comment text is then
    captured lazily up to the first of: a run of 20 underscores, the next
    ``u/`` mention (optionally preceded by ``>``), or the end of the text.

    Args:
        content: Text of one spreadsheet cell. Non-string values (for example
            the NaN pandas produces for an empty cell, or None) yield no pairs.

    Returns:
        A list of ``(username, comment_text)`` tuples in order of appearance;
        empty when nothing matches.
    """
    # Empty Excel cells arrive as float NaN; re.findall would raise TypeError on them.
    if not isinstance(content, str):
        return []
    pattern = r'u/(\w+)\s*\([^\)]+\)\n(.+?)(?=_{20}|>u/|u/|$)'
    # DOTALL lets a single comment span several lines.
    return re.findall(pattern, content, re.DOTALL)

In [4]:
def extract_content_username_links(my_df):
    """Flatten every conversation in *my_df* into one row per (user, comment).

    Args:
        my_df: DataFrame with a ``'Vietnamese Content'`` column. Each cell is
            parsed with ``extract_reddit_link`` and
            ``extract_content_and_username``.

    Returns:
        A new DataFrame with the columns ``'Reddit Link'``, ``'Username'`` and
        ``'Vietnamese Content'`` and a fresh 0..n-1 index. It holds one row per
        username/comment pair found, each tagged with the link of the
        conversation it came from. Conversations without any matched comment
        contribute no rows.
    """
    columns = ['Reddit Link', 'Username', 'Vietnamese Content']
    conversations = my_df['Vietnamese Content']

    reddit_links = conversations.apply(extract_reddit_link)
    content_username = conversations.apply(extract_content_and_username)

    # zip() pairs the two Series by position, so this works for any index on
    # my_df (filtered, shuffled, non-integer). Looking values up with
    # range() integers as labels only works on a default RangeIndex.
    rows = [
        {
            'Reddit Link': reddit_link,
            'Username': username,
            'Vietnamese Content': content,
        }
        for reddit_link, conversation in zip(reddit_links, content_username)
        for username, content in conversation
    ]

    # Build the frame once instead of concatenating inside the loop, which is
    # quadratic in the number of conversations.
    return pd.DataFrame(rows, columns=columns)

In [5]:
# Flatten the spreadsheet into one row per (Reddit link, username, comment).
extracted_df = extract_content_username_links(my_df)

In [6]:
# Placeholder column; the matching cell further down fills it with the
# texts fetched from Reddit for the row's username.
extracted_df['English Content'] = None

## Scraping from Reddit

In [7]:
# Setting up credential environment
import praw
import csv
import time
import requests
from requests.exceptions import RequestException

# Credentials are read from the environment so that secrets never live in the
# notebook or its version history. Set REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET
# before starting the kernel; a missing variable raises KeyError right here.
# NOTE(review): the client id/secret that used to be hard-coded in this cell
# should be treated as leaked and regenerated in the Reddit app settings.
reddit = praw.Reddit(client_id=os.environ['REDDIT_CLIENT_ID'],
                     client_secret=os.environ['REDDIT_CLIENT_SECRET'],
                     user_agent=os.environ.get('REDDIT_USER_AGENT',
                                               'Script by /u/No_Union9101'))

In [8]:
def scraping_Reddit(url):
    """Collect the texts written by each user in one Reddit thread.

    Args:
        url: URL of the submission, fetched through the module-level
            ``reddit`` client.

    Returns:
        A dict mapping an author name to the list of texts that author wrote:
        the post body first (when the post has an author and a non-empty
        body), then the comment bodies in the order produced by
        ``submission.comments.list()``. Comments without an author or with an
        empty body are left out.
    """
    submission = reddit.submission(url=url)
    texts_by_author = {}

    # The post itself is the first entry for its author.
    post_author = submission.author
    if post_author is not None and submission.selftext != '':
        texts_by_author[post_author.name] = [submission.selftext]

    # Resolve "load more comments" placeholders before walking the tree.
    submission.comments.replace_more(limit=300)

    # comments.list() yields the comments of the thread, not only top-level ones.
    for comment in submission.comments.list():
        commenter = comment.author
        if commenter is not None and comment.body != '':
            texts_by_author.setdefault(commenter.name, []).append(comment.body)

    return texts_by_author

In [9]:
def handle_429_and_retry(exception, retry_after=300):
    """Back off after an HTTP 429 (rate limit) error, or report any other error.

    Despite the name, this helper only waits; re-issuing the failed request is
    left to the caller.

    Args:
        exception: The error raised by the request. It is expected to carry
            the HTTP response (or None) in a ``response`` attribute, as
            ``requests`` exceptions do. Exceptions without that attribute are
            reported like any other error.
        retry_after: Seconds to sleep when the rate limit was hit.

    Returns:
        True if a 429 response was detected and the function slept,
        False otherwise.
    """
    response = getattr(exception, 'response', None)
    # Compare against None explicitly: a requests.Response is falsy for every
    # 4xx/5xx status, so a plain truthiness test would never detect a 429.
    if response is not None and response.status_code == 429:
        print(f"Rate limit exceeded. Sleeping for {retry_after} seconds...")
        time.sleep(retry_after)
        return True

    print(f"An error occurred: {exception}")
    return False

In [None]:
import time
from requests.exceptions import RequestException

# Comments of the thread currently being matched. Rows that share a link are
# matched against the same fetch, so each run of equal links is downloaded once.
current_link = None
content_rows = {}

print("Working on it")

for index, row in extracted_df.iterrows():
    raw_link = row["Reddit Link"]
    username = row["Username"]

    # Skip rows without a usable link. The link is None when no
    # "Link Reddit:" label was found, and calling .replace on it would raise.
    if not isinstance(raw_link, str):
        continue
    link = raw_link.replace(" ", "")
    if not link:
        continue

    # Fetch content if the link changes
    if link != current_link:
        current_link = link
        # Forget the previous thread first, so that a failed fetch can never
        # leave rows being matched against another thread's comments.
        content_rows = {}
        try:
            content_rows = scraping_Reddit(current_link)
            print(f"Successfully fetched content for {link}")
        # The exception variables are deliberately not named `re`: an
        # `except ... as re` clause rebinds and then deletes that name, which
        # would remove the `re` module the extraction helpers depend on.
        except RequestException as request_error:
            # Only waits/reports; the rows of this link simply stay unmatched.
            handle_429_and_retry(request_error)
        except TypeError as type_error:
            print(f"Type Error occurred: {type_error}")
        except Exception as error:
            # Best effort: report and carry on with the next link instead of
            # aborting the whole run.
            print(f"Unexpected error: {error}")

    # Match the username and assign English content
    if username in content_rows:
        extracted_df.at[index, 'English Content'] = content_rows[username]


Working on it
Successfully fetched content for https://redd.it/114eoci
Successfully fetched content for https://redd.it/91qmh7


In [None]:
# Preview the first 50 rows to spot-check the matches.
extracted_df.head(50)

In [None]:
# Displays only the rows without missing values. dropna() returns a new frame
# and the result is not assigned, so extracted_df itself is left unchanged.
# NOTE(review): the export in the next cell therefore still contains the
# unmatched rows — confirm that this is intended.
extracted_df.dropna()

In [None]:
# Save the result. By default to_excel also writes the DataFrame index as the
# first column of the sheet.
extracted_df.to_excel("matched_content51.xlsx")