# Just a playground to test various things :)

In [1]:
import pandas as pd
import re
import requests

In [2]:
# Read the raw posts dataset into a DataFrame and preview the first rows
df_posts = pd.read_json(path_or_buf='../data/dataset.json')
df_posts.head(5)

Unnamed: 0,timestamp,text,text_id,user,user_id
0,2024-10-31,Running a business means juggling countless ad...,2018569761,danielwoodard,1077866112
1,2024-10-31,Liz Truss is walking in the lingering shadow o...,2092717718,nelsonjacqueline,1089670430
2,2024-10-31,The UK is bracing for war as government buildi...,2059143248,ihooper,1007478642
3,2024-10-31,Marrying a second or third cousin once removed...,2008209828,wrightnicholas,1039258480
4,2024-10-31,It's truly disgraceful how the Indian National...,2001239278,michael51,1021455936


In [9]:
# Posts whose text contains "unity" (case-insensitive); missing text counts as no match
df_filtered = df_posts.loc[
    df_posts['text'].str.contains(r'unity', case=False, na=False)
]

df_filtered

Unnamed: 0,timestamp,text,text_id,user,user_id
11,2024-10-31 00:00:01,Discover the rich history of African Americans...,2024298230,kristen10,1040149678
19,2024-10-31 00:00:01,'@myersteresa: @yhawkins @mindy52 @bairdsummer...,2073878409,jgutierrez,1006754286
153,2024-10-31 00:01:29,HUDSON — Hudson Community School District prop...,2025362382,jesse78,1041120696
162,2024-10-31 00:01:38,@daniellelogan: #PENYLAN PCSO Jones is out and...,2015019832,blackwelldiane,1039831910
240,2024-10-31 00:02:58,"Sapang Dalaga MPS PNP Personnel, led by Office...",2022877822,leah53,1036411239
...,...,...,...,...,...
70106,2024-10-31 23:56:06,Exciting opportunity for a Technical Officer i...,2003599756,garciaanna,1089334060
70123,2024-10-31 23:56:40,"Looking for big money, better learning, and hi...",2090908311,mary97,1062003668
70134,2024-10-31 23:57:02,Need to boost your online presence? I can help...,2010995738,montgomerycarlos,1081307182
70137,2024-10-31 23:57:05,Exciting event alert! Dr. Githinji Gitahi disc...,2021972796,kgarcia,1078777110


In [4]:
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Dict, List, Set, Union

import pandas as pd
import requests
from requests.adapters import HTTPAdapter
from tqdm import tqdm  # For progress tracking
from urllib3.util.retry import Retry

# Constants
BATCH_SIZE = 1000  # number of posts sliced off per process_batch call
MAX_WORKERS = 20  # thread count for the ThreadPoolExecutor that checks URLs

# 1. Optimized URL pattern regex
# Matches links that start with http:// or https:// (optionally followed by
# "www.") or with a bare "bit.ly/" prefix, and extends the match until the next
# whitespace, angle bracket, or quote character.
# NOTE: punctuation directly after a link (e.g. "." or ")") is not excluded by
# the character class, so it is captured as part of the link.
url_pattern = re.compile(r'(?:https?:\/\/(?:www\.)?|bit\.ly\/)[^\s<>"\']+')

# 2. Session management
def create_session():
    """Build a ``requests.Session`` with retrying, pooled HTTP(S) adapters.

    The same adapter is mounted for both ``http://`` and ``https://``. It
    retries up to 2 times with a 0.5 backoff factor on 500/502/503/504
    responses and keeps a pool of 100 connections.

    Returns:
        The configured ``requests.Session``.
    """
    retries = Retry(total=2, backoff_factor=0.5, status_forcelist=[500, 502, 503, 504])
    pooled_adapter = HTTPAdapter(max_retries=retries, pool_connections=100, pool_maxsize=100)

    http = requests.Session()
    for scheme in ("http://", "https://"):
        http.mount(scheme, pooled_adapter)
    return http

# 3. URL checking function
def check_url_status(url: str) -> Union[int, str]:
    """Send a HEAD request to ``url`` and report the outcome.

    Links without an ``http://``/``https://`` prefix (the URL regex also
    captures bare ``bit.ly/...`` links) are requested over ``https://``.

    Args:
        url: Link to probe, with or without a scheme prefix.

    Returns:
        The status code of the final response after following redirects, or
        the string ``"Failed"`` when the request raises a ``requests``
        exception (timeout, connection error, exhausted retries, ...).
    """
    # requests raises MissingSchema (a RequestException) for scheme-less URLs,
    # so without this every bare short link would be reported as "Failed".
    has_scheme = url.lower().startswith(("http://", "https://"))
    target = url if has_scheme else f"https://{url}"

    session = create_session()
    try:
        response = session.head(target, timeout=3, allow_redirects=True)
        return response.status_code
    except requests.exceptions.RequestException:
        return "Failed"
    finally:
        # Always release the pooled connections held by this per-call session.
        session.close()

# 4. Process unique URLs in parallel
def process_unique_urls(urls: Set[str]) -> Dict[str, Union[int, str]]:
    """Check every URL concurrently and map each one to its result.

    Args:
        urls: Unique links to probe with ``check_url_status``.

    Returns:
        A dict from each URL to the value returned by ``check_url_status``
        (an HTTP status code or ``"Failed"``). A URL whose check raised an
        unexpected exception is also mapped to ``"Failed"``.
    """
    url_status_map: Dict[str, Union[int, str]] = {}
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        # Submit one task per URL and remember which future belongs to which URL
        future_to_url = {executor.submit(check_url_status, url): url for url in urls}

        # as_completed yields futures in completion order, so the progress bar
        # advances as results arrive instead of stalling behind a slow URL.
        # It has no len(), hence the explicit total.
        finished = as_completed(future_to_url)
        for future in tqdm(finished, total=len(future_to_url), desc="Checking URLs"):
            url = future_to_url[future]
            try:
                url_status_map[url] = future.result()
            except Exception:
                # Best effort: one bad URL must not abort the whole run.
                url_status_map[url] = "Failed"

    return url_status_map

# 5. Process batches of posts
def process_batch(df_batch: pd.DataFrame, url_status_map: Dict[str, Union[int, str]]) -> List[dict]:
    """Expand a batch of posts into one record per link found in their text.

    Args:
        df_batch: Posts to scan; must have a ``'text'`` column.
        url_status_map: Previously computed status for each link.

    Returns:
        A list of dicts with keys ``'text'``, ``'link'`` and ``'status'``, in
        post order and then match order. Links absent from ``url_status_map``
        get the status ``"Failed"``. Posts whose text is not a string (e.g.
        missing values) contribute no records.
    """
    expanded_rows = []

    # Only the text column is needed, so iterate it directly instead of
    # building a full row Series per post with iterrows().
    for text in df_batch['text']:
        if not isinstance(text, str):
            # NaN/None text would make the regex raise TypeError.
            continue
        for link in url_pattern.findall(text):
            expanded_rows.append({
                'text': text,
                'link': link,
                'status': url_status_map.get(link, "Failed")
            })

    return expanded_rows

# 6. Main processing logic
def process_posts(file_path: str) -> pd.DataFrame:
    """Extract every link from the posts in a JSON file and attach its status.

    Args:
        file_path: Path to a JSON file readable by ``pd.read_json`` that
            yields a ``'text'`` column.

    Returns:
        A DataFrame with columns ``'text'``, ``'link'`` and ``'status'``, one
        row per link occurrence. The columns are present even when no links
        are found.
    """
    # Read data
    df_posts = pd.read_json(file_path)

    # Keep only posts whose text is a string: the regex raises TypeError on
    # NaN/None. astype(bool) keeps the mask boolean even for an empty frame.
    is_text = df_posts['text'].apply(lambda value: isinstance(value, str)).astype(bool)
    df_posts = df_posts.loc[is_text]

    # Extract and deduplicate all URLs
    print("Extracting unique URLs...")
    all_urls = set()
    for text in df_posts['text']:
        all_urls.update(url_pattern.findall(text))
    print(f"Found {len(all_urls)} unique URLs")

    # Check each unique URL once, however many posts mention it
    print("Checking URL statuses...")
    url_status_map = process_unique_urls(all_urls)

    # Process posts in batches
    all_expanded_rows = []
    total_batches = (len(df_posts) + BATCH_SIZE - 1) // BATCH_SIZE  # ceiling division

    print("Processing posts in batches...")
    for i in tqdm(range(0, len(df_posts), BATCH_SIZE), total=total_batches):
        batch = df_posts.iloc[i:i + BATCH_SIZE]
        all_expanded_rows.extend(process_batch(batch, url_status_map))

    # Explicit columns keep the schema stable when no links were found
    return pd.DataFrame(all_expanded_rows, columns=['text', 'link', 'status'])

# Usage: run the whole pipeline on the dataset and report the number of
# extracted links. file_path and df_links are left as notebook-level variables.
if __name__ == "__main__":
    file_path = '../data/dataset.json'
    df_links = process_posts(file_path)
    print(f"Processed {len(df_links)} total links")

Extracting unique URLs...
Found 19423 unique URLs
Checking URL statuses...


Checking URLs: 100%|██████████| 19423/19423 [07:31<00:00, 43.02it/s] 


Processing posts in batches...


100%|██████████| 71/71 [00:00<00:00, 82.89it/s]

Processed 28236 total links





In [5]:
# Full text of the post with text_id 2007641313
df_posts.loc[df_posts['text_id'] == 2007641313, 'text'].values[0]

'Here we go again! @jessegarcia @donald91 @oschroeder @caitlinoconnell @ashley91 @mallory83 @jacqueline92 @wjohnson @omendez @xrodriguez @vjames 😂'

In [10]:
# Load the preprocessed posts written to the output directory
data = pd.read_csv(filepath_or_buffer='../output/preprocessed.csv')

In [7]:
# Preprocessed posts whose text contains "uk" (case-insensitive, substring match,
# so it also matches words such as "ukraine"); missing text counts as no match
df_preprocessed_filterd = data.loc[
    data['text'].str.contains(r'uk', case=False, na=False)
]

df_preprocessed_filterd

Unnamed: 0,timestamp,text,text_id,user,user_id,hashtags,mentions,frequency,language
2,00:00:00,uk brace war government building london raise ...,2059143248,ihooper,1007478642,"['#Ukrainewashed', '#WarPreparedness']",[],1,en
9,00:00:00,hey guy instead steal toilet ukraine focus hel...,2059294175,williamellis,1013529471,['#HelpUkraine'],[],1,en
16,00:00:01,kennan word true today tension rise russia rat...,2094157435,nguyenashley,1065143559,"['#War', '#Russia', '#Ukraine', '#Geopolitics']",[],1,en
26,00:00:04,britain democratic process criticize vladimir ...,2072583710,jason42,1063518094,['#Ukraine'],"['andrew07', 'jamesperry', 'brownjames']",1,en
56,00:00:27,miss chance host greatbritishgardenparty raise...,2078726031,brandi66,1042463889,"['#GreatBritishGardenParty', '#fundraising']",[],1,en
...,...,...,...,...,...,...,...,...,...
68384,23:55:52,putin inability win end war ukraine mirror gor...,2045812750,millerchristopher,1096112886,"['#politics', '#Putin', '#RussiaUkraineCrisis']",[],1,en
68486,23:58:13,impact ukraine crisis energy price future caus...,2021630615,williamharris,1096529011,"['#Ukraine', '#energyprices', '#futures']",[],1,en
68490,23:58:18,ukraine political landscape complex think rece...,2015014492,ashley84,1025815116,"['#Ukraine', '#politics']",[],1,en
68496,23:58:29,discover funding option boost innovation busin...,2098428835,fosterchad,1060790646,"['#innovateukedge', '#Solent2050', '#SolentBus...","['steven25', 'sylviaharris']",1,en


In [8]:
# Rows of the preprocessed data whose 'text' value is not a Python str
non_str_texts = data.loc[
    ~data['text'].apply(lambda value: isinstance(value, str))
]
non_str_texts

Unnamed: 0,timestamp,text,text_id,user,user_id,hashtags,mentions,frequency,language
