# Read Dataset

In [26]:
import pandas as pd
import time
from concurrent.futures import ThreadPoolExecutor, as_completed, TimeoutError



In [27]:
# Read the Reddit finance comments dataset into a DataFrame
df = pd.read_csv("Finance_50K_English.csv")

# Preview the first five comment texts
print("Loaded comments:", df["text"].head(), sep="\n")

# Keep plain Python lists of the comment texts and their topic labels
# so later cells can index them directly
comments, topics = (df[column].tolist() for column in ("text", "label"))


Loaded comments:
0     picking up ~~pennies~~ k in front of a bulldozer
1    for months it wont meet the tests for without ...
2    gamble it again and you wont even have to go t...
3    i would not expect a lot of rate cuts unless t...
4                    thought this was common knowledge
Name: text, dtype: object


# Configurations for the API 

In [28]:
import os

import google.generativeai as genai

# SECURITY: never embed the API key in the notebook. The key is read from the
# GOOGLE_API_KEY environment variable instead. Any key that was previously
# committed in this cell has been exposed and should be revoked and rotated.
api_key = os.environ.get("GOOGLE_API_KEY")
if not api_key:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set. Export it in the environment that starts "
        "the Jupyter kernel before running this cell."
    )

genai.configure(api_key=api_key)

# Shared model handle used by classify_sentiment in the cells below.
model = genai.GenerativeModel("gemini-2.0-flash")

## Define the prompt that Gemini uses for the labeling task

In [29]:
def format_prompt(comment, topic):
    """Build the sentiment-classification prompt for a single comment.

    Args:
        comment: Text of the comment to classify; interpolated into the prompt.
        topic: Topic of the comment; interpolated into the prompt (the notebook
            passes values from the dataset's ``label`` column).

    Returns:
        The complete prompt as one string, ending with a newline.
    """
    # The four-space indent on the second and third lines is part of the
    # prompt text itself and is kept exactly as written.
    prompt_lines = [
        "You are a sentiment analysis classifier. For each input, respond with only one of the following labels:",
        "    'Very positive', 'positive', 'neutral', 'negative', 'very negative'. Do not include any explanations or additional text. Just respond with the label.",
        "    The data is a comment from reddit website from Finance-Related area.",
        "",
        f'The Topic of the comment is: "{topic}"',
        "Classify the sentiment of the following comment:",
        f'"{comment}"',
        "",  # yields the trailing newline after the quoted comment
    ]
    return "\n".join(prompt_lines)

In [30]:
def classify_sentiment(comment, topic):
    """Ask the Gemini model for the sentiment label of one comment.

    Args:
        comment: Comment text to classify.
        topic: Topic of the comment, passed through to ``format_prompt``.

    Returns:
        The text of the model's response with surrounding whitespace stripped.
    """
    prompt = format_prompt(comment, topic)
    reply = model.generate_content(prompt)
    return reply.text.strip()

A quick test on the first five comments

In [31]:
# Smoke test: classify the first five comments and print
# "prediction---topic---comment" for each one.
for row_number in range(5):
    sample_comment = comments[row_number]
    sample_topic = topics[row_number]
    predicted = classify_sentiment(sample_comment, sample_topic)
    print("---".join([predicted, sample_topic, sample_comment]))
  

negative---r/wallstreetbets---picking up ~~pennies~~ k in front of a bulldozer
neutral---r/tax---for months it wont meet the tests for without recapture
positive---r/wallstreetbets---gamble it again and you wont even have to go through college why stop now have some real ambition
negative---r/investing---i would not expect a lot of rate cuts unless trump fires powell and implements erdogan onomics
neutral---r/wallstreetbets---thought this was common knowledge


# We Need Something Faster for 50K calls

In [32]:
# --- classify function with timeout and retry ---
def classify_with_retry(text, label, timeout=20, max_attempts=None, backoff=1.0):
    """Classify one comment, retrying after a timeout or an error.

    Each attempt runs ``classify_sentiment(text, label)`` in a throwaway
    single-worker executor so the wait can be bounded by ``timeout``.

    Args:
        text: Comment text forwarded to ``classify_sentiment``.
        label: Topic forwarded to ``classify_sentiment``.
        timeout: Maximum number of seconds to wait for a single attempt.
        max_attempts: Maximum number of attempts. ``None`` (the default)
            retries indefinitely.
        backoff: Base delay in seconds before a retry. The delay doubles with
            each retry and is capped at 30 seconds; ``0`` disables the delay.

    Returns:
        Whatever ``classify_sentiment`` returns on the first successful attempt.

    Raises:
        RuntimeError: If ``max_attempts`` is set and every attempt failed. The
            last underlying exception is attached as ``__cause__``.
    """
    attempt = 1
    while True:
        single_executor = ThreadPoolExecutor(max_workers=1)
        future = single_executor.submit(classify_sentiment, text, label)
        try:
            return future.result(timeout=timeout)
        except TimeoutError as exc:
            last_error = exc
            reason = f"Timeout (attempt {attempt})"
        except Exception as exc:
            last_error = exc
            reason = f"Error: {exc}"
        finally:
            # Do NOT wait for the worker here: a `with` block (shutdown with
            # wait=True) would block until the hung call returns, which makes
            # the timeout above ineffective. The abandoned worker thread ends
            # on its own once its call finishes.
            single_executor.shutdown(wait=False)

        if max_attempts is not None and attempt >= max_attempts:
            raise RuntimeError(
                f"{reason}. Giving up after {attempt} attempt(s)."
            ) from last_error

        print(f"{reason}. Retrying...")
        attempt += 1
        if backoff > 0:
            # Exponential backoff so a failing or rate-limited API is not
            # hit in a tight loop; the exponent is clamped to keep the
            # arithmetic small on long retry runs.
            time.sleep(min(backoff * 2 ** min(attempt - 2, 10), 30.0))

# --- Parallel processing function ---
def process_row(index, row):
    """Return ``(index, label)`` for one row, classifying it only when needed.

    Args:
        index: Row identifier, echoed back so the caller can place the result.
        row: Mapping-like record with ``"predicted_label"``, ``"text"`` and
            ``"label"`` entries.

    Returns:
        A tuple of ``index`` and either the row's existing ``predicted_label``
        (when it is not the empty string) or a fresh prediction from
        ``classify_with_retry``.
    """
    existing = row["predicted_label"]
    if existing != "":
        # Already labelled: hand the stored value straight back.
        return index, existing

    comment_text, topic = row["text"], row["label"]
    print(f"Processing row {index}")
    return index, classify_with_retry(comment_text, topic)

# ITERATIVE SINGLE-THREAD APPROACH

In [None]:

# Add a column for the labels if it doesn't exist yet
if "predicted_label" not in df.columns:
    df["predicted_label"] = ""

try:
    # Loop through each row. Row labels are 0..len(df)-1 because df comes
    # from read_csv with its default index.
    for i in range(len(df)):
        # Skip already processed rows
        if df.at[i, "predicted_label"] != "":
            continue

        label = df.at[i, "label"]
        text = df.at[i, "text"]
        print(f"\n[{i}] Label: {label}")
        print(f"Text: {text}")

        prediction = classify_with_retry(text, label)

        # Store the prediction directly in the DataFrame
        df.at[i, "predicted_label"] = prediction

        # Periodic checkpoint every 100 rows
        if i % 100 == 0:
            df.to_csv("classified_reddit_finance_comments.csv", index=False)
            print(f"💾 Progress saved at row {i}")
finally:
    # Always write what has been labelled so far. This runs on normal
    # completion, on errors and on KeyboardInterrupt, so predictions made
    # since the last checkpoint are not left only in memory.
    df.to_csv("classified_reddit_finance_comments.csv", index=False)
    print("💾 Final progress saved")

print("All done!")



[0] Label: r/wallstreetbets
Text: picking up ~~pennies~~ k in front of a bulldozer
💾 Progress saved at row 0

[1] Label: r/tax
Text: for months it wont meet the tests for without recapture

[2] Label: r/wallstreetbets
Text: gamble it again and you wont even have to go through college why stop now have some real ambition

[3] Label: r/investing
Text: i would not expect a lot of rate cuts unless trump fires powell and implements erdogan onomics

[4] Label: r/wallstreetbets
Text: thought this was common knowledge

[5] Label: r/wallstreetbets
Text: keep that k safe but lets ride the bull wave whats your next big bet

[6] Label: r/investing
Text: it also takes a supernova to create goldso that’s different

[7] Label: r/wallstreetbets
Text: thanks man i’ve googled the hell out puts options and you couldn’t have shed better light on it

[8] Label: r/wallstreetbets
Text: tsla pump is more fake than bezos wife’s boobies

[9] Label: r/personalfinance
Text: a year should be enough time to start 

KeyboardInterrupt: 

# PARALLEL MULTI-THREAD APPROACH (13X FASTER)

Here, I only tested about 3K calls to show that the labeling code works; the full run took several hours to produce the 50K labels for our dataset.

In [None]:
# --- Main ---
# Label every row whose predicted_label is still empty, using a thread pool,
# and checkpoint the DataFrame to CSV as results arrive.
if "predicted_label" not in df.columns:
    df["predicted_label"] = ""

max_threads = 10  # test with 10–30 based on API stability
futures = []
with ThreadPoolExecutor(max_workers=max_threads) as executor:
    try:
        # Queue only the rows that still need a label.
        for i, row in df[df["predicted_label"] == ""].iterrows():
            futures.append(executor.submit(process_row, i, row))

        for idx, future in enumerate(as_completed(futures)):
            try:
                i, prediction = future.result()
                df.at[i, "predicted_label"] = prediction
            except Exception as e:
                # The row keeps its empty label and is picked up on a re-run.
                print(f"Error processing a row: {e}")

            # Periodic checkpoint every 100 completed rows
            if idx % 100 == 0:
                df.to_csv("classified_reddit_finance_comments.csv", index=False)
                print(f"Saved progress at batch {idx}")
    finally:
        # Drop work that has not started, so that leaving the `with` block
        # (which waits for the pool) only waits for calls already in flight
        # rather than the whole backlog. cancel() has no effect on futures
        # that are running or finished.
        for queued in futures:
            queued.cancel()
        # Always persist what has been labelled so far, including after an
        # interrupt or an unexpected error.
        df.to_csv("classified_reddit_finance_comments.csv", index=False)
        print("Saved final progress")

print("Done!")

Processing row 0
Processing row 1
Processing row 2
Processing row 3
Processing row 4
Processing row 5
Processing row 6
Processing row 7
Processing row 8
Processing row 9
Processing row 10
Processing row 11
Processing row 12
Processing row 13
Processing row 14
Processing row 15
💾 Saved progress at batch 0
Processing row 16
Processing row 17
Processing row 18
Processing row 19
Processing row 20
Processing row 21
Processing row 22
Processing row 23
Processing row 24
Processing row 25
Processing row 26
Processing row 27
Processing row 28
Processing row 29
Processing row 30
Processing row 31
Processing row 32
Processing row 33
Processing row 34
Processing row 35
Processing row 36
Processing row 37
Processing row 38
Processing row 39
Processing row 40
Processing row 41
Processing row 42
Processing row 43
Processing row 44
Processing row 45
Processing row 46
Processing row 47
Processing row 48
Processing row 49
Processing row 50
Processing row 51
Processing row 52
Processing row 53
Processing