# Full Scale Sentiment Analysis Pipeline
## **Course:** Advanced NLP
### This notebook processes all data, handles interruptions via checkpointing, and generates visualizations.

In [None]:
import sys
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ollama
from tqdm import tqdm

## Add scripts path

In [None]:
# Put the parent of the current working directory on the import path so the
# local `scripts` package can be imported from inside the notebook folder.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), os.pardir)))
from scripts.preprocessor import TextPreprocessor

## Set visual style

In [None]:
# Apply seaborn's "whitegrid" theme; it affects every figure drawn afterwards.
sns.set_theme(style="whitegrid")

## 1. Setup & Configuration

In [None]:
# --- Locations on disk (relative to the notebook's working directory) ---
DATA_DIR = "../data"
PROCESSED_DIR = "../data/processed"
RESULTS_DIR = "../results"

# --- Inference settings ---
MODEL_NAME = "llama3"
SOURCES = [
    "kafiha",
    "TweetyChannel",
    "radiofarda",
    "iranintlTV",
    "bbcpersian",
]

# Incremental results are written here so an interrupted run can be resumed.
CHECKPOINT_FILE = os.path.join(PROCESSED_DIR, "full_data_checkpoint.csv")

# Create the output folders up front; existing folders are left untouched.
for _out_dir in (PROCESSED_DIR, RESULTS_DIR):
    os.makedirs(_out_dir, exist_ok=True)

## 2. Data Aggregation & Cleaning

In [None]:
# Load every per-channel export that exists and tag each row with its channel.
all_dfs = []
missing_sources = []
for source in SOURCES:
    file_path = os.path.join(DATA_DIR, f"{source}_messages.csv")
    if not os.path.exists(file_path):
        # Keep going with the channels we do have, but remember the gap so it
        # is reported instead of silently shrinking the dataset.
        missing_sources.append(source)
        continue
    df_temp = pd.read_csv(file_path)
    df_temp['source'] = source
    all_dfs.append(df_temp)

if missing_sources:
    print(f"Warning: no '<source>_messages.csv' found for {missing_sources}; skipping them.")

# pd.concat([]) fails with an opaque "No objects to concatenate"; fail early
# with a message that points at the actual problem.
if not all_dfs:
    raise FileNotFoundError(
        f"No '<source>_messages.csv' files found in {DATA_DIR!r} for sources {SOURCES}."
    )

full_df = pd.concat(all_dfs, ignore_index=True)
full_df['date'] = pd.to_datetime(full_df['date'])

## Fill NaNs

In [None]:
# Replace missing values with empty strings in both free-text columns so the
# cleaning step below always receives a string.
for _text_column in ('text', 'emoji_reactions'):
    full_df[_text_column] = full_df[_text_column].fillna('')

## Preprocessing

In [None]:
# Register `progress_apply` on pandas objects before it is used below.
tqdm.pandas()
preprocessor = TextPreprocessor()

print("Cleaning text (this may take a while)...")

# Post bodies are cleaned with a progress bar.
full_df['clean_post_text'] = full_df['text'].progress_apply(preprocessor.clean_text)

# Reaction strings go through a plain `apply` (no progress bar).
full_df['clean_reactions'] = full_df['emoji_reactions'].apply(preprocessor.clean_reactions)

## Filter noise

In [None]:
# Keep only posts whose cleaned text is longer than 5 characters, then
# renumber the rows from 0 so positional indexing matches the row count.
full_df = full_df.loc[full_df['clean_post_text'].str.len().gt(5)].reset_index(drop=True)
print(f"Ready to process {full_df.shape[0]} posts.")

## 3. Inference Loop (With Checkpointing)

In [None]:
def get_sentiment_label(text, reactions):
    """
    Classify one post into a Persian sentiment label using few-shot prompting.

    The post text and its reactions are sent to the Ollama chat model named by
    ``MODEL_NAME`` together with a system prompt that lists the six allowed
    labels and five labelled examples.

    Args:
        text: Post text to classify (interpolated into the user message).
        reactions: Reactions for the post (interpolated into the user message).

    Returns:
        The first label of ``valid`` (in list order) that occurs in the model's
        reply; "خنثی" when the reply contains none of them; or the string
        "Error" when the chat request or reply parsing raises an exception.
    """
    valid = ['خوشحال', 'ناراحت', 'عصبانی', 'مضطرب', 'خنثی', 'نگران']

    prompt_text = f"متن پست: {text}\nواکنش‌ها: {reactions}"

    system_prompt = (
        "تو یک تحلیلگر دقیق احساسات فارسی هستی. بر اساس مثال‌های زیر، پست جدید را طبقه‌بندی کن.\n"
        "دسته‌بندی‌ها: ['خوشحال', 'ناراحت', 'عصبانی', 'مضطرب', 'خنثی', 'نگران']\n\n"
        "--- مثال‌های آموزشی ---\n"
        "مثال ۱ (تبلیغات/خبر): تور استانبول ویژه بلک فرایدی. -> خنثی\n"
        "مثال ۲ (غم/دلتنگی): فراموش کردنت سخته عزیزم. -> ناراحت\n"
        "مثال ۳ (طنز/موفقیت): وای چقدر خوشحالم تموم شد. -> خوشحال\n"
        "مثال ۴ (خشم/اعتراض): چرا وضعیت اینترنت اینطوریه؟ -> عصبانی\n"
        "مثال ۵ (نگرانی/پول): آخر ماه شد و حقوق نریختن. -> نگران\n"
        "--- پایان مثال‌ها ---\n"
        "فقط برچسب را بنویس."
    )

    try:
        response = ollama.chat(model=MODEL_NAME, messages=[
            {'role': 'system', 'content': system_prompt},
            {'role': 'user', 'content': prompt_text},
        ])
        label = response['message']['content'].strip()
    except Exception:
        # Deliberately not a bare `except:`: KeyboardInterrupt / SystemExit
        # must propagate so that interrupting a long run actually stops it
        # instead of being recorded as one more "Error" row.
        return "Error"

    # The model may wrap the label in extra words, so match by substring.
    for v in valid:
        if v in label:
            return v
    # Unrecognised reply: fall back to the neutral label.
    return "خنثی"

## Resume Logic

In [None]:
# Resume from the checkpoint when one exists: rows are processed in order, so
# the number of rows already saved is the position of the next row to label.
if os.path.exists(CHECKPOINT_FILE):
    processed_df = pd.read_csv(CHECKPOINT_FILE)
    start_idx = len(processed_df)
    print(f"Resuming from index {start_idx}...")
else:
    processed_df = pd.DataFrame()
    start_idx = 0

batch_size = 50
new_rows = []
# Checkpoints are written to this sibling file and then swapped into place, so
# an interrupt during the write can never leave a truncated checkpoint behind.
checkpoint_tmp = CHECKPOINT_FILE + ".tmp"

print("Starting/Resuming Inference...")
for i in tqdm(range(start_idx, len(full_df))):
    row = full_df.iloc[i]
    sentiment = get_sentiment_label(row['clean_post_text'], row['clean_reactions'])

    r_dict = row.to_dict()
    r_dict['sentiment'] = sentiment
    new_rows.append(r_dict)

    if len(new_rows) >= batch_size:
        temp_df = pd.DataFrame(new_rows)
        updated_df = pd.concat([processed_df, temp_df], ignore_index=True) if not processed_df.empty else temp_df
        updated_df.to_csv(checkpoint_tmp, index=False, encoding='utf-8-sig')
        os.replace(checkpoint_tmp, CHECKPOINT_FILE)
        # Commit the in-memory state only after the file is safely on disk.
        # If the write is interrupted, `processed_df` and `new_rows` still
        # describe the batch exactly once, so nothing is lost or duplicated.
        processed_df, new_rows = updated_df, []

## Save remaining

In [None]:
# Flush the rows labelled since the last full batch.
if new_rows:
    temp_df = pd.DataFrame(new_rows)
    processed_df = pd.concat([processed_df, temp_df], ignore_index=True) if not processed_df.empty else temp_df
    # Write to a sibling file and swap it in, so an interrupt during the write
    # cannot leave a truncated checkpoint behind.
    final_tmp = CHECKPOINT_FILE + ".tmp"
    processed_df.to_csv(final_tmp, index=False, encoding='utf-8-sig')
    os.replace(final_tmp, CHECKPOINT_FILE)
    # Clear the buffer so re-running this cell does not append the same rows
    # to the checkpoint a second time.
    new_rows = []

print("Done!")

## 4. Visualization

In [None]:
# Reload the labelled posts from the checkpoint, parse the timestamps, and
# derive a monthly period column used to bucket posts by calendar month.
df = pd.read_csv(CHECKPOINT_FILE).assign(
    date=lambda frame: pd.to_datetime(frame['date']),
    month_year=lambda frame: frame['date'].dt.to_period('M'),
)

## 4.1 Trend Line Plot

In [None]:
# Posts per (month, sentiment), one column per sentiment label.
trend = df.groupby(['month_year', 'sentiment']).size().unstack(fill_value=0)

# Insert months that had no posts as all-zero rows so the time axis has no
# gaps. This replaces `.resample('M').sum()`: resampling a PeriodIndex is
# deprecated in recent pandas, and the explicit reindex gives the same table.
if not trend.empty:
    all_months = pd.period_range(trend.index.min(), trend.index.max(), freq='M')
    trend = trend.reindex(all_months, fill_value=0)

plt.figure(figsize=(14, 7))
# Only these four labels are drawn; any label missing from the data is skipped.
for col in ['خوشحال', 'ناراحت', 'عصبانی', 'نگران']:
    if col in trend.columns:
        plt.plot(trend.index.astype(str), trend[col], label=col)
plt.title("Sentiment Trends (2020-2025)")
plt.legend()
plt.xticks(rotation=45)
plt.savefig(os.path.join(RESULTS_DIR, "trend_plot.png"))
plt.show()

## 4.2 Hope vs Despair Bar Plot

In [None]:
def map_mood(s):
    """Collapse a sentiment label into one of three coarse mood buckets.

    Returns 'Hope/Positive' for the happy label, 'Despair/Negative' for the
    sad, angry, worried and anxious labels, and 'Neutral' for anything else.
    """
    negative_labels = ('ناراحت', 'عصبانی', 'نگران', 'مضطرب')
    if s in negative_labels:
        return 'Despair/Negative'
    return 'Hope/Positive' if s == 'خوشحال' else 'Neutral'

df['mood'] = df['sentiment'].apply(map_mood)

# "Error" is the placeholder stored when the model call failed for a post.
# Leave those rows out of the counts so failures are not reported as Neutral.
labelled = df[df['sentiment'] != 'Error']

# Fix both the column order and the colour of every mood. Binding colours by
# position alone would shift them whenever a mood is absent from the data
# (e.g. Neutral drawn in green when there are no Hope/Positive posts).
mood_palette = {'Despair/Negative': 'red', 'Hope/Positive': 'green', 'Neutral': 'gray'}
mood_counts = (
    labelled.groupby(['source', 'mood']).size().unstack(fill_value=0)
    .reindex(columns=list(mood_palette), fill_value=0)
    .rename_axis(columns='mood')
)
mood_counts.plot(kind='bar', color=list(mood_palette.values()), figsize=(10, 6))
plt.title("Mood Analysis by Channel")
plt.ylabel("Count")
plt.savefig(os.path.join(RESULTS_DIR, "mood_bar.png"))
plt.show()