# Telegram Sentiment Analysis using ParsBERT

This notebook is designed to perform sentiment analysis on Persian texts extracted from Telegram channels. It includes steps for data loading, preprocessing, sentiment prediction using a pre-trained model, temporal analysis, and saving the results.

### 1. Install Required Libraries

In [None]:
# ==============================================================
# 1. Install Required Libraries (Patched & Unified Version)
# ==============================================================
# Purpose: prepare the runtime. Build tools are upgraded, 'hazm' is cloned and
# two constraints in its pyproject.toml are relaxed, every dependency is then
# installed in a single pip call, and three imports are smoke-tested.
# This block fixes compatibility issues with 'hazm' and ensures a stable installation.

print("‚è≥ Step 1: Upgrading Python build tools...")
# First, upgrade pip, setuptools, and wheel to prevent build errors.
!pip install --upgrade -q pip setuptools wheel

print("‚è≥ Step 2: Cloning and patching 'hazm' for compatibility...")
# The standard 'hazm' library has compatibility issues with recent Python versions in Colab.
# We clone it and patch its configuration file to remove restrictive version constraints.
# NOTE(review): `&> /dev/null` discards stdout and stderr, so a failed clone
# is not reported here. The clone target is relative to the working directory
# while the sed/pip lines use /content/hazm — assumes the cwd is /content; confirm.
!git clone https://github.com/sobhe/hazm.git &> /dev/null
# Remove the upper bound on the supported Python version.
!sed -i 's/python = ">=3.8, <3.12"/python = ">=3.8"/' /content/hazm/pyproject.toml
# Turn the exact numpy pin into a lower bound.
!sed -i 's/numpy = "==1.24.3"/numpy = ">=1.24.3"/' /content/hazm/pyproject.toml

print("‚è≥ Step 3: Installing all required libraries...")
# Install the patched local version of hazm along with all other libraries in a single command.
# This allows pip to resolve all dependencies correctly.
# Only transformers carries a version pin; the other packages are unpinned.
!pip install -q /content/hazm/ \
    transformers==4.36.2 \
    datasets \
    evaluate \
    accelerate \
    scikit-learn \
    pandas \
    matplotlib \
    seaborn \
    emoji \
    torch \
    huggingface_hub

# =========================
# 4. Verify Installation
# =========================
# This step tries to import the libraries to confirm the installation was successful.
# An ImportError is reported but not re-raised, so the cell itself still
# finishes without an exception when an import fails.
try:
    import hazm
    import emoji
    import transformers
    print("\n‚úÖ All libraries installed and imported successfully!")
except ImportError as e:
    print(f"\n‚ùå Installation failed. An essential library could not be imported: {e}")
    print("Please restart the runtime (Runtime > Restart runtime) and try running this cell again.")

### 2. Import Libraries

In [None]:
# =========================
# 2. IMPORT LIBRARIES
# =========================
import os, re, json, glob, emoji
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime
from hazm import Normalizer
from transformers import pipeline
from datasets import Dataset
import evaluate
from sklearn.model_selection import train_test_split
from google.colab import drive

print("‚úÖ Core libraries imported successfully.")

### 3. Mount Google Drive and Define Paths

In [None]:
# =========================
# 3. MOUNT DRIVE
# =========================
# Locations used by the rest of the notebook.
# IMPORTANT: point DATA_FOLDER at the Google Drive folder holding your CSV files.
DATA_FOLDER = "/content/drive/MyDrive/telegram-sentiment-analysis-fa/data"
RESULTS_FOLDER = "/content/drive/MyDrive/telegram-sentiment-analysis-fa/results"

# Attach Google Drive under /content/drive so the folders above are reachable.
drive.mount('/content/drive')

# Make sure the output folder exists (no error if it is already there).
os.makedirs(RESULTS_FOLDER, exist_ok=True)

print(f"Data will be read from: {DATA_FOLDER}")
print(f"Results will be saved to: {RESULTS_FOLDER}")

### 4. Load and Merge Data

In [None]:
# =======================================
# 4. LOAD AND MERGE TELEGRAM CSV FILES
# =======================================
# Reads every "*.csv" directly inside DATA_FOLDER, tags each row with its
# channel (file name with "_messages.csv" removed) and its source file name,
# and concatenates everything into the single dataframe `df` that the
# following cells work on.
print(f"üìÇ Reading CSV files from: {DATA_FOLDER}")
# glob does not guarantee any ordering; sorting keeps the row order of the
# merged dataframe reproducible from run to run.
csv_files = sorted(glob.glob(os.path.join(DATA_FOLDER, "*.csv")))
dfs = []

if not csv_files:
    # Stop here instead of only printing: without input files `df` is never
    # created and the next cell would fail with an unrelated NameError.
    raise SystemExit("‚ö†Ô∏è No CSV files found in the specified path. Please check the DATA_FOLDER variable.")

for f in csv_files:
    try:
        temp_df = pd.read_csv(f)
        # Extract channel name from the filename
        channel_name = os.path.basename(f).replace("_messages.csv", "")
        temp_df['channel'] = channel_name
        temp_df['source_file'] = os.path.basename(f)
        dfs.append(temp_df)
    except Exception as e:
        # Best effort: report the unreadable file and keep loading the others.
        print(f"Error reading file {f}: {e}")

if not dfs:
    # Files were found but none of them could be read.
    raise SystemExit("‚ùå No dataframes were loaded. Halting execution.")

df = pd.concat(dfs, ignore_index=True)
print(f"üìä Final dataframe created with {len(df)} rows.")
display(df.head())

### 5. Text Preprocessing

In [None]:
# =============================
# 5. PERSIAN TEXT PREPROCESSING
# =============================
# Hazm normalizer instance, created once at cell level; preprocess_persian()
# calls its normalize() method for every text it cleans.
normalizer = Normalizer()

def preprocess_persian(text):
    """Clean one raw post so it can be fed to the sentiment model.

    Steps, in order: strip HTML tags and URLs, normalize with the shared Hazm
    ``normalizer``, replace emojis with spaces, replace every character that is
    not a word character, whitespace, ZWNJ, hamza-above or hyphen with a space,
    then collapse whitespace runs.

    Args:
        text: Raw post text. Any non-string value (for example a missing value
            read from a CSV) is treated as "no text".

    Returns:
        str: The cleaned text, or "" when the input is not a string or nothing
        is left after cleaning.
    """
    if not isinstance(text, str):
        return ""
    # Remove HTML tags and links
    text = re.sub(r'<[^>]+>', ' ', text)
    text = re.sub(r'http\S+|www\.\S+', ' ', text)
    # Normalize with Hazm
    text = normalizer.normalize(text)
    # Remove emojis
    text = emoji.replace_emoji(text, replace=' ')
    # Remove unnecessary characters. For a str pattern \w is Unicode-aware, so
    # it already covers Persian letters and digits. The two characters it does
    # not cover are written as escapes so the pattern cannot be damaged by a
    # file-encoding mix-up: \u0654 (hamza above) and \u200c (ZWNJ, which joins
    # the parts of a Persian word). The trailing '-' keeps literal hyphens.
    text = re.sub(r'[^\w\s\u0654\u200c-]', ' ', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

print("üßπ Preprocessing the 'post_text' column...")
df['clean_post_text'] = df['post_text'].apply(preprocess_persian)

# Keep only rows that still have text after cleaning. A single mask replaces
# the previous in-place dropna followed by a second filter; notna() mirrors
# that dropna. The explicit copy makes `df` an independent dataframe, so the
# columns added by later cells are never written to a slice of the unfiltered
# frame.
has_text = df['clean_post_text'].notna() & (df['clean_post_text'] != '')
df = df[has_text].copy()

print("Sample of original vs. cleaned text:")
display(df[['post_text', 'clean_post_text']].head())

### 6. Sentiment Analysis

In [None]:
# =====================================================
# 6. DEFINE MODEL AND LABELS FOR SENTIMENT ANALYSIS
# =====================================================
# Sentiment is predicted with a pre-trained Hugging Face checkpoint.
# NOTE(review): the checkpoint name ends in "-binary", so it presumably emits
# only two classes; "neutral" would then only come from the fallback branch of
# predict_sentiment() — confirm against the model card.
SENT_MODEL_NAME = "HooshvareLab/bert-fa-base-uncased-sentiment-deepsentipers-binary"

# Device index handed to the pipeline: 0 when CUDA is available (GPU), -1 otherwise (CPU).
if torch.cuda.is_available():
    device = 0
else:
    device = -1

sentiment_pipeline = pipeline("sentiment-analysis", model=SENT_MODEL_NAME, device=device)

# Final sentiment vocabulary and lookup tables in both directions.
label_list = ["happy", "sad", "neutral"]
id2label = dict(enumerate(label_list))
label2id = {name: idx for idx, name in id2label.items()}

def _map_model_label(raw_label):
    """Translate a raw model label into 'happy', 'sad' or 'neutral'."""
    lowered = raw_label.lower()
    if 'pos' in lowered:
        return 'happy'
    if 'neg' in lowered:
        return 'sad'
    # Anything that is neither positive nor negative falls back to neutral.
    return 'neutral'


def predict_sentiment(texts, batch_size=64):
    """Predict one sentiment label per input text.

    Args:
        texts: The texts to classify. A single string is treated as one text;
            any other iterable of strings (list, tuple, pandas Series, ...) is
            materialised into a list first.
        batch_size: Number of texts sent through the model per batch. Lower it
            when memory runs out. Defaults to 64, the value previously
            hard-coded here.

    Returns:
        list[str]: 'happy', 'sad' or 'neutral' for each input text, in input
        order. An empty input yields an empty list without calling the model.
    """
    if isinstance(texts, str):
        # list("abc") would split the string into characters.
        texts = [texts]
    else:
        texts = list(texts)
    if not texts:
        return []
    # The pipeline batches internally; truncation lets over-long inputs be cut
    # rather than rejected.
    predictions = sentiment_pipeline(texts, batch_size=batch_size, truncation=True)
    return [_map_model_label(out['label']) for out in predictions]

print(f"üß† Sentiment model loaded on device: {'GPU' if device == 0 else 'CPU'}. Predicting sentiment for all messages...")
# Classify the cleaned text; one label comes back per row, in row order.
texts_to_analyze = list(df['clean_post_text'])
predicted_labels = predict_sentiment(texts_to_analyze)
df['pred_label'] = predicted_labels
# Numeric id of each label, looked up in label2id.
df['pred_label_id'] = df['pred_label'].map(label2id)

print("\n‚úÖ Sentiment analysis completed successfully.")
print("\nDistribution of predicted sentiments:")
sentiment_counts = df['pred_label'].value_counts()
print(sentiment_counts)

print("\nSample of predictions:")
preview_columns = ['clean_post_text', 'pred_label']
display(df[preview_columns].head())

### 7. Temporal Analysis

In [None]:
# =========================
# 7. TEMPORAL ANALYSIS
# =========================
# Plots how many posts per sentiment were published each month (line chart)
# and each year (stacked bar chart).
print("üìà Preparing for temporal analysis...")
# Parse the timestamp column; unparseable values become NaT and those rows are removed.
df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')
df.dropna(subset=['timestamp'], inplace=True)

if df.empty:
    print("‚ö†Ô∏è No valid timestamps found for temporal analysis.")
else:
    df['year'] = df['timestamp'].dt.year
    df['month'] = df['timestamp'].dt.to_period('M')

    # Monthly counts: rows are months (chronological), columns are sentiments.
    monthly_trends = (
        df.groupby(['month', 'pred_label'])
        .size()
        .unstack(fill_value=0)
        .sort_index()
    )
    month_axis = monthly_trends.index.to_timestamp()

    plt.style.use('seaborn-v0_8-whitegrid')
    fig, ax = plt.subplots(figsize=(15, 7))
    # Draw the lines in label_list order, skipping sentiments that never occur.
    for label in label_list:
        if label not in monthly_trends.columns:
            continue
        ax.plot(month_axis, monthly_trends[label], label=label, marker='o', linestyle='-')
    ax.legend()
    ax.set_title("Monthly Sentiment Trends in Telegram Posts")
    ax.set_xlabel("Month")
    ax.set_ylabel("Number of Posts")
    plt.show()

    # Yearly counts as a stacked bar chart.
    yearly_dist = df.groupby(['year', 'pred_label']).size().unstack(fill_value=0)
    yearly_ax = yearly_dist.plot(kind='bar', stacked=True, figsize=(12, 7))
    yearly_ax.set_title("Yearly Sentiment Distribution")
    yearly_ax.set_xlabel("Year")
    yearly_ax.set_ylabel("Number of Posts")
    plt.xticks(rotation=45)
    plt.show()

### 8. Save Results

In [None]:
# =========================
# 8. SAVE RESULTS
# =========================
# Writes the annotated dataframe twice: every column as CSV, and a
# four-column subset as line-delimited JSON records.
output_csv_path = os.path.join(RESULTS_FOLDER, "telegram_posts_with_sentiment.csv")
output_json_path = os.path.join(RESULTS_FOLDER, "telegram_posts_with_sentiment.json")
json_columns = ['clean_post_text', 'pred_label', 'timestamp', 'channel']

print(f"üíæ Saving results to: {RESULTS_FOLDER}")

# CSV: 'utf-8-sig' encoding so the Persian text is stored correctly.
df.to_csv(output_csv_path, index=False, encoding='utf-8-sig')

# JSON: one record per line, non-ASCII characters written as-is, ISO dates.
df_for_json = df[json_columns]
df_for_json.to_json(
    output_json_path,
    orient='records',
    lines=True,
    force_ascii=False,
    date_format='iso',
)

print(f"\nüéâ Process finished! Results have been saved to the following files:")
print(f"- {output_csv_path}")
print(f"- {output_json_path}")