In [None]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): the cells visible in this notebook read and write under /content/,
# not /content/drive — confirm the mount is still required.
drive.mount('/content/drive')

# Removing English text, duplicates, and extra whitespace

In [4]:
import re
from google.colab import files

def is_telugu(text):
    """Return ``text`` reduced to its Telugu characters, with tidy spacing.

    Despite the predicate-style name, the result is a string, not a bool:
    every character outside the Telugu Unicode block (U+0C00-U+0C7F) that is
    not whitespace is deleted. Callers can still use the result in a boolean
    context, because it is the empty string when nothing Telugu remains.

    Deleting non-Telugu words leaves runs of whitespace where they used to
    be, so each run is collapsed to a single space and the ends are trimmed.
    This keeps lines that differ only in spacing from being treated as
    distinct by the de-duplication in ``clean_text_lines``.

    Args:
        text: Input string, normally a single line.

    Returns:
        The cleaned string, or ``""`` if no Telugu characters were present.
    """
    telugu_only = re.sub(r'[^\u0C00-\u0C7F\s]', '', text)
    # str.split() with no arguments splits on whitespace runs and drops
    # leading/trailing whitespace, so the join normalises all spacing.
    return ' '.join(telugu_only.split())

def clean_text_lines(text):
    """Clean ``text`` line by line and drop empty or repeated results.

    The input is split on ``'\\n'`` and each piece is passed through
    ``is_telugu``. Pieces whose cleaned form is empty are discarded, and
    only the first occurrence of each cleaned line is kept, in its original
    position.

    Args:
        text: The full text to clean.

    Returns:
        The surviving cleaned lines joined with ``'\\n'``.
    """
    # dict keys keep insertion order and ignore repeats, which gives an
    # ordered "first occurrence wins" de-duplication in one pass.
    first_occurrences = dict.fromkeys(
        cleaned
        for cleaned in map(is_telugu, text.split('\n'))
        if cleaned
    )
    return '\n'.join(first_occurrences)

# Read the entire raw corpus into memory as UTF-8 text.
input_file = "/content/Telugu_text.txt"
with open(input_file, "r", encoding="utf-8") as f:
    raw_text = f.read()

# Strip non-Telugu characters line by line and drop empty / repeated lines.
cleaned_text = clean_text_lines(raw_text)

# Persist the cleaned corpus; the labeling cell further down reads this same path.
output_file = "/content/cleaned_telugu_only.txt"
with open(output_file, "w", encoding="utf-8") as f:
    f.write(cleaned_text)

# Download the cleaned file
files.download(output_file)

print("Cleaning complete")


# Labeling the dataset with sentiments using XLM-RoBERTa


In [None]:
!pip install -q transformers torch tqdm

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax
import pandas as pd
import torch
from tqdm import tqdm

# Use the GPU when torch reports CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load model & tokenizer
# Both are loaded from the same checkpoint name, and the model is moved to `device`.
model_name = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
# Put the model in evaluation mode; the later cells only run it under torch.no_grad().
model.eval()

In [7]:
# Labels
# get_sentiments_batch indexes this list with the argmax over the model's
# output columns, so position i must hold the name of output class i.
# NOTE(review): the order presumably mirrors the checkpoint's id2label mapping —
# confirm against model.config.id2label.
labels = ['Negative', 'Neutral', 'Positive']

def get_sentiments_batch(texts, max_length=128):
    """Predict one sentiment label for each string in ``texts``.

    The batch is tokenized with padding and truncation to ``max_length``,
    moved to the module-level ``device``, and run through the module-level
    ``model`` with gradient tracking disabled. The logits are converted to
    probabilities with a row-wise softmax, and the highest-probability
    column selects the name from the module-level ``labels`` list.

    Args:
        texts: The strings to classify, passed to the tokenizer as a batch.
        max_length: Truncation length handed to the tokenizer.

    Returns:
        A list of label names, one per input text, in input order.
    """
    tokenized = tokenizer(
        texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_length,
    )

    # Every tensor the tokenizer produced has to live on the model's device.
    model_inputs = {}
    for name, tensor in tokenized.items():
        model_inputs[name] = tensor.to(device)

    with torch.no_grad():
        logits = model(**model_inputs).logits

    # scipy's softmax works on NumPy arrays, so bring the logits back to the CPU first.
    class_probs = softmax(logits.cpu().numpy(), axis=1)
    return [labels[row.argmax()] for row in class_probs]

# Load the cleaned corpus, keeping only non-blank lines (each stripped of
# surrounding whitespace).
input_file = "/content/cleaned_telugu_only.txt"
with open(input_file, "r", encoding="utf-8") as f:
    lines = [line.strip() for line in f if line.strip()]

# Batch processing
batch_size = 64
results = []

# `i` is the offset of the batch's first line within `lines`, not a batch counter.
for i in tqdm(range(0, len(lines), batch_size)):
    batch = lines[i:i + batch_size]
    try:
        sentiments = get_sentiments_batch(batch)
        results.extend(zip(batch, sentiments))
    except Exception as e:
        # Best effort: report the failure and keep going. Every line of the
        # failed batch is recorded with the placeholder label 'Error', so the
        # CSV written below can contain a fourth Sentiment value.
        print(f"Error in batch {i}: {e}")
        results.extend(zip(batch, ['Error'] * len(batch)))

# One row per input line: the text and its predicted (or 'Error') label.
df = pd.DataFrame(results, columns=["Text", "Sentiment"])
output_file = "/content/telugu_sentiment_xlmroberta.csv"
# utf-8-sig prepends a byte-order mark to the file.
df.to_csv(output_file, index=False, encoding="utf-8-sig")

from google.colab import files
files.download(output_file)

print("Sentiment Analysis Complete")

# Balancing the dataset

In [8]:
import pandas as pd
from sklearn.utils import shuffle

def create_balanced_dataset(csv_path, output_path='balanced_dataset2.csv'):
    """Down-sample a labelled CSV so the three sentiment classes are equal in size.

    Reads ``csv_path``, prints the raw counts of its ``Sentiment`` column,
    then draws the same number of rows from each of ``'Positive'``,
    ``'Negative'`` and ``'Neutral'``. The sampled rows are shuffled and
    written to ``output_path`` without the index. All random steps use a
    fixed seed of 42.

    Only the three expected classes decide the per-class sample size. Rows
    carrying any other label (for example the ``'Error'`` placeholder that
    the labeling step writes for failed batches) are left out of the result
    and no longer shrink it.

    Args:
        csv_path: Path of the labelled CSV; it must have a ``Sentiment`` column.
        output_path: Destination path for the balanced CSV.

    Raises:
        ValueError: If one of the three expected classes has no rows.
    """
    expected_classes = ['Positive', 'Negative', 'Neutral']

    # Load the dataset
    df = pd.read_csv(csv_path)

    # Report every label present, including unexpected ones, for visibility.
    sentiment_counts = df['Sentiment'].value_counts()
    print("Original Sentiment Counts:\n", sentiment_counts)

    # Size each class by the smallest of the *expected* classes only; a class
    # that is absent from the data shows up here with a count of 0.
    class_counts = sentiment_counts.reindex(expected_classes, fill_value=0)
    missing = [name for name in expected_classes if class_counts[name] == 0]
    if missing:
        raise ValueError(
            f"Cannot balance dataset: no rows labelled {missing} in '{csv_path}'."
        )
    min_count = int(class_counts.min())

    # Sample an equal number of rows from each sentiment
    balanced_df = pd.concat([
        df[df['Sentiment'] == name].sample(min_count, random_state=42)
        for name in expected_classes
    ])

    # Shuffle so the classes are interleaved rather than stored in three blocks
    balanced_df = shuffle(balanced_df, random_state=42)

    # Save the balanced dataset
    balanced_df.to_csv(output_path, index=False)
    print(f"Balanced dataset saved to '{output_path}' with {len(balanced_df)} records.")

# Balance the CSV written by the labeling cell; the result is saved to the
# function's default output path, 'balanced_dataset2.csv'.
create_balanced_dataset("/content/telugu_sentiment_xlmroberta.csv")


Original Sentiment Counts:
 Sentiment
Neutral     121397
Negative     14770
Positive     10557
Name: count, dtype: int64
Balanced dataset saved to 'balanced_dataset2.csv' with 31671 records.