In [1]:
import os
from indicnlp import common
from indicnlp.tokenize import indic_tokenize

# Set up resource path
INDIC_NLP_RESOURCES = "D:/A Internship Documents/Project G/indic_nlp_resources"
common.set_resources_path(INDIC_NLP_RESOURCES)

# Tokenize
text = "नेपालको राजधानी काठमाडौं हो, जहाँ धेरै सांस्कृतिक स्थलहरू छन्। विद्यार्थीहरूले पुस्तकहरू पढ्न र अनुसन्धान गर्न मन पराउँछन्।"
tokens = list(indic_tokenize.trivial_tokenize(text, lang='ne'))
print(tokens)

['नेपालको', 'राजधानी', 'काठमाडौं', 'हो', ',', 'जहाँ', 'धेरै', 'सांस्कृतिक', 'स्थलहरू', 'छन्', '।', 'विद्यार्थीहरूले', 'पुस्तकहरू', 'पढ्न', 'र', 'अनुसन्धान', 'गर्न', 'मन', 'पराउँछन्', '।']


In [1]:
from dotenv import load_dotenv
import os
import google.generativeai as genai

load_dotenv()  # Load from .env

try:
    genai.configure(api_key=os.environ["GEMINI_API_KEY"])
except KeyError:
    print("Error: The 'GEMINI_API_KEY' environment variable is not set.")
    print("Please set your API key before running the script.")
    exit()


  from .autonotebook import tqdm as notebook_tqdm


In [9]:
import os
import json
import time
import random
from dotenv import load_dotenv
from google import genai
from google.genai import types

# Load environment variables
load_dotenv()
api_key = os.getenv("GEMINI_API_KEY")
if not api_key:
    raise ValueError("GEMINI_API_KEY not set in .env file")

# Initialize the Gemini client
client = genai.Client(api_key=api_key)

# Define the model to use
model_name = "models/gemini-1.5-flash"  # Using Flash model to avoid rate limit issues

# Define categories for classification
categories = ["राजनीति", "मनोरञ्जन", "खेलकुद", "अपराध", "स्वास्थ्य", "विज्ञान"]

# Path to the JSON file
json_file_path = r"D:\.A Internship Documents\Project G\cleaned_data\cleaned_articles_20250916_173228.json"

# Function to classify text
def classify_text_with_backoff(text, labels, max_retries=5):
    labels_str = ", ".join(labels)
    prompt = f"Classify the following Nepali text into one of these categories: {labels_str}.\n\nText: \"{text}\"\n\nCategory:"
    
    for attempt in range(max_retries):
        try:
            response = client.models.generate_content(
                model=model_name,
                contents=prompt,
                config=types.GenerateContentConfig(
                    temperature=0.0,
                    max_output_tokens=50
                )
            )
            return response.text.strip()
        except Exception as e:
            if "503" in str(e) or "model is overloaded" in str(e).lower():
                wait_time = (2 ** attempt) + random.uniform(0, 1)  # Exponential backoff with jitter
                print(f"Model overloaded. Retrying in {wait_time:.2f} seconds...")
                time.sleep(wait_time)
            else:
                raise
    return "Error: Max retries reached due to model overload."

# Read and process the JSON file
try:
    with open(json_file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    
    articles = data.get("articles", [])
    if not articles:
        print("No articles found in the JSON file.")
    
    for article in articles:
        title = article.get("original_title")
        if title:
            category = classify_text_with_backoff(title, categories)
            print(f"Title: {title}\nPredicted Category: {category}\n")
        else:
            print("No title found for an article.")
except FileNotFoundError:
    print(f"File not found: {json_file_path}")
except json.JSONDecodeError:
    print("Error decoding JSON from the file.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")


Title: भदौ २३ र २४ मा भएका घटनामा प्रहरीले पाइसक्यो '२५ हजार बढी प्रमाण', 'अपराधलाई राजनीतिक अर्थ नलगाइयोस्'
Predicted Category: अपराध

Model overloaded. Retrying in 1.09 seconds...
Title: तोकिएकै मितिमा चुनाव गराउन कति सम्भव? निर्वाचन आयोगका चुनौतीहरूको लेखाजोखा
Predicted Category: राजनीति

Model overloaded. Retrying in 1.62 seconds...
Title: 'जेन जी' र 'नेपो बेबिज' को हुन्, नेपालमा दुई दिनमै कसरी उथलपुथल भयो
Predicted Category: राजनीति

Model overloaded. Retrying in 1.23 seconds...
Title: जनार्दनले मागे प्रचण्डको राजीनामा
Predicted Category: राजनीति

Title: दलाई लामापछि केन्द्रीय तिब्बती प्रशासन प्रमुखको प्रधानमन्त्री कार्कीलाई बधाई
Predicted Category: राजनीति

Model overloaded. Retrying in 1.71 seconds...
Model overloaded. Retrying in 2.44 seconds...
Model overloaded. Retrying in 4.41 seconds...
Model overloaded. Retrying in 8.79 seconds...
Title: 'फरएभर केमिकल्स': ठूलो वातावरणीय सङ्कट ल्याउने यी हानीकारक रसायन के हुन्?  
Predicted Category: विज्ञान

Title: तत्काल विशेष महाधिवेशन र 

In [2]:
import os
import json
import time
import random
from dotenv import load_dotenv
from google import genai
from google.genai import types

# Load environment variables
load_dotenv()
api_key = os.getenv("GEMINI_API_KEY")
if not api_key:
    raise ValueError("GEMINI_API_KEY not set in .env file")

# Initialize the Gemini client
client = genai.Client(api_key=api_key)

# Define the model to use
model_name = "models/gemini-1.5-flash"  # Flash model for lower latency

# Define sentiment labels
sentiment_labels = ["Positive", "Negative", "Neutral"]

# Path to the JSON file
json_file_path = r"D:\.A Internship Documents\Project G\cleaned_data\cleaned_articles_20250916_173228.json"

# Function to perform sentiment analysis with retry
def analyze_sentiment_with_backoff(text, labels, max_retries=5):
    labels_str = ", ".join(labels)
    prompt = f"Determine the sentiment of the following Nepali text. Choose one of: {labels_str}.\n\nText: \"{text}\"\n\nSentiment:"
    
    for attempt in range(max_retries):
        try:
            response = client.models.generate_content(
                model=model_name,
                contents=prompt,
                config=types.GenerateContentConfig(
                    temperature=0.0,
                    max_output_tokens=50
                )
            )
            return response.text.strip()
        except Exception as e:
            if "503" in str(e) or "model is overloaded" in str(e).lower():
                wait_time = (2 ** attempt) + random.uniform(0, 1)
                print(f"Model overloaded. Retrying in {wait_time:.2f} seconds...")
                time.sleep(wait_time)
            else:
                raise
    return "Error: Max retries reached due to model overload."

# Read and process the JSON file
try:
    with open(json_file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    
    articles = data.get("articles", [])
    if not articles:
        print("No articles found in the JSON file.")
    
    for article in articles:
        text = article.get("original_title") or article.get("content")  # Prefer content if available
        if text:
            sentiment = analyze_sentiment_with_backoff(text, sentiment_labels)
            print(f"Text: {text}\nPredicted Sentiment: {sentiment}\n")
        else:
            print("No text found for an article.")
except FileNotFoundError:
    print(f"File not found: {json_file_path}")
except json.JSONDecodeError:
    print("Error decoding JSON from the file.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")


Text: भदौ २३ र २४ मा भएका घटनामा प्रहरीले पाइसक्यो '२५ हजार बढी प्रमाण', 'अपराधलाई राजनीतिक अर्थ नलगाइयोस्'
Predicted Sentiment: Neutral.

The text reports on events and a police investigation, presenting facts without expressing a positive or negative opinion.  The quote urging against politicizing the crime is a statement of neutrality, not a positive or negative sentiment itself.

Text: तोकिएकै मितिमा चुनाव गराउन कति सम्भव? निर्वाचन आयोगका चुनौतीहरूको लेखाजोखा
Predicted Sentiment: Neutral.

The text is a news headline or title, posing a question about the feasibility of holding elections on the scheduled date.  It doesn't express a positive or negative opinion, but rather presents a factual situation and a potential problem.

Text: 'जेन जी' र 'नेपो बेबिज' को हुन्, नेपालमा दुई दिनमै कसरी उथलपुथल भयो
Predicted Sentiment: Neutral.

The text is a news headline-style question, not expressing an opinion.  It simply asks about an event ("Jen Z" and "Nepo Babies" causing upheaval in Nepal).