In [1]:
import os
from indicnlp import common
from indicnlp.tokenize import indic_tokenize

# Set up resource path.
# The resources folder lives in a different place on every machine, so honour
# the INDIC_RESOURCES_PATH environment variable when it is set and only fall
# back to the original local path otherwise.
# NOTE(review): INDIC_RESOURCES_PATH is the variable name used in the Indic NLP
# Library setup instructions -- confirm against the installed version.
INDIC_NLP_RESOURCES = os.environ.get(
    "INDIC_RESOURCES_PATH",
    "D:/A Internship Documents/Project G/indic_nlp_resources",
)
common.set_resources_path(INDIC_NLP_RESOURCES)

# Tokenize a two-sentence Nepali sample and show the resulting token list.
text = "नेपालको राजधानी काठमाडौं हो, जहाँ धेरै सांस्कृतिक स्थलहरू छन्। विद्यार्थीहरूले पुस्तकहरू पढ्न र अनुसन्धान गर्न मन पराउँछन्।"
tokens = list(indic_tokenize.trivial_tokenize(text, lang='ne'))
print(tokens)

['नेपालको', 'राजधानी', 'काठमाडौं', 'हो', ',', 'जहाँ', 'धेरै', 'सांस्कृतिक', 'स्थलहरू', 'छन्', '।', 'विद्यार्थीहरूले', 'पुस्तकहरू', 'पढ्न', 'र', 'अनुसन्धान', 'गर्न', 'मन', 'पराउँछन्', '।']


In [1]:
from dotenv import load_dotenv
import os
import google.generativeai as genai

load_dotenv()  # Load from .env

# Configure the Gemini SDK from the environment; the key is never hard-coded.
try:
    genai.configure(api_key=os.environ["GEMINI_API_KEY"])
except KeyError:
    print("Error: The 'GEMINI_API_KEY' environment variable is not set.")
    print("Please set your API key before running the script.")
    # Do not call exit(): it is the interactive-shell helper. Inside a Jupyter
    # kernel it shuts the kernel down, and in a plain script it reports exit
    # status 0 even though this is a failure. Raising SystemExit stops the
    # cell/script with a non-zero status instead.
    raise SystemExit(1) from None


  from .autonotebook import tqdm as notebook_tqdm


Model: models/embedding-gecko-001 has methods: ['embedText', 'countTextTokens']
Model: models/gemini-1.5-pro-latest has methods: ['generateContent', 'countTokens']
Model: models/gemini-1.5-pro-002 has methods: ['generateContent', 'countTokens', 'createCachedContent']
Model: models/gemini-1.5-pro has methods: ['generateContent', 'countTokens']
Model: models/gemini-1.5-flash-latest has methods: ['generateContent', 'countTokens']
Model: models/gemini-1.5-flash has methods: ['generateContent', 'countTokens']
Model: models/gemini-1.5-flash-002 has methods: ['generateContent', 'countTokens', 'createCachedContent']
Model: models/gemini-1.5-flash-8b has methods: ['createCachedContent', 'generateContent', 'countTokens']
Model: models/gemini-1.5-flash-8b-001 has methods: ['createCachedContent', 'generateContent', 'countTokens']
Model: models/gemini-1.5-flash-8b-latest has methods: ['createCachedContent', 'generateContent', 'countTokens']
Model: models/gemini-2.5-pro-preview-03-25 has methods: ['

In [None]:
import os
import json
from dotenv import load_dotenv

from google import genai
from google.genai import types

def zero_shot_classification(client, text_to_classify, labels, model_name):
    """Ask a generative model to pick one category for a piece of Nepali text.

    Args:
        client: Object exposing ``models.generate_content(model=..., contents=...,
            config=...)``; in this notebook it is a ``genai.Client``.
        text_to_classify: Text to classify; it is embedded verbatim, in double
            quotes, inside the prompt.
        labels: Iterable of category names (strings) offered to the model.
        model_name: Model identifier forwarded as the ``model`` argument.

    Returns:
        The model's reply with surrounding whitespace stripped. The reply is not
        validated against ``labels``. This function never raises: on any failure
        it returns a string that starts with
        ``"Error in zero_shot_classification:"``.
    """
    labels_str = ", ".join(labels)
    prompt = (
        f"Classify the following Nepali text into one of these categories: {labels_str}.\n\n"
        f"Text: \"{text_to_classify}\"\n\n"
        "Category:"
    )
    try:
        resp = client.models.generate_content(
            model=model_name,
            contents=prompt,
            config=types.GenerateContentConfig(
                temperature=0.0,           # deterministic output for classification
                max_output_tokens=50       # a category name is short; adjust as needed
            )
        )
        reply = resp.text
        # The response may carry no text at all (e.g. a blocked or truncated
        # generation). Report that explicitly instead of letting ``.strip()``
        # fail with a misleading "'NoneType' object has no attribute" error.
        if reply is None:
            return "Error in zero_shot_classification: EmptyResponse: model returned no text"
        return reply.strip()
    except Exception as e:
        # Best-effort by design: the caller prints whatever string comes back.
        return f"Error in zero_shot_classification: {type(e).__name__}: {e}"

def process_and_classify_files(client, folder_path, categories, model_name):
    """Classify the title of every article stored in a folder's JSON files.

    Every file directly inside ``folder_path`` whose name ends in ``.json`` is
    read as UTF-8 JSON. A usable file holds an object with an ``"articles"``
    list; each article is an object with an ``"original_title"`` and,
    optionally, an ``"original_id"``. Each non-empty title is passed to
    ``zero_shot_classification`` and the result is printed.

    Files are visited in sorted name order so repeated runs produce the same
    output. Anything malformed (unreadable file, invalid JSON, wrong top-level
    type, non-object article, missing title) is reported with ``print`` and
    skipped; it never aborts the remaining files.

    Args:
        client: Forwarded unchanged to ``zero_shot_classification``.
        folder_path: Directory containing the JSON files.
        categories: Category labels forwarded to ``zero_shot_classification``.
        model_name: Model identifier forwarded to ``zero_shot_classification``.

    Returns:
        None. All results are written to standard output.
    """
    if not os.path.isdir(folder_path):
        print(f"Error: Folder not found: {folder_path}")
        return

    # os.listdir() returns names in arbitrary order; sort for reproducible runs.
    file_list = sorted(f for f in os.listdir(folder_path) if f.endswith(".json"))
    if not file_list:
        print(f"No JSON files found in the folder: {folder_path}")
        return

    print(f"Found {len(file_list)} JSON files to process.")

    for file_name in file_list:
        file_path = os.path.join(folder_path, file_name)
        print(f"\n--- Processing file: {file_name} ---")
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)
        except json.JSONDecodeError as e:
            print(f"Error: Failed to decode JSON from {file_name}: {e}")
            continue
        except Exception as e:
            print(f"Error: Could not read file {file_name}: {type(e).__name__}: {e}")
            continue

        # Valid JSON may be an array or scalar at the top level, which has no
        # ``.get``; treat that the same as a missing "articles" key.
        articles = data.get("articles") if isinstance(data, dict) else None
        if not isinstance(articles, list):
            print(f"Skipping {file_name}: 'articles' key not found or is not a list.")
            continue

        for article in articles:
            # Guard against stray non-object entries so one bad record does not
            # crash the whole run.
            if not isinstance(article, dict):
                print(
                    f"Skipping malformed article in {file_name}: "
                    f"expected an object, got {type(article).__name__}."
                )
                continue

            original_id = article.get("original_id", "<no_id>")
            original_title = article.get("original_title", "")
            if not original_title:
                print(f"Article ID {original_id}: No title, skipping.")
                continue

            predicted = zero_shot_classification(client, original_title, categories, model_name)
            print(f"Article ID: {original_id}")
            print(f"Title: {original_title}")
            print(f"Predicted Category: {predicted}\n")


def main():
    """Load the API key, build a Gemini client and classify the cleaned articles.

    Reads ``GEMINI_API_KEY`` (after loading ``.env``). When the key is missing,
    prints an error and returns without doing any work.
    """
    load_dotenv()
    gemini_key = os.getenv("GEMINI_API_KEY")
    if gemini_key:
        gemini_client = genai.Client(api_key=gemini_key)

        # Model chosen from the list printed earlier in the notebook; that
        # listing already showed it supports generateContent.
        chosen_model = "models/gemini-1.5-pro-latest"

        # Candidate labels offered to the model, and the folder with the input JSON.
        label_set = ["राजनीति", "मनोरञ्जन", "खेलकुद", "अपराध", "स्वास्थ्य", "विज्ञान"]
        input_dir = "cleaned_data"

        process_and_classify_files(gemini_client, input_dir, label_set, chosen_model)
    else:
        print("Error: GEMINI_API_KEY not set")


# Run the pipeline when executed as a script or as a notebook cell.
if __name__ == "__main__":
    main()


Found 12 JSON files to process.

--- Processing file: cleaned_articles_20250916_173228.json ---
Article ID: article_1
Title: भदौ २३ र २४ मा भएका घटनामा प्रहरीले पाइसक्यो '२५ हजार बढी प्रमाण', 'अपराधलाई राजनीतिक अर्थ नलगाइयोस्'
Predicted Category: Error in zero_shot_classification: ClientError: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.', 'status': 'RESOURCE_EXHAUSTED', 'details': [{'@type': 'type.googleapis.com/google.rpc.QuotaFailure', 'violations': [{'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_input_token_count', 'quotaId': 'GenerateContentInputTokensPerModelPerMinute-FreeTier', 'quotaDimensions': {'location': 'global', 'model': 'gemini-1.5-pro'}}, {'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_requests', 'quotaId': 'GenerateRequestsP