<a href="https://colab.research.google.com/github/Hicham-Yezza/Hicham-Yezza/blob/main/IAEA_Volt_Procssor_V9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IAEA Volt processing script -- Updated with Country and Language Columns
# Hicham Yezza -- Nov 2024

In [None]:
!pip install pandas
!pip install deep-translator
!pip install tqdm
!pip install spacy
!python -m spacy download en_core_web_sm
!pip install nltk

import pandas as pd
from deep_translator import GoogleTranslator
from tqdm import tqdm
from datetime import datetime
from google.colab import files
from concurrent.futures import ThreadPoolExecutor
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter

# Make sure every NLTK data package used below is available locally.
# Each entry pairs the lookup path understood by nltk.data.find() with the
# package name understood by nltk.download(); a package is only downloaded
# when the lookup fails.
for resource_path, package_name in (
    ('tokenizers/punkt', 'punkt'),
    ('corpora/stopwords', 'stopwords'),
    ('tokenizers/punkt_tab', 'punkt_tab'),
):
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package_name)

# Build the English stopword set used when filtering keyword tokens.
# The corpus is re-downloaded once if loading it still fails.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

# Register tqdm with pandas so that Series/DataFrame gain .progress_apply()
tqdm.pandas()

# Prompt the user to upload the mentions files (CSV or Excel).
# The returned mapping is keyed by uploaded file name; it is iterated by the
# processing loop further down.
print("Please upload the CSV or Excel files containing the mentions data.")
uploaded_files = files.upload()

# Prompt the user to upload the channel list sheet
print("Please upload the Excel file containing the channel list data.")
channel_list_file = files.upload()
if not channel_list_file:
    # Nothing to index into below; fail with an explicit message instead of a bare IndexError.
    raise ValueError("No channel list file was uploaded; cannot build the channel lookup.")
# Only the first uploaded file is used as the channel list.
channel_data_path = list(channel_list_file.keys())[0]
channel_data = pd.read_excel(channel_data_path)

# Create a dictionary for fast lookup of country and language based on channel source code:
#   {channel source code: {'Language': ..., 'Country': ...}}
# to_dict('index') raises ValueError when the index contains duplicates, so a
# channel code that is listed more than once keeps only its first row.
channel_mapping = (
    channel_data
    .drop_duplicates(subset='Channel source code', keep='first')
    .set_index('Channel source code')[['Language', 'Country']]
    .to_dict('index')
)

# NOTE: `stop_words` is already initialised by the NLTK setup earlier in this
# cell, so it is not rebuilt here.

# Load spaCy's small English model for NER
nlp = spacy.load("en_core_web_sm")
# Strict (exact-match) mapping from Arabic keyword variants to their English keyword.
# Keys are the Arabic surface forms (including prefixed, plural and misspelled
# variants); values are the normalised English labels. The processing loop applies
# this with Series.map() on the 'Mention' column, so only values that equal a key
# exactly are translated; anything else keeps its original text.
mapping_dict = {
    # Nuclear fuel variations
    "الوقود النووي": "Nuclear fuel",
    "بالوقود النووي": "Nuclear fuel",
    "الوقود النووية": "Nuclear fuel",
    "الوقود": "Nuclear fuel",
    "للوقود": "Nuclear fuel",
    "الوقود المستهلكة": "Nuclear fuel",
    "الوقود والمستهلكات": "Nuclear fuel",

    # Nuclear reactor variations
    "مفاعل نووي": "Nuclear reactor",
    "مفاعل نووية": "Nuclear reactor",
    "المفاعلات النووية": "Nuclear reactor",
    "للمفاعلات النووية": "Nuclear reactor",
    "المفاعل النووي": "Nuclear reactor",
    "المفاعل النووية": "Nuclear reactor",
    "مفاعلات نووية": "Nuclear reactor",
    "مفاعلها النووية": "Nuclear reactor",
    "والمفاعلات النووية": "Nuclear reactor",
    "بالمفاعلات النووية": "Nuclear reactor",
    "للمفاعل النووي": "Nuclear reactor",
    "بالمفاعل النووي": "Nuclear reactor",
    "ومفاعلات نووية": "Nuclear reactor",
    "مفاعلين نوويين": "Nuclear reactor",
    "والمفاعل النووي": "Nuclear reactor",
    "والمفاعل النووية": "Nuclear reactor",
    "مفاعلاتها النووية": "Nuclear reactor",
    "المفاعلات النوويه": "Nuclear reactor",
    "مفاعلها النووي": "Nuclear reactor",
    "مفاعل النووي": "Nuclear reactor",
    "للمفاعل النووية": "Nuclear reactor",
    "مفاعلات النووية": "Nuclear reactor",
    "والمفاعلاتها النووية": "Nuclear reactor",
    "والمفاعلات النوويه": "Nuclear reactor",
    "المفاعل": "Nuclear reactor",

    # Nuclear power plant variations
    "محطة للطاقة النووية": "Nuclear power plant",
    "محطات الطاقة النووية": "Nuclear power plant",
    "محطة طاقة نووية": "Nuclear power plant",
    "محطات طاقة نووية": "Nuclear power plant",
    "محطات الطاقه النوويه": "Nuclear power plant",
    "المحطات الطاقة النووية": "Nuclear power plant",
    "المحطة للطاقة النووية": "Nuclear power plant",
    "محطة الطاقة النووية": "Nuclear power plant",
    "محطه الطاقه النوويه": "Nuclear power plant",
    "محطات للطاقة النووية": "Nuclear power plant",
    "محطة الطاقة": "Nuclear power plant",

    # Uranium variations
    "اليورانيوم": "Uranium",
    "لليورانيوم": "Uranium",
    "باليورانيوم": "Uranium",
    "واليورانيوم": "Uranium",
    "يورانيوم": "Uranium",
    "اليورانيوم اليورانيوم": "Uranium",
    "يورانيوم يورانيوم": "Uranium",
    "اليورانيومي": "Uranium",
    "كاليورانيوم": "Uranium",

    # Plutonium variations
    "البلوتونيوم": "Plutonium",
    "بلوتونيوم": "Plutonium",
    # The only compound label: a phrase naming both elements maps to a combined keyword.
    "البلوتونيوم واليورانيوم": "Plutonium and Uranium",
    "بالبلوتونيوم": "Plutonium",

    # Remaining terms
    "جهاز الطرد المركزي الغازي": "Gas centrifuge",
    "فصل النظائر": "Isotopic separation",
    "إعادة المعالجة": "Reprocessing",
    "الثوريوم": "Thorium",
    "التريتيوم": "Tritium",
    "الماء الثقيل": "Heavy water",
    "الكعكة الصفراء": "Yellowcake",
    "مفاعل صغير نمطي": "Small modular reactor",
    "النووي": "Nuclear",
    "بالثوريوم": "Thorium",
    "نووية": "Nuclear"
}

# Initialize the GoogleTranslator from deep_translator.
# The source language is auto-detected and the target is English; this single
# instance is shared by every translate_snippet() call.
translator = GoogleTranslator(source='auto', target='en')

# Function to translate a single snippet into English (progress reporting is
# handled by the caller via progress_apply, not here)
def translate_snippet(snippet):
    """Translate one transcript snippet to English using the shared translator.

    Args:
        snippet: Text to translate. Missing values (None, NaN or any other
            non-string) and blank strings are accepted.

    Returns:
        str: The translated text; "" when there is nothing to translate; or a
        message of the form "Translation error: <details>" if the translator
        raises.
    """
    # Missing or blank cells carry no text. Return early so they neither cost
    # a network round-trip nor inject an error message into the entity,
    # theme and keyword steps that consume this column.
    if not isinstance(snippet, str) or not snippet.strip():
        return ""
    try:
        return translator.translate(snippet)
    except Exception as e:
        # Best effort: keep processing the remaining rows and record the failure inline.
        return f"Translation error: {str(e)}"

# Named-entity extraction for a single English snippet
def extract_entities(snippet):
    """Run the loaded spaCy pipeline over `snippet` and collect its named entities.

    Returns:
        A list of (entity text, entity label) tuples when at least one entity
        is found, the string "No entities found" when there are none, or a
        string of the form "NER error: <details>" if anything raises.
    """
    try:
        found = []
        for ent in nlp(snippet).ents:
            found.append((ent.text, ent.label_))
        if not found:
            return "No entities found"
        return found
    except Exception as err:
        return f"NER error: {str(err)}"

# Function to extract key themes based on multiple columns
def extract_key_theme(row):
    """Classify a row into one nuclear-related theme by keyword lookup.

    The 'Mention_English', 'English_Snippet' and 'Key_Entities' fields are
    combined, lower-cased and split into words. Themes are checked in priority
    order (military, energy, diplomacy, safety, technology) and the first theme
    with a matching keyword wins.

    A keyword matches when a word *starts with* it, so inflected forms such as
    "weapons" or "attacked" still count, while accidental mid-word hits such as
    'war' inside "forward", "software" or "award" do not.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing the three
            fields named above.

    Returns:
        str: One of 'Nuclear Military', 'Nuclear Energy', 'Nuclear Diplomacy',
        'Nuclear Safety', 'Nuclear Technology' or 'General Nuclear Theme'.
    """
    # Ordered by priority: earlier themes take precedence over later ones.
    theme_keywords = (
        ('Nuclear Military', ('weapon', 'missile', 'military', 'defense', 'war', 'attack', 'strike')),
        ('Nuclear Energy', ('energy', 'reactor', 'power', 'electricity', 'fuel', 'generation')),
        ('Nuclear Diplomacy', ('diplomacy', 'negotiation', 'treaty', 'sanctions', 'agreement', 'peace')),
        ('Nuclear Safety', ('safety', 'accident', 'radiation', 'hazard', 'security', 'protocol')),
        ('Nuclear Technology', ('technology', 'innovation', 'infrastructure', 'development', 'research')),
    )

    combined_text = f"{row['Mention_English']} {row['English_Snippet']} {row['Key_Entities']}".lower()

    # Treat every non-alphanumeric character as a separator so punctuation and
    # the brackets/quotes of a stringified entity list do not stick to words.
    words = set(''.join(ch if ch.isalnum() else ' ' for ch in combined_text).split())

    for theme, keywords in theme_keywords:
        # str.startswith accepts a tuple of candidate prefixes.
        if any(word.startswith(keywords) for word in words):
            return theme
    return 'General Nuclear Theme'

# Function to extract and prioritize keywords, limited to 10 maximum
def extract_keywords(row):
    """Return up to ten keywords for a row, ranked by (boosted) frequency.

    The 'Mention_English', 'English_Snippet' and 'Key_Theme' fields are combined
    with the *texts* of the row's named entities, tokenized, and stripped of
    non-alphanumeric tokens and English stopwords. Single-word ASCII entity
    names then receive a +5 boost so they rank ahead of ordinary words.

    Only the entity texts are used: entity labels (e.g. "GPE", "ORG") and the
    "No entities found" / "NER error: ..." placeholder strings that
    'Key_Entities' may hold are left out, so they cannot appear as keywords.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with the fields
            'Mention_English', 'English_Snippet', 'Key_Entities', 'Key_Theme'.

    Returns:
        list[str]: At most 10 lower-cased keywords, most frequent first.
    """
    # 'Key_Entities' is a list of (text, label) tuples only when entities were
    # found; otherwise it is a placeholder string, which is ignored here.
    entities = row['Key_Entities'] if isinstance(row['Key_Entities'], list) else []
    entity_text = ' '.join(ent[0] for ent in entities)

    # Combine relevant columns, excluding 'Text search' as it contains Arabic
    combined_text = f"{row['Mention_English']} {row['English_Snippet']} {entity_text} {row['Key_Theme']}"

    # Tokenize the combined text; fetch the tokenizer data once if it is missing
    try:
        tokens = word_tokenize(combined_text.lower())
    except LookupError:
        nltk.download('punkt')
        nltk.download('punkt_tab')
        tokens = word_tokenize(combined_text.lower())

    # Drop punctuation-like tokens and stopwords
    filtered_tokens = [word for word in tokens if word.isalnum() and word not in stop_words]

    # Count word frequency in the combined text
    token_counter = Counter(filtered_tokens)

    # Prioritize named entities by boosting their frequency. Only names made
    # purely of ASCII letters (hence single words) are boosted, so the boosted
    # key lines up with a token produced above.
    for ent in entities:
        name = ent[0]
        if name.isascii() and name.isalpha():
            token_counter[name.lower()] += 5  # Boost entity importance

    # Get the top 10 keywords based on frequency
    return [word for word, _ in token_counter.most_common(10)]

# Load and process each uploaded file: normalise the mention keyword, translate
# the snippet, derive entities / theme / keywords, attach channel metadata and
# write the result next to the notebook as "processed_<name>.csv".
for filename in uploaded_files.keys():
    # Determine if the file is CSV or Excel (case-insensitively, so that
    # e.g. "DATA.CSV" or "Report.XLSX" is not rejected)
    lowered_name = filename.lower()
    if lowered_name.endswith('.csv'):
        df = pd.read_csv(filename)
    elif lowered_name.endswith(('.xlsx', '.xls')):
        df = pd.read_excel(filename)
    else:
        print(f"Skipping unsupported file type: {filename}")
        continue

    # Verify the input schema up front: a missing column would otherwise only
    # surface as a KeyError after the slow translation step has already run.
    required_columns = ['Mention', 'Mention in transcript snippet', 'Channel source code']
    missing_columns = [column for column in required_columns if column not in df.columns]
    if missing_columns:
        print(f"Skipping {filename}: missing required column(s) {missing_columns}")
        continue

    # Apply the strict mapping dictionary to replace Arabic phrases with corresponding English keywords;
    # mentions without an exact match keep their original text
    df['Mention_English'] = df['Mention'].map(mapping_dict).fillna(df['Mention'])

    # Translate the snippets into English
    df['English_Snippet'] = df['Mention in transcript snippet'].progress_apply(translate_snippet)

    # Extract entities from English snippets
    df['Key_Entities'] = df['English_Snippet'].progress_apply(extract_entities)

    # Extract key themes from each row (reads the three columns created above)
    df['Key_Theme'] = df.progress_apply(extract_key_theme, axis=1)

    # Extract and prioritize keywords from each row (also reads 'Key_Theme')
    df['Key_Words'] = df.progress_apply(extract_keywords, axis=1)

    # Add Country and Language columns based on channel source code;
    # channels absent from the channel list are reported as 'Unknown'
    df['Country'] = df['Channel source code'].map(lambda x: channel_mapping.get(x, {}).get('Country', 'Unknown'))
    df['Language'] = df['Channel source code'].map(lambda x: channel_mapping.get(x, {}).get('Language', 'Unknown'))

    # Save the processed DataFrame to a new CSV file. The output is always CSV,
    # so the input extension is replaced rather than kept (an Excel input must
    # not produce CSV content under an .xlsx/.xls name).
    output_filename = f"processed_{filename.rsplit('.', 1)[0]}.csv"
    df.to_csv(output_filename, index=False)
    print(f"Processed data saved to {output_filename}")

Collecting deep-translator
  Downloading deep_translator-1.11.4-py3-none-any.whl.metadata (30 kB)
Downloading deep_translator-1.11.4-py3-none-any.whl (42 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: deep-translator
Successfully installed deep-translator-1.11.4
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m72.5 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart ker

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
