<a href="https://colab.research.google.com/github/Joykw1/NLP_RAG_project/blob/main/Code/Language_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
input_file = "/content/multilingual_qa_results_en_all.csv"

In [None]:
import re

def extract_answers(row):
    """
    Extract the reference answer text and the model answer from a single row.

    The 'original_answer' cell is read as the string form of a dictionary such
    as ``{'text': ['Denver Broncos'], 'answer_start': [177]}``. The first entry
    of its 'text' list is returned. Parsing the literal (instead of matching
    one fixed textual shape) also handles answers that contain an apostrophe
    (which Python renders with double quotes) and lists with several answers.

    Parameters:
    row: A single row from the dataframe (either as Series or dict-like object)
         providing the keys 'original_answer' and 'model_answer'

    Returns:
    tuple: (original_answer_text, model_answer) as strings;
           original_answer_text is "" when no answer text can be found
    """
    import ast  # local import: only this function needs it

    original_answer = str(row['original_answer'])
    model_answer = str(row['model_answer'])

    # Preferred path: the cell is a valid Python literal.
    try:
        parsed = ast.literal_eval(original_answer)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        parsed = None

    if isinstance(parsed, dict):
        texts = parsed.get('text')
        if isinstance(texts, str):
            return texts, model_answer
        if isinstance(texts, (list, tuple)) and texts:
            return str(texts[0]), model_answer

    # Fallback for cells that are not plain literals (for example a list
    # wrapped in ``array(...)``): take the first quoted element after 'text',
    # accepting either quote style.
    pattern = r"""'text':\s*(?:array\()?\[\s*(?:'((?:[^'\\]|\\.)*)'|"((?:[^"\\]|\\.)*)")"""
    match = re.search(pattern, original_answer)
    if match is None:
        return "", model_answer

    single_quoted, double_quoted = match.group(1), match.group(2)
    original_answer_text = single_quoted if single_quoted is not None else double_quoted
    return original_answer_text, model_answer



In [None]:
def is_substring(original_answer_text, model_answer):
    """
    Check if the original_answer_text is a substring of model_answer after preprocessing.

    Preprocessing steps:
    1) Remove all characters that are not letters or numbers (in any script,
       so German umlauts and other non-ASCII letters are kept)
    2) Case-fold the text for caseless comparison

    An original answer that is empty after preprocessing never matches:
    the empty string is technically contained in every string, which would
    otherwise count every model answer as correct.

    Parameters:
    original_answer_text (str): The original answer text
    model_answer (str): The model's answer text

    Returns:
    bool: True if the preprocessed original_answer_text is non-empty and is a
          substring of the preprocessed model_answer
    """
    def _normalize(text):
        # [\W_] is everything that is neither a Unicode letter nor a digit;
        # casefold() is the caseless-matching form of lower() (e.g. ß -> ss).
        return re.sub(r'[\W_]+', '', str(text)).casefold()

    clean_original = _normalize(original_answer_text)
    clean_model = _normalize(model_answer)

    # Guard against the vacuous "'' in anything" match.
    if not clean_original:
        return False

    return clean_original in clean_model

In [None]:
def contains_letters(model_answer):
    """
    Check if the model_answer contains any letters.

    Letters of every script count (Latin including accented characters,
    Cyrillic, Greek, CJK, ...), so answers written in any language are
    recognised as text rather than only ASCII and Russian ones.

    Parameters:
    model_answer (str): The model's answer text (other types are converted
                        with str() first)

    Returns:
    bool: True if the model_answer contains at least one letter, False otherwise
    """
    # str.isalpha() is Unicode-aware, unlike a hand-written character range
    return any(char.isalpha() for char in str(model_answer))

In [None]:
!pip install langdetect

In [None]:
from langdetect import detect

def get_language_code(model_answer):
    """
    Detect the language of the model_answer and return its language code.

    Parameters:
    model_answer (str): The model's answer text

    Returns:
    str: Language code as reported by langdetect (e.g., 'en' for English,
         'ru' for Russian).
         Returns 'unknown' if the input is not a string, has fewer than
         3 characters after stripping whitespace, or detection fails
    """
    # Too little text to classify (also covers non-string values such as None)
    if not isinstance(model_answer, str) or len(model_answer.strip()) < 3:
        return 'unknown'

    # Imported here so the short-input path above does not need the package
    from langdetect.lang_detect_exception import LangDetectException

    # NOTE(review): langdetect's results are understood to vary between runs
    # unless DetectorFactory.seed is set once at start-up - confirm if
    # reproducible output is required.
    try:
        return detect(model_answer)
    except LangDetectException:
        # Raised when the text yields no usable features (e.g. only symbols)
        return 'unknown'

In [None]:
from transformers import pipeline

# Load the language detection model once (outside the function)
# This is efficient as it avoids reloading the model for each detection
language_classifier = pipeline("text-classification",
                              model="papluca/xlm-roberta-base-language-detection")

def get_language_code_hf(model_answer):
    """
    Detect the language of the model_answer using a Hugging Face model.

    Uses the module-level ``language_classifier`` pipeline.

    Parameters:
    model_answer (str): The model's answer text

    Returns:
    str: The label of the first prediction returned by the classifier
         (e.g., 'en' for English, 'ru' for Russian etc)
         Returns 'unknown' for empty or invalid inputs
    """
    # Honour the documented contract instead of classifying nothing
    if not isinstance(model_answer, str) or not model_answer.strip():
        return 'unknown'

    # truncation=True is forwarded to the tokenizer so that very long answers
    # are cut to the model's maximum input length instead of raising.
    result = language_classifier(model_answer, truncation=True)

    # Extract the predicted language code (label)
    return result[0]['label']

In [None]:
!pip install spacy_langdetect

In [None]:
import spacy
from spacy.language import Language
from spacy_langdetect import LanguageDetector

# Load and configure spaCy with language detector once (outside the function)
# This only needs to be done once at the beginning of your script
nlp = spacy.load("en_core_web_sm")  # Load a small model as base

# Define factory for language detection
@Language.factory("language_detector")
def create_language_detector(nlp, name):
    """
    Factory registered with spaCy under the name "language_detector".

    Parameters:
    nlp: Supplied by the factory call; not used here
    name: Supplied by the factory call; not used here

    Returns:
    LanguageDetector: A newly constructed spacy_langdetect LanguageDetector
    """
    return LanguageDetector()

# Add the language detector to the pipeline
nlp.add_pipe("language_detector", last=True)

def get_language_code_spacy(model_answer):
    """
    Detect the language of the model_answer using spaCy and spacy_langdetect.

    Uses the module-level ``nlp`` pipeline, which must contain the
    "language_detector" component.

    Parameters:
    model_answer (str): The model's answer text

    Returns:
    str: Language code (e.g., 'en' for English, 'ru' for Russian)
         Returns 'unknown' for empty or invalid inputs and when the detector
         itself reports an unknown language
    """
    # Honour the documented contract instead of running the pipeline on nothing
    if not isinstance(model_answer, str) or not model_answer.strip():
        return 'unknown'

    # Process the text with spaCy and read the detector's result
    doc = nlp(model_answer)
    code = str(doc._.language['language'])

    # Normalise the failure sentinel to the lower-case 'unknown' used by the
    # other detectors (spacy_langdetect is understood to report failed
    # detection as upper-case 'UNKNOWN'), so it can be filtered consistently.
    if code.upper() == 'UNKNOWN':
        return 'unknown'
    return code

In [None]:
def get_majority_language(hf_lang_code, langdetect_code, spacy_lang_code):
    """
    Pick the language code that most of the three detectors agree on.

    Codes equal to 'unknown' do not take part in the vote.

    Parameters:
    hf_lang_code (str): Language code from Hugging Face model
    langdetect_code (str): Language code from langdetect library
    spacy_lang_code (str): Language code from spaCy detection

    Returns:
    str: The most frequent remaining code; on a tie, the tied code that
         appears first in the order (Hugging Face, langdetect, spaCy);
         'unknown' when every detector returned 'unknown'
    """
    votes = [
        candidate
        for candidate in (hf_lang_code, langdetect_code, spacy_lang_code)
        if candidate != 'unknown'
    ]

    # Nothing left to vote on
    if len(votes) == 0:
        return 'unknown'

    # max() keeps the first of several equally frequent entries, which gives
    # the earliest detector priority on ties.
    return max(votes, key=votes.count)

In [None]:
import pandas as pd
from transformers import pipeline


def evaluate_language_answers(question_lang, input_csv_path, output_csv_path=None):
    """
    Process each row in a CSV file, evaluating answers and language detection.

    For every row the reference answer is extracted and compared with the
    model answer, and a final language label is stored in 'majority_code':

    - the reference answer is contained in the model answer -> question_lang
    - otherwise, the model answer contains letters -> majority vote of the
      langdetect, Hugging Face and spaCy detectors
    - otherwise (no match and no letters) -> 'no_lang'

    Parameters:
    question_lang (str): The language of the question
    input_csv_path (str): Path to the input CSV file
    output_csv_path (str, optional): Path for the output CSV file.
                                    If None, constructs path by adding 'lang_detected_' prefix
                                    to the input file name (same directory)

    Returns:
    pandas.DataFrame: The processed dataframe with added evaluation columns
                      (also written to output_csv_path)
    """
    import os  # local import: only needed to derive the default output path

    # Load the CSV file
    df = pd.read_csv(input_csv_path)

    # Create output path if not provided: same directory, prefixed file name
    if output_csv_path is None:
        directory, filename = os.path.split(input_csv_path)
        output_csv_path = os.path.join(directory, 'lang_detected_' + filename)

    # Create new columns for our analysis
    df['extracted_original_answer'] = ''
    df['has_letters'] = False
    df['is_substring'] = False
    df['langdetect_code'] = ''
    df['hf_lang_code'] = ''
    df['spacy_lang_code'] = ''
    df['majority_code'] = ''

    # Process each row
    for index, row in df.iterrows():
        # 1) Extract original answer and model answer
        original_text, model_text = extract_answers(row)
        df.at[index, 'extracted_original_answer'] = original_text

        # 2) Check if model answer has letters
        has_letters = contains_letters(model_text)
        df.at[index, 'has_letters'] = has_letters

        # 3) Check if original answer is substring of model answer
        is_substring_result = is_substring(original_text, model_text)
        df.at[index, 'is_substring'] = is_substring_result

        # 4) Decide the language label
        if is_substring_result:
            # A correct answer is counted as being in the question language
            majority_code = question_lang
        elif has_letters:
            # Use langdetect
            lang_code = get_language_code(model_text)
            df.at[index, 'langdetect_code'] = lang_code

            # Use Hugging Face model
            hf_lang_code = get_language_code_hf(model_text)
            df.at[index, 'hf_lang_code'] = hf_lang_code

            # Use spaCy
            spacy_lang_code = get_language_code_spacy(model_text)
            df.at[index, 'spacy_lang_code'] = spacy_lang_code

            # Determine the majority language
            majority_code = get_majority_language(hf_lang_code, lang_code, spacy_lang_code)
        else:
            # No match and nothing to run language detection on.
            # (Previously a branch testing the is_substring *function*, which
            # is always truthy, made this case unreachable.)
            majority_code = 'no_lang'

        df.at[index, 'majority_code'] = majority_code

    # Save the results to a new CSV file
    df.to_csv(output_csv_path, index=False)

    # Return the processed dataframe
    return df


In [None]:
a = evaluate_language_answers("en", input_file)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.express as px
import os

# from google.colab import drive
# drive.mount('/content/drive')

def extraction_filename(csv_path):
    """
    Derive a short output prefix from the trailing tokens of a CSV file name.

    The directory part and every '.csv' occurrence are removed, and the rest
    is split on '_'. If the last token contains "all", the last two tokens
    are kept (e.g. "en_all"); otherwise the last three (e.g. "en_ru_de").

    Parameters:
    csv_path (str): Path to the CSV file

    Returns:
    str: The kept tokens joined with '_'
    """
    stem = os.path.split(csv_path)[1].replace('.csv', '')
    parts = stem.split('_')

    print(f"Parts {parts}")

    # Number of trailing tokens that make up the prefix
    keep = 2 if "all" in parts[-1] else 3
    return '_'.join(parts[-keep:])



def extraction_plot_titles(csv_path, language_map):
    """
    Build the "Question / Context" part of a plot title from a CSV path.

    Every '.csv' occurrence is removed from csv_path and the rest is split on
    '_'. If the last token contains "all", the second-to-last token is looked
    up as the question language and the context is the fixed text
    "English, German and Russian". Otherwise the last three tokens are looked
    up as question language and the two context languages.

    Parameters:
    csv_path (str): Path to the CSV file
    language_map (dict): Maps language codes to display names; a missing
                         code raises KeyError

    Returns:
    str: The title fragment
    """
    parts = csv_path.replace('.csv', '').split('_')

    print(f"Parts {parts}")

    # Explicit question language plus two explicit context languages
    if "all" not in parts[-1]:
        return f'Question: {language_map[parts[-3]]}, Context: {language_map[parts[-2]]} and {language_map[parts[-1]]}'

    # "<question>_all": the context covers all three languages
    return f'Question: {language_map[parts[-2]]}, Context: English, German and Russian'



def visualize_language_distribution(csv_path, output_prefix="language_viz"):
    """
    Visualize language distribution and substring matching from the dataset.

    Prints summary statistics, saves a stacked bar chart
    ("<prefix>_bar_chart.png") and an interactive Plotly pie chart
    ("<prefix>_pie_chart_plotly.html") to the current working directory, and
    shows the pie chart.

    Parameters:
    csv_path (str): Path to the CSV file with language detection results
                    (needs the columns 'is_substring', 'has_letters' and
                    'majority_code')
    output_prefix (str): Prefix for output image filenames.
                         NOTE: the passed value is currently not used; the
                         prefix is always derived from csv_path via
                         extraction_filename(). Kept for backward compatibility.

    Returns:
    pandas.DataFrame: One row per language with the columns 'Language',
                      'No Match', 'Match', 'Total' and 'Match Rate' (percent),
                      sorted by 'Total' in descending order
    """
    # Read the CSV file
    df = pd.read_csv(csv_path)

    # Convert boolean columns if they're stored as strings
    for bool_column in ('is_substring', 'has_letters'):
        if df[bool_column].dtype == 'object':
            df[bool_column] = df[bool_column].map({'True': True, 'False': False})

    # Create a language mapping for better readability
    language_map = {
        'en': 'English', 'de': 'German', 'ru': 'Russian', 'fr': 'French',
        'it': 'Italian', 'es': 'Spanish', 'pt': 'Portuguese', 'da': 'Danish',
        'so': 'Somali', 'ca': 'Catalan', 'sw': 'Swahili', 'ro': 'Romanian',
        'pl': 'Polish', 'no': 'Norwegian', 'sv': 'Swedish', 'vi': 'Vietnamese',
        'fi': 'Finnish', 'id': 'Indonesian', 'tl': 'Tagalog', 'et': 'Estonian',
        'sl': 'Slovenian', 'nl': 'Dutch', 'cs': 'Czech', 'af': 'Afrikaans',
        'ur': 'Urdu', 'lt': 'Lithuanian', 'lv': 'Latvian', 'hr': 'Croatian',
        'bg': 'Bulgarian', 'mk': 'Macedonian', 'uk': 'Ukrainian', 'tr' : 'Turkish'
    }

    # Replace language codes with full names where possible
    df['language_name'] = df['majority_code'].map(language_map).fillna(df['majority_code'])

    # Count language occurrences and sort by frequency
    lang_counts = df['language_name'].value_counts()

    # Calculate substring matching by language.
    # The True/False columns are renamed by label *before* any missing column
    # is added, so 'Match' always holds the True counts. (Assigning names by
    # position mislabelled the columns when no row had is_substring == False.)
    lang_substring = (
        df.groupby(['language_name', 'is_substring'])
        .size()
        .unstack(fill_value=0)
        .rename(columns={True: 'Match', False: 'No Match'})
        .reindex(columns=['No Match', 'Match'], fill_value=0)
        .rename_axis(index='Language', columns=None)
        .reset_index()
    )
    lang_substring['Total'] = lang_substring['Match'] + lang_substring['No Match']
    lang_substring['Match Rate'] = (lang_substring['Match'] / lang_substring['Total'] * 100).round(2)

    # Sort by total count
    lang_substring = lang_substring.sort_values('Total', ascending=False)

    # Print summary statistics
    print("=== Summary Statistics ===")
    print(f"Total languages: {len(lang_counts)}")
    print(f"Total answers: {len(df)}")
    print(f"Substring matches: {df['is_substring'].sum()} ({df['is_substring'].mean()*100:.2f}%)")
    print(f"No matches: {len(df) - df['is_substring'].sum()}")
    print("\n=== Top 10 Languages ===")
    print(lang_substring[['Language', 'Total', 'Match', 'No Match', 'Match Rate']].head(10).to_string(index=False))

    ### Visualizations

    # The file prefix and the title suffix both come from the CSV name
    output_prefix = extraction_filename(csv_path)
    title_lang = extraction_plot_titles(csv_path, language_map)

    ## 1. Bar chart of top languages
    plt.figure(figsize=(14, 8))
    top_n = 15  # Show top N languages

    # Get top N languages
    top_langs = lang_substring.head(top_n)

    # Create stacked bar chart
    ax = plt.subplot(111)
    top_langs.plot(x='Language', y=['Match', 'No Match'], kind='bar', stacked=True,
                  color=['#4CAF50', '#F44336'], ax=ax)

    # Add labels and title
    plt.title(f'Top Languages by Frequency and Substring Matching. {title_lang}.', fontsize=16)
    plt.xlabel('Language', fontsize=14)
    plt.ylabel('Number of Answers', fontsize=14)
    plt.xticks(rotation=45, ha='right')

    # Add totals on top of bars
    for i, total in enumerate(top_langs['Total']):
        plt.text(i, total + 0.5, str(total), ha='center', fontsize=10, fontweight='bold')

    # Tight layout and save figure
    plt.tight_layout()
    plt.savefig(f"{output_prefix}_bar_chart.png", dpi=300)

    ## 2. Pie chart of language distribution
    # (drawn with Plotly, so no matplotlib figure is opened here)

    # Group smaller languages as "Other"
    threshold = 5  # Languages with fewer than this count will be grouped
    other_langs = lang_counts[lang_counts < threshold].sum()
    major_langs = lang_counts[lang_counts >= threshold]

    if other_langs > 0:
        pie_data = pd.concat([major_langs, pd.Series({'Other': other_langs})])
    else:
        pie_data = major_langs

    # Prepare the data for Plotly
    pie_data_df = pd.DataFrame(pie_data).reset_index()
    pie_data_df.columns = ['Language', 'Count']

    # Create a pie chart using Plotly Express
    fig = px.pie(pie_data_df,
                values='Count',
                names='Language',
                title='Distribution of Languages',
                hover_data={'Language': True, 'Count': True},
                labels={'Count': 'Count'},  # Rename "Count" label in the legend
                hole=0.3)  # This creates a "donut" chart if needed

    # Customize the pie chart: labels outside the slices, legend shown
    fig.update_traces(textposition='outside',
                      textinfo='percent+label',
                      texttemplate='%{label} %{percent:.1%}',  # Percentages with one decimal place
                      hoverinfo='label+percent+value',
                      marker=dict(line=dict(color='white', width=2)),  # White border between slices
                      showlegend=True)

    # Customizing legend labels to include the count next to the language
    legend_labels = {label: f"{label} ({count})" for label, count in zip(pie_data_df['Language'], pie_data_df['Count'])}

    # Updating the labels in the figure
    fig.data[0].labels = [legend_labels[label] for label in fig.data[0].labels]

    # Adjust the layout for the title and legend
    fig.update_layout(
        # Plotly renders text as HTML, so the line break must be <br>, not \n
        title=f'Distribution of Languages.<br>{title_lang}',
        showlegend=True,
        legend_title='Languages (Count)',
        width=1000,  # Set figure width
        height=1000,  # Set figure height
        title_x=0.5,  # Center the title (0 = left, 1 = right)
        title_y=0.05  # Move title closer or further from the plot
    )

    # Save the chart as an html file
    fig.write_html(f"{output_prefix}_pie_chart_plotly.html")

    # Show the plot
    fig.show()

    # Show summary message
    print(f"\nVisualization complete! Files saved with prefix '{output_prefix}'")
    return lang_substring

# Example usage:
language_data = visualize_language_distribution('lang_detected_multilingual_qa_results_en_ru_de.csv')