In [None]:
# Standard library
import asyncio
import os
from collections import Counter
from typing import List, Union, Any
from typing import Awaitable
from typing import AsyncGenerator

# Third-party
import instructor
import nest_asyncio
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from datasets import load_dataset
from instructor import Instructor
# various utils for distilling features 
from openai import OpenAI
from openai import AsyncOpenAI
from plotly.subplots import make_subplots
from pydantic import BaseModel, Field
from tqdm import tqdm
from tqdm.asyncio import tqdm as async_tqdm
from transformers import AutoTokenizer


nest_asyncio.apply()

In [None]:
# Load the "coastalcph/tydi_xor_rc" dataset and convert both splits to DataFrames.
ds = load_dataset("coastalcph/tydi_xor_rc")
ds_train = ds["train"].to_pandas()
ds_val = ds["validation"].to_pandas()
# Row counts, one per line: training first, then validation.
print(len(ds_train), len(ds_val), sep="\n")

In [None]:


# Structured-output schema for one translated item: it is used as the element type of
# `response_model=List[TranslationResponse]` in `translate_chunk`.
# NOTE(review): documented with `#` comments rather than a class docstring because a
# pydantic model's docstring is presumably exported as the schema description, which
# would alter what is sent to the model — confirm before adding one.
class TranslationResponse(BaseModel):
    # Source text, echoed back so a translation can be matched to its input.
    original_text : str = Field(description="The original text that was translated")
    # English rendering of `original_text`.
    translated_text : str = Field(description="The translated text into English from either Finnish, Japanese or Russian")
    
async def embed_chunk(chunk : List[str]) -> List[List[float]]:
    """Embed a batch of strings with the ``text-embedding-ada-002`` model.

    Args:
        chunk: Texts to embed in a single embeddings request.

    Returns:
        One embedding (list of floats) per item in ``response.data``, in the
        order the API returned them. An empty ``chunk`` yields ``[]`` without
        contacting the API.
    """
    if not chunk:
        return []

    client = AsyncOpenAI()
    # The async client's `create` returns a coroutine: it has to be awaited
    # before `.data` exists on the result.
    response = await client.embeddings.create(input=chunk, model='text-embedding-ada-002')
    return [r.embedding for r in response.data]

async def translate_chunk(
    chunk: List[tuple[str, str]]
) -> List[TranslationResponse]:
    """Translate a batch of ``(lang, text)`` pairs to English in one chat completion.

    Args:
        chunk: Pairs of language tag and source text. All pairs are joined into
            a single user message.

    Returns:
        The parsed list of ``TranslationResponse`` objects. The system prompt
        asks the model to keep input order; only the item count is verified here.
        An empty ``chunk`` returns ``[]`` without calling the API.

    Raises:
        ValueError: If the number of returned items differs from ``len(chunk)``.
    """
    if not chunk:
        # Nothing to translate: skip the API round-trip entirely.
        return []

    client = instructor.from_openai(AsyncOpenAI())

    text_chunks = '\n'.join(f'lang: {lang} \n text: {text}' for lang, text in chunk)
    response = await client.chat.completions.create(
        model="gpt-4-0125-preview",
        response_model=List[TranslationResponse],
        messages=[
            {
                "role": "system",
                "content": "You are a translation assistant that translates text from Finnish, Japanese or Russian to English. Please output a list of TranslationResponse objects ordered by the original input list."
            },
            {
                "role": "user",
                "content": f"Translate the following text to English: {text_chunks}"
            }
        ]
    )
    # Explicit check instead of `assert`: assertions are removed under `python -O`,
    # and a short response would silently misalign translations with their rows.
    if len(response) != len(chunk):
        raise ValueError(
            f"Response length ({len(response)}) does not match chunk length ({len(chunk)})"
        )
    return response

async def translate_questions(df: pd.DataFrame):
    """Translate the ``question`` column of ``df`` to English in batches of five rows.

    Each batch is handed to ``translate_chunk``; all batches are scheduled together
    through ``asyncio.as_completed`` and re-ordered by batch id afterwards.

    Args:
        df: Frame with ``lang`` and ``question`` columns. It is modified in place.

    Returns:
        The same ``df`` object with a ``question_translated`` column holding one
        translation per row, in row order. An empty frame is returned untouched.
    """
    batch_size = 5

    if df.empty:
        print("No rows to translate.")
        return df

    async def process_batch(task_id : int, batch) -> "tuple[int, List[TranslationResponse]]":
        # Return the batch id alongside the result so completion order can be undone.
        translations = await translate_chunk(
            list(zip(batch["lang"], batch["question"]))
        )
        return task_id, translations

    batches = [df.iloc[i:i+batch_size] for i in range(0, len(df), batch_size)]
    tasks = [process_batch(task_id=i, batch=batch) for i, batch in enumerate(batches)]

    all_translations = []
    async for translation in async_tqdm(asyncio.as_completed(tasks), total=len(tasks), desc="Translating"):
        all_translations.append(await translation)

    # as_completed yields in finishing order; restore the original batch order.
    all_translations.sort(key=lambda x: x[0])

    translated_texts = [
        item.translated_text
        for _, batch_translations in all_translations
        for item in batch_translations
    ]

    # Assign the whole column positionally. Writing row by row through
    # `df.loc[df.index[i], ...]` is label-based, so rows sharing an index label
    # would overwrite each other.
    df["question_translated"] = translated_texts

    return df


# Translate a 10-row sample of the training set and save it as parquet.
# Best-effort: any failure is reported on stdout instead of aborting the notebook.
try:
    # `.copy()` gives translate_questions an independent frame to add its column to,
    # rather than a slice of `ds_train`.
    translated_df = asyncio.run(translate_questions(ds_train[:10].copy()))
    current_dir = os.getcwd()
    if current_dir.endswith("week1"):
        # When run from inside `week1`, the dataset folder lives one level up.
        output_root = os.path.dirname(current_dir)
    else:
        print("current dir", current_dir)
        output_root = current_dir
    # Build the target path explicitly instead of os.chdir("..") / os.chdir(back):
    # a failed write can then no longer leave the kernel in a different directory.
    dataset_dir = os.path.join(output_root, "dataset")
    os.makedirs(dataset_dir, exist_ok=True)
    translated_df.to_parquet(os.path.join(dataset_dir, "translated_df_train.parquet"), index=False)
    print("Translation completed successfully.")
except Exception as e:
    print(f"An error occurred during translation: {str(e)}")





In [None]:

# (lang, question) pairs for every training row.
data = list(zip(ds_train['lang'], ds_train['question']))
# Show a short preview only: printing the full list floods the cell output.
print(data[:5])

print(ds_train.columns)

In [None]:
# Tokenize
# Load the tokenizer published under "meta-llama/Meta-Llama-3.1-8B"; later cells use
# its `encode` method to count tokens.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B")

In [None]:
def tokenize_column(
    df : pd.DataFrame, 
    tokenizer : Any,
    tokenizer_name : str,
    text_cols : Union[List[str], None] = None,
):
    """Add a token-count column for each text column of ``df``.

    For every column ``col`` in ``text_cols`` a new column named
    ``{col}_{tokenizer_name}_n_tokens`` is written, holding
    ``len(tokenizer.encode(value))`` for each row.

    Args:
        df: Frame containing the text columns. It is modified in place.
        tokenizer: Any object exposing ``encode(text)`` that returns a sized
            sequence (annotated ``Any`` because only that method is used).
        tokenizer_name: Label inserted into the generated column names.
        text_cols: Columns to tokenize. Defaults to
            ``['question', 'context', 'answer']``.

    Returns:
        The same ``df`` object with the count columns added.
    """
    if text_cols is None:
        text_cols = ['question', 'context', 'answer']
    for col in tqdm(text_cols):
        df[f"{col}_{tokenizer_name}_n_tokens"] = df[col].apply(lambda x: len(tokenizer.encode(x)))
    return df

In [None]:
# Summarize basic data statistics for training and validation data in each of
# the languages Finnish (fi), Japanese (ja) and Russian (ru).

# distribution of answerable and not answerable questions
def create_language_plots(train_data, val_data, title_prefix):
    """Build a two-panel sunburst figure of answerability per language.

    The left panel is drawn from ``train_data`` and the right panel from
    ``val_data``. Both are restricted to the languages fi, ja and ru.

    Args:
        train_data: Frame with ``lang`` and ``answerable`` columns.
        val_data: Frame with the same columns.
        title_prefix: Text used in the subplot titles and the figure title.

    Returns:
        The subplot figure containing one sunburst trace per split.
    """
    # Only fi, ja, ru
    selected_langs = ['fi', 'ja', 'ru']

    # Sunburst is a domain-type trace, so both subplot cells are declared as 'domain'.
    fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]],
                        subplot_titles=(f"{title_prefix} - Training Data", 
                                        f"{title_prefix} - Validation Data"))

    for col_idx, split_df in enumerate((train_data, val_data), start=1):
        selected_data = split_df[split_df['lang'].isin(selected_langs)]
        # NOTE(review): color is driven by 'lang' while the discrete map is keyed by
        # True/False — confirm which colouring was intended.
        sunburst = px.sunburst(selected_data, path=['lang', 'answerable'],
                               color='lang',
                               color_discrete_map={True: 'rgba(0,100,0,0.7)', False: 'rgba(220,20,60,0.7)'})
        sunburst.update_traces(textinfo='label+percent parent')
        # Copy the trace into this split's own column of the shared subplot figure.
        # A separate name keeps `fig` bound to the subplot figure.
        fig.add_trace(sunburst.data[0], row=1, col=col_idx)

    # Update layout
    fig.update_layout(title_text=f"{title_prefix} Language Distribution")

    return fig

import numpy as np

def create_language_histogram(train_data, title_prefix, token_col='question_n_tokens', n_bins=50):
    """Plot normalised token-count histograms per language, split by answerability.

    One subplot is drawn for each of fi, ja and ru. Within a subplot the
    answerable and non-answerable rows share the same bin edges, and each
    histogram is scaled to relative frequencies.

    Args:
        train_data: Frame with ``lang``, ``answerable`` and ``token_col`` columns.
        title_prefix: Text prepended to the figure title.
        token_col: Column holding the per-row token count.
        n_bins: Number of histogram bins.

    Returns:
        The subplot figure with two overlaid bar traces per language.
    """
    selected_langs = ['fi', 'ja', 'ru']
    df = train_data[train_data['lang'].isin(selected_langs)]

    fig = make_subplots(rows=1, cols=3, subplot_titles=selected_langs)

    for i, lang in enumerate(selected_langs, 1):
        lang_data = df[df['lang'] == lang]
        # Cast explicitly so `~` is a logical negation whatever the column's dtype.
        is_answerable = lang_data['answerable'].astype(bool)

        groups = (
            ('Answerable', lang_data.loc[is_answerable, token_col], 'blue'),
            ('Non-answerable', lang_data.loc[~is_answerable, token_col], 'red'),
        )

        # Bin edges are computed over both groups so the bars line up one-to-one.
        bins = np.histogram_bin_edges(
            np.concatenate([values for _, values, _ in groups]), bins=n_bins
        )

        for label, values, color in groups:
            counts, _ = np.histogram(values, bins=bins)
            total = counts.sum()
            # A group with no rows has total == 0; keep all-zero bars instead of
            # dividing by zero and plotting NaN.
            rel_freq = counts / total if total > 0 else np.zeros(len(counts))
            fig.add_trace(
                go.Bar(x=bins[:-1], y=rel_freq,
                       name=f'{lang} - {label}',
                       marker_color=color,
                       opacity=0.7),
                row=1, col=i
            )

        # Axis titles; the y-axis title is only shown on the first subplot.
        fig.update_xaxes(title_text="Number of Tokens", row=1, col=i)
        fig.update_yaxes(title_text="Relative Frequency" if i == 1 else None, row=1, col=i)

    fig.update_layout(
        title_text=f"{title_prefix} - Distribution of Question Token Counts by Language and Answerability",
        barmode='overlay',
        height=500,
        width=1200
    )

    return fig


os.makedirs("plots", exist_ok=True)

# The histogram reads `question_n_tokens`. No visible earlier cell creates that
# column, so derive it here when it is missing; this keeps a fresh-kernel
# "Run All" from failing with a KeyError.
if "question_n_tokens" not in ds_train.columns:
    ds_train["question_n_tokens"] = ds_train["question"].apply(
        lambda text: len(tokenizer.encode(text))
    )

# Build the per-language token-count histogram and save it as a PNG.
fig_train = create_language_histogram(ds_train, "Training Data")
fig_train.write_image("plots/week1_a_lang_token_distribution_normalized.png")

# Disabled: sunburst overview of answerability per language for both splits.
# (Kept as `#` comments: a bare string literal at the end of a cell is echoed as output.)
# fig_train = create_language_plots(ds_train, ds_val, "Training Data")
# fig_train.write_image("plots/week1_a_dataset.png")

In [None]:
#% (b) For each of the languages Finnish, Japanese and Russian, report the 5 most common 
#% words in the questions from the training set. What kind of words are they?

#collect all tokens from the questions, and count them
import MeCab

def get_top_words(df: pd.DataFrame, lang: str, n=5):
    """Return the ``n`` most frequent question tokens for one language.

    Japanese (``lang == 'ja'``) questions are segmented with MeCab in wakati
    mode; every other language is split on whitespace.

    Args:
        df: Frame with ``lang`` and ``question`` columns.
        lang: Value of the ``lang`` column to select.
        n: Number of tokens to return.

    Returns:
        A dict mapping token to count, ordered from most to least frequent.
        Tokens with equal counts keep their order of first appearance. The dict
        is empty when no row matches ``lang``.
    """
    questions = df.loc[df['lang'] == lang, 'question']

    if lang == 'ja':
        mecab = MeCab.Tagger("-Owakati")  # Initialize MeCab tokenizer
        tokenize = lambda text: mecab.parse(text).split()
    else:
        # Argument-less split() collapses runs of whitespace, so repeated spaces
        # do not produce empty-string "words" that would be counted.
        tokenize = str.split

    token_counts = Counter()
    for question in questions:
        token_counts.update(tokenize(question))

    return {token: int(count) for token, count in token_counts.most_common(n)}

def visualize_top_tokens(
    df : pd.DataFrame, 
    lang : str, 
    n=5
):
    """Draw and show a bar chart of the most frequent question tokens for ``lang``.

    Returns:
        A ``(figure, counts)`` pair: the bar figure and the token-to-count dict
        obtained from ``get_top_words``.
    """
    token_counts = get_top_words(df, lang, n)
    tokens = list(token_counts.keys())
    frequencies = list(token_counts.values())

    chart = px.bar(x=tokens, y=frequencies, labels={'x':'Tokens', 'y':'Counts'})
    chart.update_layout(title=f"Top {n} Tokens in {lang}")
    chart.show()
    return chart, token_counts

# For each language: show the top-5 token chart, save it as a PNG and print the counts.
# Writes into "plots/", which is created with os.makedirs in an earlier cell.
for lang in ['fi', 'ja', 'ru']:
    fig, top_tokens = visualize_top_tokens(ds_train, lang, 5)
    # One image file per language.
    fig.write_image(f"plots/week1_b_top_5_tokens_{lang}.png")
    print(top_tokens)

In [None]:
#% (c) Implement a rule-based classifier that predicts whether a question is answerable 
#% or impossible, only using the document (context) and question. You may use machine 
#% translation as a component. Use the answerable field to evaluate it on the validation set. 
#% What is the performance of your classifier for each of the languages Finnish, Japanese and Russian?






