In [30]:
##imports
import pandas as pd
import plotly.express as px
from datasets import load_dataset
from plotly.subplots import make_subplots
import os
from transformers import AutoTokenizer
from tqdm import tqdm
import plotly.graph_objects as go
from typing import List
from typing import Awaitable
from tqdm.asyncio import tqdm as async_tqdm
import nest_asyncio
from openai import AsyncOpenAI
import MeCab
import numpy as np
import itertools
import asyncio
from typing import Optional
from tqdm.asyncio import tqdm as async_tqdm
from spacy.lang.ja import Japanese
from spacy.lang.ru import Russian
from spacy.lang.fi import Finnish
import nltk
#download the knbc corpus
nltk.download('knbc')
nltk.download('stopwords')
from nltk.corpus import knbc
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Later cells use paths relative to the parent of "week1" (e.g.
# "dataset/train_df.parquet" and "week1/plots/..."), so step up one level when
# the kernel was started inside the "week1" folder.
current_dir = os.getcwd()
if current_dir.endswith("week1"):
    os.chdir("..")
else:
    # Not inside week1: leave the working directory untouched and report it.
    print("current dir", current_dir)

# Presumably needed so that asyncio.run(...) (see embed_and_save) can be called
# while the notebook's own event loop is already running.
nest_asyncio.apply()

current dir /Users/HansPeter/Dev/ku_local/NLP-course/src/nlp_course


[nltk_data] Downloading package knbc to /Users/HansPeter/nltk_data...
[nltk_data]   Package knbc is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/HansPeter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#Preprocessing
##Load the dataset, embeddings and tokenization

In [6]:
# Use the parquet files written by embed_and_save when they exist; otherwise
# fall back to the raw Hugging Face dataset restricted to fi / ja / ru
# (that fallback has no embedding columns).
if not os.path.exists("dataset/train_df.parquet"):
    print("Dataset should be built, run embed_and_save first")
    ds = load_dataset("coastalcph/tydi_xor_rc")
    ds_train, ds_val = (ds[split].to_pandas() for split in ("train", "validation"))
    ds_train, ds_val = (
        frame[frame['lang'].isin(['fi', 'ja', 'ru'])]
        for frame in (ds_train, ds_val)
    )
    print(len(ds_train))
    print(len(ds_val))
else:
    ds_train = pd.read_parquet("dataset/train_df.parquet")
    ds_val = pd.read_parquet("dataset/val_df.parquet")

In [7]:
#compute embeddings for all
async def embed_chunk(chunk: List[Optional[str]]) -> List[Optional[List[float]]]:
    """Embed a batch of texts with OpenAI's ``text-embedding-ada-002`` model.

    Entries that are not non-empty strings (``None``, ``""``, non-``str``
    values) are not sent to the API; their slot in the result is ``None``.

    Args:
        chunk: Texts to embed. May contain empty or missing entries.

    Returns:
        A list with exactly ``len(chunk)`` items, aligned with ``chunk``:
        the embedding vector for each valid text, ``None`` otherwise.

    Raises:
        ValueError: If the API call fails or returns an unexpected number of
            embeddings. The original exception is attached as ``__cause__``.
    """
    # Positions of the entries that can actually be embedded.
    valid_positions = [
        position for position, text in enumerate(chunk)
        if text and isinstance(text, str)
    ]

    if not valid_positions:
        print("Warning: Empty chunk after filtering")
        return [None] * len(chunk)  # Return None for each original item

    filtered_chunk = [chunk[position] for position in valid_positions]

    # Build the client only when there is something to send, so an all-empty
    # chunk does not need API credentials or open a connection pool.
    client = AsyncOpenAI()

    try:
        response = await client.embeddings.create(input=filtered_chunk, model='text-embedding-ada-002')
        embeddings = [r.embedding for r in response.data]
        if len(embeddings) != len(filtered_chunk):
            raise RuntimeError(
                f"expected {len(filtered_chunk)} embeddings, got {len(embeddings)}"
            )
    except Exception as e:
        print(f"Error embedding chunk: {e}")
        raise ValueError(f"Got exception: {e} on chunk: {filtered_chunk}") from e

    # Scatter the embeddings back to their original positions; filtered-out
    # slots stay None.
    result: List[Optional[List[float]]] = [None] * len(chunk)
    for position, embedding in zip(valid_positions, embeddings):
        result[position] = embedding
    return result


async def embed_all(df: pd.DataFrame):
    """Embed the text columns of ``df`` and attach the vectors as new columns.

    The values of ``question``, ``context``, ``answer_inlang`` and ``answer``
    are laid end to end (column after column), cut into batches of 128,
    embedded concurrently via ``embed_chunk`` and written back as
    ``<column>_embedding`` columns. ``df`` is modified in place and returned.
    """
    cols = ['question', 'context', 'answer_inlang', 'answer']
    batch_size = 128

    async def process_batch(task_id: int, batch: List[Optional[str]]) -> tuple[int, List[Optional[List[float]]]]:
        # Split the batch into "missing" positions and real texts, embed the
        # texts, then put a None placeholder back at every missing position.
        missing_positions = []
        present_texts = []
        for position, text in enumerate(batch):
            if text is None:
                missing_positions.append(position)
            else:
                present_texts.append(text)

        try:
            embeddings = await embed_chunk(present_texts)
        except Exception as e:
            print(f"Error embedding batch: {e}")
            raise e

        # Positions are ascending, so each insert lands at its original index.
        for position in missing_positions:
            embeddings.insert(position, None)
        return task_id, embeddings

    # One long list: all questions, then all contexts, and so on.
    all_texts = [text for col in cols for text in df[col].tolist()]

    batches = []
    for start in range(0, len(all_texts), batch_size):
        batches.append(all_texts[start:start + batch_size])

    pending = [process_batch(task_id=i, batch=batch) for i, batch in enumerate(batches)]

    completed = []
    async for finished in async_tqdm(asyncio.as_completed(pending), total=len(pending), desc="Embedding"):
        completed.append(await finished)

    # Batches finish in arbitrary order; the task id restores the input order.
    ordered = sorted(completed, key=lambda pair: pair[0])

    flattened_embeddings = []
    for _, batch_embeddings in ordered:
        flattened_embeddings.extend(batch_embeddings)

    # Column i owns the i-th slice of len(df) consecutive embeddings.
    n_rows = len(df)
    for col_position, col in enumerate(cols):
        offset = col_position * n_rows
        df[f'{col}_embedding'] = flattened_embeddings[offset:offset + n_rows]

    return df

def embed_and_save(ds_train, ds_val):
    """Embed both splits and write them to ``dataset/{train,val}_df.parquet``.

    Each frame is passed through ``embed_all`` (which adds the embedding
    columns) and then saved without its index. Nothing is returned.
    """
    path = "dataset"
    splits = {"train": ds_train, "val": ds_val}
    for name, df in splits.items():
        df_with_embeddings = asyncio.run(embed_all(df))
        os.makedirs(path, exist_ok=True)
        target = f"{path}/{name}_df.parquet"
        df_with_embeddings.to_parquet(target, index=False)



##Tokenization

In [31]:
#tokenize 

# Manually chosen punctuation "stopwords". Note that '...' is listed twice.
other_stopwords = ['.', '?', '!', ',', ':', ';', '(', ')', '[', ']', '{', '}', '|', '\\', '/', '*', '+', '-', '=', '_', '^', '~', '<', '>', '\"', '\'', '…', '“', '”', '–', '—', '...', '..', '...']
# Variants of every punctuation mark with one trailing / one leading space.
left_over_stopwords = [word + " " for word in other_stopwords]
right_over_stopwords = [" " + word for word in other_stopwords]
# The integers 0..2023 as strings.
numbers_stopwords = [str(i) for i in range(2024)] 


# NOTE(review): only the space-padded punctuation variants and the numbers are
# combined here; the bare punctuation strings in other_stopwords are not
# included. Confirm this is intended.
all_stopwords = left_over_stopwords + right_over_stopwords + numbers_stopwords

# Language-specific lists (duplicates removed via set; order is arbitrary).
nltk_finnish_stopwords = list(set(stopwords.words('finnish')))
# NOTE(review): this is every distinct word of the KNBC corpus (knbc.words()),
# not a curated stopword list. Confirm before using it as a stopword filter.
nltk_japanese_stopwords = list(set(knbc.words()))
nltk_russian_stopwords = list(set(stopwords.words('russian')))
# spaCy's built-in stopword sets per language, extended with all_stopwords.
spacy_ja_stopwords = list(set(Japanese.Defaults.stop_words)) + all_stopwords 
spacy_ru_stopwords = list(set(Russian.Defaults.stop_words)) + all_stopwords
spacy_fi_stopwords = list(set(Finnish.Defaults.stop_words)) + all_stopwords


In [9]:

def tokenize_column(
    df : pd.DataFrame, 
    tokenizer : AutoTokenizer,
    tokenizer_name : str
):
    """Encode the question, context and answer columns with ``tokenizer``.

    For each of the three columns this adds ``<col>_<tokenizer_name>`` (the
    result of ``tokenizer.encode`` on every value) and
    ``<col>_<tokenizer_name>_n_tokens`` (its length). ``df`` is modified in
    place and returned.
    """
    for col in tqdm(['question', 'context', 'answer']):
        encoded_col = f"{col}_{tokenizer_name}"
        df[encoded_col] = df[col].apply(tokenizer.encode)
        df[f"{encoded_col}_n_tokens"] = df[encoded_col].apply(len)
    return df


def spacy_tokenize_column(
    df: pd.DataFrame, 
    lang: str
):
    """Tokenize the ``question`` column with spaCy's tokenizer for ``lang``.

    Only ``question`` is processed: it is the only column written in the
    target language (context and answer are in English).

    The previous body called ``tokenizer.encode``, but ``tokenizer`` is
    neither a parameter nor defined in this function, so it only worked when a
    global of that name happened to exist in the kernel. This version builds
    the spaCy tokenizer that the function name and ``lang`` argument imply.

    Args:
        df: Frame with a ``question`` column of strings. Modified in place.
            Every row is tokenized with the ``lang`` tokenizer, so callers
            should pass rows of that language only.
        lang: One of ``'ja'``, ``'ru'``, ``'fi'``.

    Returns:
        ``df`` with two added columns: ``question_spacy{lang}`` (list of token
        strings per question) and ``question_spacy{lang}_n_tokens`` (its length).

    Raises:
        ValueError: If ``lang`` is not one of the supported codes.
    """
    spacy_languages = {'ja': Japanese, 'ru': Russian, 'fi': Finnish}
    if lang not in spacy_languages:
        raise ValueError(
            f"Unsupported language {lang!r}; expected one of {sorted(spacy_languages)}"
        )

    # A blank language object is enough: only its rule-based tokenizer is used.
    nlp = spacy_languages[lang]()

    tokens_col = f"question_spacy{lang}"
    df[tokens_col] = df['question'].apply(
        lambda text: [token.text for token in nlp.tokenizer(text)]
    )
    df[f"{tokens_col}_n_tokens"] = df[tokens_col].apply(len)
    return df


#ds_train = tokenize_column(ds_train, tokenizer, "meta-llama/Meta-Llama-3.1-8B")
#ds_val = tokenize_column(ds_val, tokenizer, "meta-llama/Meta-Llama-3.1-8B")


In [37]:
# Summarize basic data statistics for the training and validation data
# in each of the languages Finnish (fi), Japanese (ja) and Russian (ru).

# distribution of answerable and not answerable questions
def create_language_plots(train_data, val_data, title_prefix):
    """Draw side-by-side sunburst charts of answerability per language.

    The left panel shows the training data and the right panel the validation
    data. Each sunburst has the language (fi / ja / ru) as the inner ring and
    the ``answerable`` flag as the outer ring, labelled with the share of the
    parent segment.

    The previous version reused the name ``fig`` for the subplot figure and
    for the per-split sunburst figures, so the subplot figure was lost and
    each sunburst tried to add its own trace to itself at row 1, column 1.

    Args:
        train_data: Frame with ``lang`` and ``answerable`` columns.
        val_data: Frame with the same columns.
        title_prefix: Text used in the figure title and both panel titles.

    Returns:
        The two-panel ``plotly`` figure.
    """
    fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]],
                        subplot_titles=(f"{title_prefix} - Training Data", 
                                        f"{title_prefix} - Validation Data"))

    # Only fi, ja, ru
    selected_langs = ['fi', 'ja', 'ru']

    for col_idx, df in enumerate([train_data, val_data], start=1):
        selected_data = df[df['lang'].isin(selected_langs)]
        # The former color_discrete_map was keyed by True/False while the
        # colour column is 'lang', so it never matched; it is omitted here.
        sunburst = px.sunburst(selected_data, path=['lang', 'answerable'],
                               color='lang')
        sunburst.update_traces(textinfo='label+percent parent')
        # Move the finished trace into its own panel of the subplot figure.
        fig.add_trace(sunburst.data[0], row=1, col=col_idx)

    # Update layout
    fig.update_layout(title_text=f"{title_prefix} Language Distribution")

    return fig

import numpy as np


def _relative_frequencies(values, bins):
    """Histogram ``values`` over ``bins`` and scale the counts to sum to 1.

    Returns all zeros (instead of NaN from a 0/0 division) when ``values``
    is empty.
    """
    hist, _ = np.histogram(values, bins=bins)
    total = hist.sum()
    if total == 0:
        return hist.astype(float)
    return hist / total


def create_language_histogram(
    train_data,
    title_prefix,
    token_col='question_meta-llama/Meta-Llama-3.1-8B_n_tokens',
    n_bins=50,
):
    """Plot normalized question-length histograms per language and answerability.

    One panel per language (fi, ja, ru). In each panel the token-count
    distributions of answerable (blue) and non-answerable (red) questions are
    overlaid on shared bins, each normalized to relative frequencies so the
    two groups are comparable regardless of size.

    Args:
        train_data: Frame with ``lang``, a boolean ``answerable`` column and
            the token-count column named by ``token_col``.
        title_prefix: Text prepended to the figure title.
        token_col: Column holding the number of tokens per question. Defaults
            to the Llama 3.1 tokenizer column produced by ``tokenize_column``.
        n_bins: Number of histogram bins shared by both groups.

    Returns:
        The three-panel ``plotly`` figure.
    """
    selected_langs = ['fi', 'ja', 'ru']
    df = train_data[train_data['lang'].isin(selected_langs)]
    
    fig = make_subplots(rows=1, cols=3, subplot_titles=selected_langs)
    
    for i, lang in enumerate(selected_langs, 1):
        lang_data = df[df['lang'] == lang]
        
        answerable_data = lang_data[lang_data['answerable']][token_col]
        non_answerable_data = lang_data[~lang_data['answerable']][token_col]
        
        # Shared bin edges so the two overlaid distributions line up.
        bins = np.histogram_bin_edges(
            np.concatenate([answerable_data, non_answerable_data]), bins=n_bins
        )
        
        answerable_hist = _relative_frequencies(answerable_data, bins)
        non_answerable_hist = _relative_frequencies(non_answerable_data, bins)
        
        # Bars are positioned at the left edge of each bin.
        fig.add_trace(
            go.Bar(x=bins[:-1], y=answerable_hist,
                   name=f'{lang} - Answerable',
                   marker_color='blue',
                   opacity=0.7),
            row=1, col=i
        )
        
        fig.add_trace(
            go.Bar(x=bins[:-1], y=non_answerable_hist,
                   name=f'{lang} - Non-answerable',
                   marker_color='red',
                   opacity=0.7),
            row=1, col=i
        )
        
        # Only the first panel carries the y-axis title.
        fig.update_xaxes(title_text="Number of Tokens", row=1, col=i)
        fig.update_yaxes(title_text="Relative Frequency" if i == 1 else None, row=1, col=i)
    
    fig.update_layout(
        title_text=f"{title_prefix} - Distribution of Question Token Counts by Language and Answerability",
        barmode='overlay',
        height=500,
        width=1200
    )
    
    return fig



In [41]:
from pydantic import BaseModel, Field
import instructor

class TranslationObject(BaseModel):
    '''A source text paired with its translation.'''
    # pydantic exposes the class docstring as the schema description, so it is
    # kept short and meaningful.
    # NOTE(review): confirm that instructor forwards this description to the model.
    original_text: str = Field(description="The text to be translated")
    translated_text: str = Field(description="The translation of the original text")

async def translate_text(
    texts: List[str], 
    lang: str
) -> List[str]:
    """Translate ``texts`` from ``lang`` into English with ``gpt-4o-mini``.

    The texts are sent in a single request, one per line, and the structured
    response is parsed into ``TranslationObject`` items.

    Args:
        texts: Source-language strings to translate.
        lang: Source language as given to the prompt (e.g. ``'ja'``).

    Returns:
        One ``"<original> : <translation>"`` string per item in the model's
        response. The length and order are whatever the model produced and
        are not guaranteed to match ``texts``. An empty ``texts`` returns
        ``[]`` without calling the API.
    """
    if not texts:
        # Nothing to translate: skip client creation and the network call.
        return []

    client = instructor.from_openai(AsyncOpenAI())

    texts_str = '\n'.join(texts)
    response = await client.chat.completions.create(
        model="gpt-4o-mini", #dont change this line
        messages=[
            {
                "role": "system", 
                # The direction must match the user prompt below: {lang} -> English.
                "content": f"You are a helpful assistant that translates text from {lang} to English."
            },
            {
                "role": "user", 
                "content": f"""
                Translate the following texts from {lang} into English:
                {texts_str}
                """
            }
        ],
        response_model=List[TranslationObject]
    )
    return [f"{translation.original_text} : {translation.translated_text}" for translation in response]


In [42]:
#% (b) For each of the languages Finnish, Japanese and Russian, report the 5 most common 
#% words in the questions from the training set. What kind of words are they?

def get_top_words(
    df: pd.DataFrame, 
    lang: str, 
    stopwords: List[str],
    n=5
):
    """Return the ``n`` most frequent non-stopword words in one language's questions.

    Questions where ``df['lang'] == lang`` are tokenized (spaCy's Japanese
    tokenizer for ``'ja'``, NLTK ``word_tokenize`` otherwise), lower-cased,
    and kept only if they are alphanumeric and not in ``stopwords``.

    Args:
        df: Frame with ``lang`` and ``question`` columns. Not modified.
        lang: ``'ja'``, ``'ru'`` or ``'fi'``.
        stopwords: Lower-case words to exclude.
        n: Number of words to return.

    Returns:
        Dict mapping word -> count, ordered by descending count; ties are
        broken alphabetically so the result is deterministic. Empty when no
        token survives filtering (including when no row matches ``lang``).

    Raises:
        ValueError: If ``lang`` is not a supported language code.
    """
    lang_map = {
        'ja': 'japanese',
        'ru': 'russian',
        'fi': 'finnish'
    }
    if lang not in lang_map:
        raise ValueError(
            f"Unsupported language {lang!r}; expected one of {sorted(lang_map)}"
        )

    questions = df.loc[df['lang'] == lang, 'question']

    if lang == 'ja':
        # Built locally: the function used to depend on a global
        # `japanese_tokenizer` that no cell in the notebook defines.
        # NOTE(review): assumes that global was spaCy's Japanese tokenizer.
        japanese_nlp = Japanese()

        def tokenize(text):
            return [token.text for token in japanese_nlp.tokenizer(text)]
    else:
        def tokenize(text):
            return word_tokenize(text, language=lang_map[lang])

    all_tokens = [token for question in questions for token in tokenize(question)]
    print("len before filtering", len(all_tokens))

    # Set lookup instead of scanning the stopword list for every token.
    stopword_set = set(stopwords)
    filtered_tokens = [
        word.lower() for word in all_tokens
        if word.isalnum() and word.lower() not in stopword_set
    ]
    print("len after filtering", len(filtered_tokens))

    if not filtered_tokens:
        return {}

    # np.unique returns the words sorted, so a stable sort on the negated
    # counts yields descending counts with alphabetical tie-breaking.
    unique, counts = np.unique(filtered_tokens, return_counts=True)
    order = np.argsort(-counts, kind='stable')[:n]
    return {str(unique[i]): int(counts[i]) for i in order}



async def visualize_top_tokens(
    df : pd.DataFrame, 
    lang : str, 
    stopwords : List[str],
    n=5
):
    """Show a bar chart of the ``n`` most common question words for ``lang``.

    The words come from ``get_top_words``; the bar labels come from
    ``translate_text`` (``"<word> : <translation>"``). If the translation step
    returns a different number of labels than there are words, the
    untranslated words are used as labels so the chart can still be drawn.

    Args:
        df: Frame with ``lang`` and ``question`` columns.
        lang: Language code passed on to ``get_top_words`` and ``translate_text``.
        stopwords: Words to exclude from the ranking.
        n: Number of words to show.

    Returns:
        ``(fig, top_tokens)``: the plotly figure (already shown) and the
        word -> count dict.
    """
    top_tokens = get_top_words(df, lang, stopwords, n)

    tokens = list(top_tokens.keys())
    counts = list(top_tokens.values())

    labels = list(await translate_text(tokens, lang))
    if len(labels) != len(tokens):
        # The x and y sequences must have the same length for the bar chart.
        print(
            f"Warning: got {len(labels)} translations for {len(tokens)} tokens; "
            "using untranslated tokens as labels"
        )
        labels = tokens

    fig = px.bar(x=labels, y=counts, labels={'x':'Tokens', 'y':'Counts'})
    fig.update_layout(title=f"Top {n} Tokens in {lang}")
    fig.show()
    return fig, top_tokens

for stopwords, lang in [
    (spacy_ja_stopwords, 'ja'),
    (spacy_ru_stopwords, 'ru'),
    (spacy_fi_stopwords, 'fi')
]:
    fig, top_tokens = await visualize_top_tokens(ds_train, lang, stopwords, 5)
    for key in top_tokens.keys():
        key_in = key in stopwords
        print(f"{key} in stopwords: {key_in}")

    path = f"week1/plots/week1_b_top_5_tokens_{lang}.png"
    if os.path.exists(path):
        os.remove(path)
    fig.write_image(path)
    print(top_tokens)

len before filtering 24482
len after filtering 11556


何 in stopwords: False
誰 in stopwords: False
どこ in stopwords: False
日本 in stopwords: False
初めて in stopwords: False
{'何': 528, '誰': 315, 'どこ': 182, '日本': 152, '初めて': 132}
len before filtering 16526
len after filtering 8472


россии in stopwords: False
году in stopwords: False
каком in stopwords: False
март in stopwords: False
первый in stopwords: False
{'россии': 173, 'году': 134, 'каком': 121, 'март': 82, 'первый': 82}
len before filtering 13013
len after filtering 6933


vuonna in stopwords: False
määritellään in stopwords: False
syntynyt in stopwords: False
perustettu in stopwords: False
suomen in stopwords: False
{'vuonna': 227, 'määritellään': 90, 'syntynyt': 54, 'perustettu': 50, 'suomen': 50}


['これは日本語のサンプルテキストです。これは頻出語を見つけるためのテストです。']


In [None]:
#% (c) Implement a rule-based classifier that predicts whether a question is answerable 
#% or impossible, only using the document (context) and question. You may use machine 
#% translation as a component. Use the answerable field to evaluate it on the validation set. 
#% What is the performance of your classifier for each of the languages Finnish, Japanese and Russian?

from abc import ABC, abstractmethod

class RuleBasedClassifier(ABC):
    """Abstract interface for classifiers that judge a question from its context."""

    @abstractmethod
    def classify(self, question: str, context: str) -> bool:
        """Classify a (question, context) pair.

        Concrete subclasses must override this method; the base
        implementation does nothing and returns ``None``.
        """
        return None

class SemanticSimilarityClassifier(RuleBasedClassifier):
    def __init__(self, df: pd.DataFrame):
        self.df = df

    def classify(self, question: str, context: str) -> bool:
        
        
        
        






