In [None]:
##imports
import pandas as pd
import plotly.express as px
from datasets import load_dataset
from plotly.subplots import make_subplots
import os
from transformers import AutoTokenizer
from tqdm import tqdm
import plotly.graph_objects as go
from typing import List
from typing import Awaitable
from tqdm.asyncio import tqdm as async_tqdm
import nest_asyncio
from openai import AsyncOpenAI
import MeCab
import numpy as np
import itertools
import asyncio
from typing import Optional
from tqdm.asyncio import tqdm as async_tqdm
from spacy.lang.ja import Japanese
from spacy.lang.ru import Russian
from spacy.lang.fi import Finnish
import nltk
#download the knbc corpus
nltk.download('knbc')
nltk.download('stopwords')
from nltk.corpus import knbc
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

current_dir = os.getcwd()
if current_dir.endswith("code"):
    os.chdir("..")
else:
    print("current dir", current_dir)

nest_asyncio.apply()

#Preprocessing
##Load the dataset, embeddings and tokenization

In [61]:
if os.path.exists("dataset/train_df.parquet"):
    ds_train = pd.read_parquet("dataset/train_df.parquet")
    ds_val = pd.read_parquet("dataset/val_df.parquet")
else:
    print("Dataset should be built, run embed_and_save first")
    #ds_train, ds_val = embed_and_save(ds_train, ds_val)
    ds = load_dataset("coastalcph/tydi_xor_rc")
    ds_val = ds["validation"].to_pandas()
    ds_train = ds["train"].to_pandas()
    ds_train = ds_train[ds_train['lang'].isin(['fi', 'ja', 'ru'])]
    ds_val = ds_val[ds_val['lang'].isin(['fi', 'ja', 'ru'])]
    print(len(ds_train))
    print(len(ds_val))

In [60]:
#NOTE not currently used
def hf_tokenize_column(
    df : pd.DataFrame, 
    tokenizer : AutoTokenizer,
    tokenizer_name : str
):
    text_cols = ['question', 'context', 'answer']
    for col in tqdm(text_cols):
        df[f"{col}_{tokenizer_name}"] = df[col].apply(lambda x: tokenizer.encode(x))
        df[f"{col}_{tokenizer_name}_n_tokens"] = df[f"{col}_{tokenizer_name}"].apply(lambda x: len(x))
    return df


def tokenize_column(
    df : pd.DataFrame,
) -> pd.DataFrame:
    lang_map = {
        'ja': 'japanese',
        'ru': 'russian',
        'fi': 'finnish'
    }
    #japanese_tokenizer = Japanese()
    japanese_tokenizer = MeCab.Tagger("-Owakati")
    def tokenize_by_lang(row):
        if row['lang'] == 'ja':
            return japanese_tokenizer.parse(row['question']).split()
        else:
            return word_tokenize(row['question'], language=lang_map.get(row['lang'], 'english'))
    
    df['question_tokens'] = df.apply(tokenize_by_lang, axis=1)
    df['question_n_tokens'] = df['question_tokens'].apply(lambda x: len(x))
    df['context_tokens'] = df['context'].apply(lambda x: word_tokenize(x, language='english'))
    df['context_n_tokens'] = df['context_tokens'].apply(lambda x: len(x))
    print("context n tokens", df["context_n_tokens"])
    return df

if not 'question_tokens' in ds_train.columns:
    ds_train = tokenize_column(ds_train)
    ds_val = tokenize_column(ds_val)
    ds_train.to_parquet("dataset/train_df.parquet", index=False)
    ds_val.to_parquet("dataset/val_df.parquet", index=False)

# ... existing code ...


0       [As, the, Soviets, advanced, through, Poland, ...
1       [Another, consequence, of, the, Carlist, defea...
2       [The, traditional, date, for, the, founding, o...
3       [Currently, there, is, no, official, capital, ...
4       [Before, 1914, ,, the, Kingdom, of, Italy, bui...
                              ...                        
6405    [decided, to, hold, a, group, stage, ., All, t...
6406    [films, were, handled, by, Paramount, ,, in, c...
6407    [``, The, Collapse, of, Engineer, Garin, '', i...
6408    [Günter, Wilhelm, Grass, (, Günter, Wilhelm, G...
6409    [Angelika, Kallio, (, born, September, 15, ,, ...
Name: context_tokens, Length: 6410, dtype: object
0       233
1        99
2       101
3       122
4        59
       ... 
6405    137
6406    160
6407    151
6408    157
6409    129
Name: context_n_tokens, Length: 6410, dtype: int64


In [36]:
#compute embeddings for all
async def embed_chunk(chunk: List[str]) -> Awaitable[List[float]]:
    client = AsyncOpenAI()
    
    # Filter out empty strings and None values
    filtered_chunk = [text for text in chunk if text and isinstance(text, str)]
    
    if not filtered_chunk:
        print("Warning: Empty chunk after filtering")
        return [None] * len(chunk)  # Return None for each original item
    
    try:
        response = await client.embeddings.create(input=filtered_chunk, model='text-embedding-ada-002')
        embeddings = [r.embedding for r in response.data]
        
        # Pad the result with None for any filtered out items
        result = []
        filtered_index = 0
        for item in chunk:
            if item and isinstance(item, str):
                result.append(embeddings[filtered_index])
                filtered_index += 1
            else:
                result.append(None)
        
        return result
    except Exception as e:
        print(f"Error embedding chunk: {e}")
        raise ValueError(f"Got exception: {e} on chunk: {filtered_chunk}")

async def embed_all(df: pd.DataFrame):
    cols = ['question', 'context', 'answer_inlang', 'answer']
    batch_size = 128

    async def process_batch(task_id: int, batch: List[Optional[str]]) -> tuple[int, List[Optional[List[float]]]]:
        None_indices = [i for i, text in enumerate(batch) if text is None]
        batch = [text for text in batch if text is not None]
        try:
            embeddings = await embed_chunk(batch)
        except Exception as e:
            print(f"Error embedding batch: {e}")
            raise e
        
        for i in None_indices:
            embeddings.insert(i, None)
        return task_id, embeddings

    all_texts = []
    for col in cols:
        all_texts.extend(df[col].tolist())

    batches = [all_texts[i:i+batch_size] for i in range(0, len(all_texts), batch_size)]

    all_embeddings = []

    tasks = [process_batch(task_id=i, batch=batch) for i, batch in enumerate(batches)]

    async for embedding in async_tqdm(asyncio.as_completed(tasks), total=len(tasks), desc="Embedding"):
        all_embeddings.append(await embedding)

    # Sort embeddings by task_id to maintain order
    all_embeddings.sort(key=lambda x: x[0])
    
    # Flatten the embeddings list
    flattened_embeddings = [emb for _, batch_emb in all_embeddings for emb in batch_emb]

    # Assign embeddings to the correct columns in the DataFrame
    for i, col in enumerate(cols):
        start_idx = i * len(df)
        end_idx = (i + 1) * len(df)
        df[f'{col}_embedding'] = flattened_embeddings[start_idx:end_idx]

    return df

def embed_and_save(ds_train, ds_val):
    for name, df in [("train", ds_train), ("val", ds_val)]:
        df_with_embeddings = asyncio.run(embed_all(df))    
        path = "dataset"
        os.makedirs(path, exist_ok=True)
        df_with_embeddings.to_parquet(f"{path}/{name}_df.parquet", index=False)



##Tokenization

In [63]:
#tokenize 
from nltk.corpus import stopwords
# a list of manually set stopwords
other_stopwords = ['.', '?', '!', ',', ':', ';', '(', ')', '[', ']', '{', '}', '|', '\\', '/', '*', '+', '-', '=', '_', '^', '~', '<', '>', '\"', '\'', '…', '“', '”', '–', '—', '...', '..', '...']
#add spacing to stopwords
left_over_stopwords = [word + " " for word in other_stopwords]
right_over_stopwords = [" " + word for word in other_stopwords]
numbers_stopwords = [str(i) for i in range(2024)] 


all_stopwords = left_over_stopwords + right_over_stopwords + numbers_stopwords

nltk_finnish_stopwords = list(set(stopwords.words('finnish')))
nltk_japanese_stopwords = list(set(knbc.words()))
nltk_russian_stopwords = list(set(stopwords.words('russian')))
#get stop words from spacy
spacy_ja_stopwords = list(set(Japanese.Defaults.stop_words)) + all_stopwords 
spacy_ru_stopwords = list(set(Russian.Defaults.stop_words)) + all_stopwords
spacy_fi_stopwords = list(set(Finnish.Defaults.stop_words)) + all_stopwords


In [64]:
from pydantic import BaseModel, Field
import instructor

class TranslationObject(BaseModel):
    '''i pydantic model for translating'''
    original_text: str = Field(description="The text to be translated")
    translated_text: str = Field(description="The translation of the original text")

async def translate_text(
    texts: List[str], 
    lang: str
) -> List[str]:
    client = instructor.from_openai(AsyncOpenAI())

    texts_str = '\n'.join(texts)
    response = await client.chat.completions.create(
        model="gpt-4o-mini", #dont change this line
        messages=[
            {
                "role": "system", 
                "content": f"You are a helpful assistant that translates text from English to {lang}."
            },
            {
                "role": "user", 
                "content": f"""
                Translate the following texts from {lang} into English:
                {texts_str}
                """
            }
        ],
        response_model=List[TranslationObject]
    )
    return [f"{translation.original_text} : {translation.translated_text}" for translation in response]


In [70]:
#% (b) For each of the languages Finnish, Japanese and Russian, report the 5 most common 
#% words in the questions from the training set. What kind of words are they?

def get_top_words(
    df: pd.DataFrame, 
    lang: str, 
    stopwords: List[str],
    n=5
):
    df_lang = df[df['lang'] == lang].copy()
    
    # Flatten the list of tokens
    all_tokens = [token for tokens in df_lang['question_tokens'] for token in tokens]
    
    # Filter out stopwords and non-alphanumeric tokens
    filtered_tokens = [token.lower() for token in all_tokens if token.lower() not in stopwords and token.isalnum()]
    
    # Count occurrences
    token_counts = pd.Series(filtered_tokens).value_counts()
    
    top_tokens = token_counts.head(n)
    return dict(top_tokens)

async def visualize_top_tokens(
    df : pd.DataFrame, 
    lang : str, 
    stopwords : List[str],
    n=5
):
    top_tokens = get_top_words(df, lang, stopwords, n)
    
    
    top_tokens_list = list(top_tokens.keys())
    top_tokens_with_translation = await translate_text(top_tokens_list, lang)
    
    fig = px.bar(x=list(top_tokens_with_translation), y=list(top_tokens.values()), labels={'x':'Tokens', 'y':'Counts'})
    fig.update_layout(title=f"Top {n} Tokens in {lang}")
    fig.show()
    return fig, top_tokens

for stopwords, lang in [
    (spacy_ja_stopwords, 'ja'),
    (spacy_ru_stopwords, 'ru'),
    (spacy_fi_stopwords, 'fi')
]:
    fig, top_tokens = await visualize_top_tokens(ds_train, lang, stopwords, 5)
    for key in top_tokens.keys():
        key_in = key in stopwords
        print(f"{key} in stopwords: {key_in}")

    path = f"plots/week1_b_top_5_tokens_{lang}.png"
    if os.path.exists(path):
        os.remove(path)
    fig.write_image(path)
    print(top_tokens)

何      524
誰      315
どこ     182
日本     152
初めて    132
Name: count, dtype: int64


何 in stopwords: False
誰 in stopwords: False
どこ in stopwords: False
日本 in stopwords: False
初めて in stopwords: False
{'何': 524, '誰': 315, 'どこ': 182, '日本': 152, '初めて': 132}
россии    173
году      134
каком     121
март       82
первый     82
Name: count, dtype: int64


россии in stopwords: False
году in stopwords: False
каком in stopwords: False
март in stopwords: False
первый in stopwords: False
{'россии': 173, 'году': 134, 'каком': 121, 'март': 82, 'первый': 82}
vuonna          227
määritellään     90
syntynyt         54
perustettu       50
suomen           50
Name: count, dtype: int64


vuonna in stopwords: False
määritellään in stopwords: False
syntynyt in stopwords: False
perustettu in stopwords: False
suomen in stopwords: False
{'vuonna': 227, 'määritellään': 90, 'syntynyt': 54, 'perustettu': 50, 'suomen': 50}


In [71]:
#% (c) Implement a rule-based classifier that predicts whether a question is answerable 
#% or impossible, only using the document (context) and question. You may use machine 
#% translation as a component. Use the answerable field to evaluate it on the validation set. 
#% What is the performance of your classifier for each of the languages Finnish, Japanese and Russian?

from abc import ABC, abstractmethod

class RuleBasedClassifier(ABC):
    @abstractmethod
    def classify(self, question: str) -> bool:
        pass
    
    @abstractmethod
    def evaluate(self) -> float:
        pass

class SemanticSimilarityClassifier(RuleBasedClassifier):
    def __init__(
        self, 
        train_df: pd.DataFrame, 
        val_df: pd.DataFrame,
        lang: str,
        dims: int = 1536
    ):
        '''
        train_df: pd.DataFrame  the training data
        val_df: pd.DataFrame the validation data
        dims: int  the number of dimensions we use for the semantic space 
        note that openai uses a matryoshka embedding model so we can 
        slice the embedding vector and still get a good representation
        '''
        self.dims = dims
        self.lang = lang
        self.train_df = train_df[train_df['lang'] == lang]
        self.val_df = val_df[val_df['lang'] == lang]
        
    def collect_data(self):
        question_embedding = np.stack(self.train_df['question_embedding'].values)
        context_embedding = np.stack(self.train_df['context_embedding'].values)
        answerables = np.stack(self.train_df['answerable'].values)
        print(question_embedding.shape)
        print(context_embedding.shape)
        distances = np.linalg.norm(question_embedding - context_embedding, axis=1)
        
        return answerables, distances
        
        
    def classify(
        self, 
        question: List[str], 
        is_train: bool = False
    ) -> tuple[bool, float]:
        '''find the question embedding from the question'''
        if is_train:
            row = self.train_df[self.train_df['question'].isin(question)]
        else:
            row = self.val_df[self.val_df['question'].isin(question)]
            
        question_embedding = row['question_embedding'].values[0][:self.dims]
        context_embedding = row['context_embedding'].values[0][:self.dims]
        
        print(type(question_embedding))
        print(type(context_embedding))
        #we measure euclidean normalized distance between question and context
        dist = np.linalg.norm(question_embedding - context_embedding)
        return bool(dist < 0.5), dist 
            
    def evaluate(self, question: str):
        pass
    

In [73]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from typing import Literal

class TokenCountClassifier(RuleBasedClassifier):
    def __init__(
        self, 
        train_df: pd.DataFrame, 
        val_df: pd.DataFrame, 
    ):
        self.train_df = train_df
        self.val_df = val_df
        self.lang_map = {
            'ja': 'Japanese',
            'ru': 'Russian',
            'fi': 'Finnish'
        }

        
    def prepare_data(
        self, 
        questions: List[str], 
        is_train: bool = True,
        lang: Literal['ja', 'ru', 'fi', '_all'] = '_all',
    ) -> tuple[np.ndarray, np.ndarray]:
        if is_train:
            df = self.train_df
        else:
            df = self.val_df
            
        if lang != '_all':
            df = df[df['lang'] == lang]
            
        df = df[df['question'].isin(questions)]

        return df['question_n_tokens'].values, df['context_n_tokens'].values
        
    def gen_scatter_plot(
        self, 
        lang: Literal['ja', 'ru', 'fi', '_all'] = '_all'
    ):
        df = self.train_df
        if lang != '_all':
            df = df[df['lang'] == lang]
            
        question_tokens, context_tokens = self.prepare_data(
            df['question'].tolist(), 
            is_train=True, 
            lang=lang
        )

        answerable = df['answerable'].tolist()
        return px.scatter(
            title=f"Scatter plot of question and context tokens for {self.lang_map[lang] if lang != '_all' else 'all'} languages",
            x=question_tokens, 
            y=context_tokens, 
            color=answerable,
            color_discrete_map={False: 'blue', True: 'red'},
            labels={'x':'Question Tokens', 'y':'Context Tokens', 'color':'Answerable'}
        )
        
    def gen_plot(
        self, 
        lang: Literal['ja', 'ru', 'fi', '_all'] = '_all'
    ):
        df = self.train_df
        if lang != '_all':
            df = df[df['lang'] == lang]
            
        question_tokens, context_tokens = self.prepare_data(
            df['question'].tolist(), 
            is_train=True, 
            lang=lang
        )

        ratio = question_tokens / context_tokens
        return px.histogram(
            ratio, 
            title=f"Histogram of question to context token ratio for {self.lang_map[lang] if lang != '_all' else 'all'} languages",
            labels={'x':'Question to Context Token Ratio', 'y':'Count'},
            color=df['answerable'].tolist(),
            color_discrete_map={False: 'blue', True: 'red'}
        )
        
    def classify(
        self, 
        questions: List[str], 
        is_train: bool = True,
        lang: Literal['ja', 'ru', 'fi', '_all'] = '_all'
    ) -> np.ndarray:
        question_tokens, context_tokens = self.prepare_data(questions, is_train, lang)
        
        return question_tokens < 15
        

    def evaluate(self, lang: Literal['ja', 'ru', 'fi', '_all'] = '_all'):
        questions = self.val_df['question'].tolist()
        preds = self.classify(questions, is_train=False, lang=lang)
        if lang != '_all':
            labels = self.val_df[self.val_df['lang'] == lang]['answerable'].tolist()
        else:
            labels = self.val_df['answerable'].tolist()
            
        accuracy = accuracy_score(labels, preds)
        f1 = f1_score(labels, preds)
        precision = precision_score(labels, preds)
        recall = recall_score(labels, preds)
        # Create a confusion matrix
        cm = confusion_matrix(labels, preds)
        
        # Create a heatmap of the confusion matrix
        fig = px.imshow(cm,
                        labels=dict(x="Predicted", y="Actual", color="Count"),
                        x=['Not Answerable', 'Answerable'],
                        y=['Not Answerable', 'Answerable'],
                        text_auto=True,
                        color_continuous_scale='Blues')
        
        fig.update_layout(
            title=f"Confusion Matrix for {self.lang_map[lang] if lang != '_all' else 'all'} languages",
            xaxis_title="Predicted",
            yaxis_title="Actual"
        )
        
        # Save the figure
        path = f"plots/week1_c_confusion_matrix_{lang}.png"
        if os.path.exists(path):
            os.remove(path)
        fig.write_image(path)
        
        return fig, {'accuracy': accuracy, 'f1': f1, 'precision': precision, 'recall': recall}


classifier = TokenCountClassifier(ds_train, ds_val)
question = ds_train['question']
print(classifier.evaluate())

for lang in ['ja', 'ru', 'fi']:
    """ fig = classifier.gen_scatter_plot(lang=lang)
    fig.show()
    path = f"week1/plots/week1_c_scatter_{lang}.png"
    if os.path.exists(path):
        os.remove(path)
    fig.write_image(path) """
    vals = classifier.evaluate(lang)
    vals[0].show()
    
    
    print(f"{lang}: {vals[1]}")



(Figure({
    'data': [{'coloraxis': 'coloraxis',
              'hovertemplate': 'Predicted: %{x}<br>Actual: %{y}<br>Count: %{z}<extra></extra>',
              'name': '0',
              'texttemplate': '%{z}',
              'type': 'heatmap',
              'x': [Not Answerable, Answerable],
              'xaxis': 'x',
              'y': [Not Answerable, Answerable],
              'yaxis': 'y',
              'z': array([[ 28, 368],
                          [ 43, 941]])}],
    'layout': {'coloraxis': {'colorbar': {'title': {'text': 'Count'}},
                             'colorscale': [[0.0, 'rgb(247,251,255)'], [0.125,
                                            'rgb(222,235,247)'], [0.25,
                                            'rgb(198,219,239)'], [0.375,
                                            'rgb(158,202,225)'], [0.5,
                                            'rgb(107,174,214)'], [0.625,
                                            'rgb(66,146,198)'], [0.75,
            

ja: {'accuracy': 0.6140350877192983, 'f1': 0.7411764705882353, 'precision': 0.6511627906976745, 'recall': 0.8600682593856656}


ru: {'accuracy': 0.75, 'f1': 0.8571428571428571, 'precision': 0.7538071065989848, 'recall': 0.9933110367892977}


fi: {'accuracy': 0.7424242424242424, 'f1': 0.8521739130434782, 'precision': 0.7424242424242424, 'recall': 1.0}
