In [None]:
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp
import math
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import GridSearchCV
import pandas as pd
from itertools import combinations

In [8]:
def strings_to_position_tokens(strings):

    vocab = sorted(set("".join(strings)))
    vocab_dict = {char: idx for idx, char in enumerate(vocab)}
    vocab_size = len(vocab)

    result = []
    for s in strings:
        one_hot_string = []
        for char in s:
            one_hot = [0] * vocab_size
            one_hot[vocab_dict[char]] = 1
            one_hot_string.append(one_hot)
        result.append(one_hot_string)

    return result, vocab

def strings_to_char_positions(strings, vocab):
    vocab_dict = {char: idx for idx, char in enumerate(vocab)}
    return [
        [vocab_dict[char] for char in s if char in vocab_dict]
        for s in strings
    ]

In [49]:
df = pd.read_csv('1_Ground_truth\\Phoneme_Deleletion_ground_truth_FR.csv')
df = df[df['phase'] == 'REPETITION']
df = df[['config', 'trial_answer_coder2', 'accuracy_coder2']]

In [206]:
df_cleaned = df[~df.apply(lambda row: row.astype(str).str.contains(r"[{}]").any(), axis=1)].copy()

all_strings = df_cleaned['config'].astype(str).tolist() + df_cleaned['trial_answer_coder2'].astype(str).tolist()
_, vocab = strings_to_position_tokens(all_strings)

def string_to_onehot_list(s, vocab_dict, vocab_size):
    return [[1 if i == vocab_dict[c] else 0 for i in range(vocab_size)] for c in s if c in vocab_dict]


def string_to_gaussian_encoding(s, vocab_dict, vocab_size, sigma=1.0):
    def gaussian_vector(center, size, sigma):
        x = np.arange(size)
        gaussian = np.exp(-0.5 * ((x - center) / sigma) ** 2)
        return (gaussian / np.max(gaussian)).tolist()

    return [
        gaussian_vector(vocab_dict[c], vocab_size, sigma)
        for c in s if c in vocab_dict
    ]

vocab_dict = {char: i for i, char in enumerate(vocab)}
vocab_size = len(vocab)

df_cleaned['answered'] = df_cleaned['trial_answer_coder2'].astype(str).apply(lambda s: string_to_gaussian_encoding(s, vocab_dict, vocab_size))
target = strings_to_char_positions(df_cleaned['config'].astype(str).tolist(), vocab)

target_series = pd.Series(target, name='target_tokens')
df_cleaned_reset = df_cleaned.reset_index(drop=True)
target_series_reset = target_series.reset_index(drop=True)

X = pd.concat([df_cleaned_reset['answered'], target_series_reset, pd.Series([None] * len(df_cleaned_reset), name='third_column')], axis=1)

df_cleaned_reset['accuracy_coder2'] = df_cleaned_reset['accuracy_coder2'].fillna(0).astype(int)
y = df_cleaned_reset['accuracy_coder2'].apply(lambda x: int(x)).values

In [207]:
X['answered'] = X['answered'].apply(lambda x: [x])

In [None]:
def NonLinearPenalty(prob, confidence, sharpness=10):
    return 2 / (1 + math.exp(-sharpness * (prob - confidence)))

def WordScore(scores):
    product = 1.0
    for s in scores:
        product *= max(s, 1e-6)
    return product ** (1.0 / len(scores)) if scores else 0

def ordered_combinations(lst, r):
    return [list(combo) for combo in combinations(lst, r)]

def swap_consecutive_elements(lst):
    results = []
    for i in range(len(lst) - 1):
        swapped = lst.copy()
        swapped[i], swapped[i + 1] = swapped[i + 1], swapped[i]
        results.append(swapped)
    return results

In [None]:
class SpeechClassifier_FR(BaseEstimator, ClassifierMixin):
    def __init__(self, p_threshold=0.5, w_threshold=0.5, confidence=0.5, sharpness=1.0):
        self.p_threshold = p_threshold
        self.w_threshold = w_threshold
        self.confidence = confidence
        self.sharpness = sharpness

    def fit(self, X, y):
        self.words_ = X.iloc[:, 0]
        self.targets_ = X.iloc[:, 1]
        self.deletions_ = X.iloc[:, 2]
        self.classes_ = np.unique(y)
        return self

    def predict(self, X):
        words_list = X.iloc[:, 0]
        targets_list = X.iloc[:, 1]
        deletions_list = X.iloc[:, 2] if X.shape[1] > 2 else [None] * len(X)

        predictions = []
        for words, target, deletion in zip(words_list, targets_list, deletions_list):
            result = self.classifier(target, words, deletion)
            predictions.append(int(result))
        return np.array(predictions)

    def classifier(self, target, words, deletion=None):
        S_t = len(target)
        for w in words:
            S_w = len(w)
            if S_w < S_t:
                continue
            for i in range(S_w - S_t + 1):
                Scores = []
                for pw, pt in zip(w[i:i + S_t], target):
                    if deletion is not None and i!=0:
                        if np.argmax(w[i-1]) == deletion:
                            break
                    s = 0
                    prob = pw[pt]
                    if prob > self.confidence:
                        s = 1
                    else:
                        s = NonLinearPenalty(prob, sharpness=self.sharpness, confidence=self.confidence)

                    Scores.append(s)

                if WordScore(Scores) > self.w_threshold:
                    for score in Scores:
                        if score < self.p_threshold:
                            break
                        return True
        return False
    

class SpeechClassifier_IT(BaseEstimator, ClassifierMixin):
    def __init__(self, p_threshold=0.5, w_threshold=0.5, confidence=0.5, sharpness=1.0):
        self.p_threshold = p_threshold
        self.w_threshold = w_threshold
        self.confidence = confidence
        self.sharpness = sharpness

    def fit(self, X, y):
        self.words_ = X.iloc[:, 0]
        self.targets_ = X.iloc[:, 1]
        self.classes_ = np.unique(y)
        return self

    def predict(self, X):
        words_list = X.iloc[:, 0]
        targets_list = X.iloc[:, 1]

        predictions = []
        for words, target in zip(words_list, targets_list):
            result = self.classifier(target, words)
            predictions.extend(result)
        return np.array(predictions)


    def label(self, target, word):
        S_t = len(target)
        S_w = len(word)
        if S_w < S_t:
            return False
        for i in range(S_w - S_t + 1):
            Scores = []
            for pw, pt in zip(w[i:i + S_t], target):
                s = 0
                prob = pw[pt]
                if prob > self.confidence:
                    s = 1
                else:
                    s = NonLinearPenalty(prob, sharpness=self.sharpness, confidence=self.confidence)

                Scores.append(s)

            if WordScore(Scores) > self.w_threshold:
                for score in Scores:
                    if score < self.p_threshold:
                        break
                    return True
        return False
    
    def classifier(self,target,words):
        labels = []
        for i, word in enumerate(words):
            label = self.label(target[i], word)
            # For swaps, one element is classified as True and the other as False
            if label == False and i != len(words) - 1:
                label = self.label(target[i+1], words)
            labels.append(labels)
        return labels

'''def classifier(self, target, words):
    S_ws = len(words)
    S_t = len(target)

    if S_ws < S_t:
        list_permutations = ordered_combinations(target, S_ws)
    else:
        list_permutations = ordered_combinations(words, S_t)

    list_invertions = [swap_consecutive_elements(p) for p in list_permutations]

    list_permutations = [[p] + inv for p, inv in zip(list_permutations, list_invertions)]

    n_rows = max(len(col) for col in list_permutations)
    n_cols = len(list_permutations)
    scores = np.zeros((n_rows, n_cols))

    for j, col in enumerate(list_permutations):
        for i, c_words in enumerate(col):
            total_score = 0
            for k,w in enumerate(c_words):
                word_score = 0
                S_w = len(w)
                if S_w < S_t:
                    continue
                for k in range(S_w - S_t + 1):
                    Scores = []
                    for pw, pt in zip(w[k:k + S_t], target[k]):
                        prob = pw[pt]
                        if prob > self.confidence:
                            s = 1
                        else:
                            s = NonLinearPenalty(prob, sharpness=self.sharpness, confidence=self.confidence)
                        Scores.append(s)
                    word_score = max(word_score, WordScore(Scores))
            scores[i, j] = total_score

    max_idx_flat = np.argmax(scores)
    max_row, max_col = np.unravel_index(max_idx_flat, scores.shape)

    pred = []
    best_words = list_permutations[max_col][max_row]
    for i, word in enumerate(best_words):
        pred.append(self.score(target[i], word))

    return pred'''

In [236]:
X.iloc[:,0]

0       [[[0.1353352832366127, 0.6065306597126334, 1.0...
1       [[[2.572209372642415e-56, 1.3863432936411706e-...
2       [[[0.1353352832366127, 0.6065306597126334, 1.0...
3       [[[2.572209372642415e-56, 1.3863432936411706e-...
4       [[[1.522997974471263e-08, 3.726653172078671e-0...
                              ...                        
2217    [[[2.005008781961654e-37, 5.380186160021138e-3...
2218    [[[1.7556880978548265e-63, 2.572209372642415e-...
2219    [[[1.3838965267367376e-87, 4.0723586257611754e...
2220    [[[2.572209372642415e-56, 1.3863432936411706e-...
2221    [[[5.380186160021138e-32, 5.311092249679095e-2...
Name: answered, Length: 2222, dtype: object

In [None]:
param_grid = {
    'p_threshold': np.arange(0.1, 1.0, 0.2),
    'w_threshold': np.arange(0.1, 1.0, 0.2),
    'confidence': np.arange(0.1, 1.0, 0.2),
    'sharpness': np.arange(1, 10, 2),
}

grid = GridSearchCV(SpeechClassifier_FR(), param_grid, scoring='precision', cv=2)
grid.fit(X, y)

In [230]:
pred = grid.predict(X)

In [231]:
def get_fp_fn(y_true, y_pred):
    false_positives = [i for i, (yt, yp) in enumerate(zip(y_true, y_pred)) if yt == 0 and yp == 1]
    false_negatives = [i for i, (yt, yp) in enumerate(zip(y_true, y_pred)) if yt == 1 and yp == 0]
    return false_positives, false_negatives

fp, fn = get_fp_fn(y, pred)

In [232]:
df_cleaned['trial_answer_coder2'].reset_index(drop=True)[fn]

3                   onda
4                  euval
8                  anari
10                  ikan
11                   ato
              ...       
2216            am améra
2218    pr pro pro rotal
2219           s p pudio
2220             olucion
2221           k aravane
Name: trial_answer_coder2, Length: 1106, dtype: object

In [233]:
df_cleaned['trial_answer_coder2'].reset_index(drop=True)[fp]

69        glotte otre otte
91      gloire gloire oire
121           gatine itine
497                églotte
612               éfrisson
1327               ispalio
1502                aplage
2083                glotus
Name: trial_answer_coder2, dtype: object

In [234]:
grid.best_score_

0.9829645567904581