In [None]:
import pandas as pd
import numpy as np
from scipy.sparse import lil_matrix

In [None]:
# 1. Word generator class (alphabet, n-grams)
# 2. N-gram class
# 3. Perplexity computation class (?)
# 4. Statistics class (?)
# 5. Class for training and running the model (not only regression, but the other models too)
# 6. Perplexity class

# TODO: maybe use a third-party library for tokenization (building n-grams)
# TODO: third-party library for tests (PyTest)
# TODO: maybe try a neural network out of curiosity (or simply write a training module)

In [None]:
from abc import abstractmethod

In [None]:
# TODO: the various type checks can be covered with tests

In [None]:
class SpecificTypeError(TypeError):
  """TypeError whose message names the expected type and the received one."""

  def __init__(self, real_type, expected_type):
    # Both arguments are interpolated as given (callers pass type names).
    super().__init__(f'{expected_type} expected instead of {real_type}')

In [None]:
class EmptyError(ValueError):
  """ValueError reporting that an object of the named type is empty."""

  def __init__(self, type_object):
    # ``type_object`` is interpolated as given (callers pass a type name).
    super().__init__(f'{type_object} is empty')

In [None]:
# There is some alphabet.
# There is a method that generates words.

class WordGenerator:
  """Base class for word generators that work from a list of source words.

  The class does not derive from ``abc.ABC``, so ``@abstractmethod`` only
  marks ``generate_word`` as the method subclasses are meant to override;
  it does not prevent instantiation.
  """
  _words = []

  def __init__(self, words):
    """Store ``words``; only an exact ``list`` is accepted.

    Raises:
      SpecificTypeError: if ``words`` is not exactly of type ``list``.
    """
    words_type = type(words)
    if words_type is not list:
      raise SpecificTypeError(words_type.__name__, list.__name__)
    self._words = words

  @abstractmethod
  def generate_word(self):
    """Produce one word; the base implementation does nothing."""
    return None

In [None]:
from collections import Counter
from random import randint

class RandomAlphabetGenerator(WordGenerator):
  """Builds words of random length from letters drawn uniformly from an alphabet.

  The alphabet is the set of lower-cased letters of the source words and has
  to be built with ``make_alphabet()`` before ``generate_word()`` is called.
  """
  _alphabet = []
  _max_len = -1
  _min_len = -1

  def __init__(self, words, min_len, max_len):
    """Store the source words and the inclusive bounds of the word length."""
    WordGenerator.__init__(self, words)
    self._min_len, self._max_len = min_len, max_len

  def make_alphabet(self):
    """Collect the distinct lower-cased letters of all source words."""
    self._alphabet = []

    for word in self._words:
      for letter in word:
        self._alphabet.append(letter.lower())

    # Counter keeps first-occurrence order, so this de-duplicates in place.
    self._alphabet = list(Counter(self._alphabet))

  def generate_word(self):
    """Return a random word whose length lies in [min_len, max_len].

    Raises:
      EmptyError: if the alphabet is empty.
    """
    alphabet = self._alphabet
    if len(alphabet) == 0:
      raise EmptyError(type(alphabet).__name__)

    word_len = randint(self._min_len, self._max_len)
    last_index = len(alphabet) - 1

    return ''.join(alphabet[randint(0, last_index)] for _ in range(word_len))

In [None]:
class Matrix:
  """Wrapper around a sparse matrix that hands out rows as dense arrays."""
  __matrix = lil_matrix

  def __init__(self, matrix):
    """Keep a reference to ``matrix`` (no copy is made)."""
    self.__matrix = matrix

  def __getitem__(self, row):
    """Return row ``row`` converted with ``getrow(row).toarray()``."""
    sparse_row = self.__matrix.getrow(row)
    return sparse_row.toarray()

In [None]:
# TODO: test that n-grams are built correctly.

class Ngrammer:
  """Namespace for the n-gram construction helper."""

  @staticmethod
  def make_ngram(tokens, n):
    """Return every run of ``n`` consecutive tokens, joined with single spaces.

    ``tokens`` may be any sliceable sequence of strings (a list of tokens or a
    plain string, whose characters are then the tokens). An empty list is
    returned when ``tokens`` is shorter than ``n``.
    """
    window_count = len(tokens) - n + 1
    return [' '.join(tokens[start:start + n]) for start in range(window_count)]

In [None]:
class AbstractNgramModel:
  """Shared state and read accessors for n-gram models.

  The class attributes below are placeholders holding the intended type of
  each field; ``__init__`` replaces ``_n`` and ``_words`` on the instance.
  The accessors are properties, so they are read without parentheses
  (``model.get_rows``).
  """
  _n = int
  _words = list
  _matrix = Matrix
  _rows = list
  _cols = list
  _rows_id = dict
  _cols_id = dict
  _ngrams = list

  @property
  def get_matrix(self):
    """Current value of ``_matrix``."""
    return self._matrix

  @property
  def get_rows(self):
    """Current value of ``_rows``."""
    return self._rows

  @property
  def get_cols(self):
    """Current value of ``_cols``."""
    return self._cols

  @property
  def get_rows_id(self):
    """Current value of ``_rows_id``."""
    return self._rows_id

  @property
  def get_cols_id(self):
    """Current value of ``_cols_id``."""
    return self._cols_id

  @property
  def get_ngrams(self):
    """Current value of ``_ngrams``."""
    return self._ngrams

  def __init__(self, n, words):
    """Store the order ``n`` and the training ``words``.

    Raises:
      ValueError: if ``words`` is not exactly of type ``list``.
    """
    words_type = type(words)
    if words_type is not list:
      raise ValueError(f'List expected instead of {words_type.__name__}.')
    self._n, self._words = n, words

  def _make_ngrams(self, tokens, n):
    """Delegate n-gram construction to ``Ngrammer.make_ngram``."""
    return Ngrammer.make_ngram(tokens, n)

  @abstractmethod
  def make_model(self):
    """Build the model; the base implementation does nothing."""
    return None

In [None]:
class NgramModelCreator:
  """Factory that selects the n-gram model class matching the order ``n``."""
  __n = -1
  __words = list

  def __init__(self, n, words):
    """Remember the model order and the training words.

    Raises:
      ValueError: if ``n`` is not exactly of type ``int``.
    """
    if type(n) is not int:
      # The check is about ``n``, so the message has to name ``int``
      # (it used to say "List expected").
      raise ValueError(f'int expected instead of {type(n).__name__}')
    self.__n = n
    self.__words = words

  # The annotation is quoted so that defining this class does not require
  # AbstractNgramModel to be defined already.
  @property
  def get_model(self) -> 'AbstractNgramModel':
    """A new, not yet built model: ``UnigramModel`` for n == 1, else ``NgramModel``."""
    if self.__n == 1:
      return UnigramModel(self.__n, self.__words)
    return NgramModel(self.__n, self.__words)

In [None]:
class UnigramModel(AbstractNgramModel):
  """Model for n == 1; building it is not implemented."""

  def __init__(self, n=1, words=None):
    """Initialise the model.

    Two call forms are accepted:
      * ``UnigramModel(n, words)`` - the same signature as ``NgramModel``,
        which is how ``NgramModelCreator`` calls it;
      * ``UnigramModel(words)`` - the original single-argument form.

    The previous implementation forwarded only ``words`` to the base
    initializer, which needs both ``n`` and ``words``, so every construction
    failed with TypeError.

    Raises:
      ValueError: (from the base initializer) if ``words`` is not a list.
    """
    if words is None and type(n) is list:
      # Single-argument form: the word list arrived in the first slot.
      n, words = 1, n
    AbstractNgramModel.__init__(self, n, words)

  def make_model(self):
    """Always raises: the unigram model cannot be built yet."""
    raise ValueError('This method is not implemented.')

In [None]:
# TODO: maybe check the statistical properties of the n-grams.

class NgramModel(AbstractNgramModel):
  """N-gram model: per-order counts plus a sparse matrix of count ratios.

  After ``make_model()``:
    * ``_ngrams[k]`` is a Counter of the (k+1)-grams of the training words;
    * ``_rows`` / ``_rows_id`` enumerate the (n-1)-grams, ``_cols`` /
      ``_cols_id`` the single tokens;
    * ``_matrix[row, col]`` is count(n-gram) / count(its (n-1)-token prefix).
  """

  def __init__(self, n, words):
    AbstractNgramModel.__init__(self, n, words)

  def make_model(self):
    """Count the n-grams and fill the matrix and the row/column indices."""
    counters = [Counter() for _ in range(self._n)]
    self._ngrams = counters

    self._ngrams = self.__generate_ngrams_list(counters, self._n)

    self._rows = list(self._ngrams[-2])
    self._rows_id = dict(zip(self._rows, range(len(self._rows))))
    self._cols = list(self._ngrams[0])
    self._cols_id = dict(zip(self._cols, range(len(self._cols))))

    self._matrix = self.__fill_matrix(self._n)

  def __generate_ngrams_list(self, ngrams, n):
    """Update ``ngrams[k]`` with the (k+1)-grams of every word; return ``ngrams``."""
    for word in self._words:
      # Order 1: the tokens of the word themselves.
      ngrams[0].update(word)
      for order in range(2, n + 1):
        ngrams[order - 1].update(self._make_ngrams(word, n=order))
    return ngrams

  def __fill_matrix(self, n):
    """Return a ``Matrix`` holding the ratio count(n-gram) / count(prefix)."""
    shape = (len(self._ngrams[-2]), len(self._ngrams[0]))
    all_matrix = lil_matrix(shape)

    for ngram in self._ngrams[-1]:
      phrase = ngram.split()
      prefix = ' '.join(phrase[:n-1])

      # The ratio is computed before the index lookups, as in a plain
      # ``matrix[i, j] = a / b`` assignment.
      ratio = self._ngrams[n-1][ngram] / self._ngrams[n-2][prefix]
      all_matrix[self._rows_id[prefix], self._cols_id[phrase[-1]]] = ratio

    return Matrix(all_matrix)

In [None]:
class NgramWordGenerator(WordGenerator):
  """Generates words token by token from the matrix of an n-gram model.

  NOTE(review): generation starts from a context of '<start>' tokens and
  stops on '<end>', so the model is presumably built from token lists padded
  with these markers, and ``make_model()`` must already have been called on
  it - confirm against the code that prepares the model.
  """
  _ngrams = AbstractNgramModel

  @staticmethod
  def _check_model(model):
    """Raise ValueError unless ``model`` is an AbstractNgramModel (or subclass)."""
    # isinstance instead of an exact type comparison: the exact comparison
    # rejected every concrete model, since those are always subclasses.
    if not isinstance(model, AbstractNgramModel):
      raise ValueError(
          f'AbstractNgramModel expected instead of {type(model).__name__}.')

  @property
  def get_ngrams(self):
    """The n-gram model used for generation."""
    return self._ngrams

  # The setter function is named ``set_ngrams``, so the settable property is
  # exposed under that name while ``get_ngrams`` itself stays read-only.
  @get_ngrams.setter
  def set_ngrams(self, value):
    self._check_model(value)
    self._ngrams = value

  def __init__(self, words, ngrams):
    """Store the source words and the model.

    Raises:
      SpecificTypeError: (from the base initializer) if ``words`` is not a list.
      ValueError: if ``ngrams`` is not an AbstractNgramModel instance.
    """
    WordGenerator.__init__(self, words)
    self._check_model(ngrams)
    self._ngrams = ngrams

  def generate_word(self):
    """Sample tokens until '<end>' is drawn and return them joined together.

    The '<end>' marker itself is not included in the returned string.

    Raises:
      ValueError: if the model has no '<end>' column, if the current context
        is not a row of the model, or if that row has no positive weight.
    """
    model = self._ngrams
    # The model's accessors are properties, so they are read without calling.
    context_len = len(model.get_ngrams) - 1
    cols = model.get_cols
    rows_id = model.get_rows_id
    matrix = model.get_matrix

    if '<end>' not in cols:
      # Without an end marker the sampling loop could never terminate.
      raise ValueError("The model has no '<end>' token.")

    context = ['<start>'] * context_len
    word = []
    while True:
      # Rows are keyed by the space-joined context, not by the token list.
      key = ' '.join(context)
      if key not in rows_id:
        raise ValueError(f'Context {key!r} is not present in the model.')

      # Matrix.__getitem__ already returns a dense (1, n_cols) array.
      weights = np.asarray(matrix[rows_id[key]], dtype=float).ravel()
      total = weights.sum()
      if total <= 0:
        raise ValueError(f'Context {key!r} has no continuation in the model.')

      # Normalise so that rounding cannot make np.random.choice reject ``p``.
      chosen = np.random.choice(len(weights), p=weights / total)
      token = cols[chosen]
      if token == '<end>':
        return ''.join(word)

      word.append(token)
      if context_len:
        # Slide the context window by one token.
        context = context[1:] + [token]

In [None]:
# TODO: verify that perplexity is computed correctly.

class PerplexityMetric:
  """Per-word perplexity under n-gram counts.

  ``ngrams`` is a list of count mappings, one per order: ``ngrams[k]`` maps a
  space-joined (k+1)-gram to its count. The model order ``n`` is the length
  of that list.
  """
  __words = list
  __ngrams = list
  __n = int

  def __init__(self, words, ngrams, unseen_proba=0.00001):
    """Store the words to score and the n-gram counts.

    Args:
      words: words to score (each a sequence of tokens, e.g. a string).
      ngrams: list of count mappings, one per n-gram order.
      unseen_proba: probability assigned to an n-gram absent from the counts.
    """
    self.__words = words
    self.__ngrams = ngrams
    self.__n = len(self.__ngrams)
    self.__unseen_log_proba = np.log(unseen_proba)

  def __perplexity(self, probas):
    """Perplexity of a sequence given its log-probabilities ``probas``.

    Computed as exp(-mean(log p)). This equals exp(sum(log p)) ** (-1 / N),
    but never forms the product itself, which underflows to 0 for long
    sequences and would turn the result into ``inf``.
    """
    return np.exp(-np.sum(probas) / len(probas))

  def compute_perplexity(self):
    """Return the list of perplexities, one per word, in input order.

    A word with fewer than ``n`` tokens has no n-grams; it receives the mean
    of the perplexities computed so far together with the value 1.
    """
    perplexities = []
    last_counts = self.__ngrams[-1] if self.__n else None
    # For the unigram case the probability is the relative frequency, so the
    # total token count is needed (dividing a count by itself always gave 1).
    unigram_total = sum(self.__ngrams[0].values()) if self.__n == 1 else None

    for word in self.__words:
      if len(word) <= self.__n - 1:
        perplexities.append(np.mean(perplexities + [1]))
        continue

      log_probas = []

      for ngram in Ngrammer.make_ngram(word, n=self.__n):
        if last_counts is None or ngram not in last_counts:
          log_probas.append(self.__unseen_log_proba)
          continue

        if self.__n == 1:
          proba = last_counts[ngram] / unigram_total
        else:
          # Conditional probability: count(n-gram) / count(its prefix).
          prefix = ' '.join(ngram.split()[:self.__n - 1])
          proba = last_counts[ngram] / self.__ngrams[self.__n - 2][prefix]
        log_probas.append(np.log(proba))

      perplexities.append(self.__perplexity(log_probas))

    return perplexities

In [None]:
# Download the Andic dictionaries table (andic_dicts.csv) from GitHub into the working directory.
!wget https://raw.githubusercontent.com/phon-dicts-project/comparative_andic_dictionary_database/master/andic_dicts.csv

--2022-03-15 15:52:09--  https://raw.githubusercontent.com/phon-dicts-project/comparative_andic_dictionary_database/master/andic_dicts.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 40145266 (38M) [text/plain]
Saving to: ‘andic_dicts.csv’


2022-03-15 15:52:11 (199 MB/s) - ‘andic_dicts.csv’ saved [40145266/40145266]



In [None]:
# Download the borrowings-annotation workbook (.xlsx) from GitHub into the working directory.
!wget https://github.com/DashaChis/comparative_andic_dictionary_database/raw/master/andic_dicts_version_for_borrowings_annotation.xlsx

--2022-03-15 15:52:12--  https://github.com/DashaChis/comparative_andic_dictionary_database/raw/master/andic_dicts_version_for_borrowings_annotation.xlsx
Resolving github.com (github.com)... 192.30.255.113
Connecting to github.com (github.com)|192.30.255.113|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/DashaChis/comparative_andic_dictionary_database/master/andic_dicts_version_for_borrowings_annotation.xlsx [following]
--2022-03-15 15:52:12--  https://raw.githubusercontent.com/DashaChis/comparative_andic_dictionary_database/master/andic_dicts_version_for_borrowings_annotation.xlsx
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8577758 (8.2M) [application/octet-stream]
Saving to: ‘andic_dicts

In [None]:
# Load the two files fetched by the wget cells above from the working directory:
# df1 - the dictionary table (CSV), df2 - the borrowings-annotation workbook (Excel).
df1 = pd.read_csv('andic_dicts.csv')
df2 = pd.read_excel('andic_dicts_version_for_borrowings_annotation.xlsx')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
# Left join on "id": every row of df1 is kept, df2 columns are attached where an id matches.
df = df1.merge(df2, how="left", on=["id"])

In [None]:
# Keep only the rows whose glottocode differs from 'toki1238'.
df = df.loc[df['glottocode'].ne('toki1238')]

In [None]:
# Replace every space inside a lemma with '#' (plain literal replacement, no regex).
preprocessed_lemmas = df['lemma'].str.replace(' ', '#', regex=False)

In [None]:
# TODO: verify that the replacer works correctly.

class Preprocessor:
  """Holds a list of words and applies chainable clean-up steps to it."""
  __words = list

  @property
  def get_words(self):
    """The current list of words (read without parentheses)."""
    return self.__words

  def __init__(self, words):
    self.__words = words

  def replace_spaces(self, replacer='#'):
    """Replace each space in every word with ``replacer``; return ``self`` for chaining."""
    self.__words = [word.replace(' ', replacer) for word in self.__words]
    return self

In [None]:
# TODO: move this into a preprocessing class.
# Each lemma becomes the list of its lower-cased characters.
words = pd.Series([list(lemma.lower()) for lemma in preprocessed_lemmas])

In [None]:
class DfPerplexityCreator:
    """Computes word perplexities for the rows of a single language."""
    __df = pd.DataFrame

    def __init__(self, origin_df, lang_column, language):
        """Keep the rows of ``origin_df`` whose ``lang_column`` equals ``language``."""
        language_mask = origin_df[lang_column] == language
        self.__df = origin_df[language_mask]

    def get_perplexity_column(self, n, words_column, target_column, condition):
        """Return perplexities of a subset of words, indexed like the source rows.

        The n-gram model is built from all words of the language; spaces are
        replaced with '#' and the words are lower-cased first.

        Args:
          n: order of the n-gram model.
          words_column: column holding the words.
          target_column: column whose null / non-null state selects the rows.
          condition: '~' scores the rows where ``target_column`` is not null;
            any other value scores the rows where it is null.
        """
        frame = self.__df
        target_missing = frame[target_column].isnull()
        row_mask = ~target_missing if condition == '~' else target_missing
        words_serie = frame[row_mask][words_column].str.replace(' ', '#').str.lower()

        training_words = frame[words_column].str.replace(' ', '#').str.lower()
        model = NgramModelCreator(n, list(training_words)).get_model
        model.make_model()

        metric = PerplexityMetric(list(words_serie), model.get_ngrams)
        return pd.Series(metric.compute_perplexity(), index=words_serie.index)

In [None]:
# Work on an explicit copy: adding columns to a slice of `df` triggers
# SettingWithCopyWarning and may not write where expected.
perps_df = df[['glottocode', 'lemma', 'bor_y']].copy()
for i in range(2, 5):
  # Collect the per-language results and concatenate once; Series.append is
  # deprecated and no longer exists in pandas 2.x.
  parts = []
  for language in perps_df['glottocode'].unique():
    df_creator = DfPerplexityCreator(perps_df, 'glottocode', language)
    # '~' scores the rows with a non-null 'bor_y', '' the rows where it is null.
    parts.append(df_creator.get_perplexity_column(i, 'lemma', 'bor_y', '~'))
    parts.append(df_creator.get_perplexity_column(i, 'lemma', 'bor_y', ''))
  # pd.concat rejects an empty list, so fall back to an empty float Series.
  series = pd.concat(parts) if parts else pd.Series(dtype=float)
  # Assignment aligns on the index, so every value lands on its source row.
  perps_df[f'{i}_gram'] = series

  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
# A word counts as a borrowing when its 'bor_y' annotation is present.
# `assign` returns a new frame, so nothing is written into a slice of another
# DataFrame (the in-place column assignment raised SettingWithCopyWarning).
perps_df = (
    perps_df
    .assign(is_borrowing=perps_df['bor_y'].notnull())
    .drop(columns=['bor_y'])
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
# Show the resulting table (pandas abbreviates long frames in the rendered output).
perps_df

Unnamed: 0,glottocode,lemma,2_gram,3_gram,4_gram,is_borrowing
0,akhv1239,аба'далIи,9.760974,5.032016,4.194379,True
1,akhv1239,а/б/а'жве,11.465609,5.569418,12.236584,False
2,akhv1239,а/б/ажу'рулъIа,5.373334,3.370534,3.141429,False
3,akhv1239,а/б/ажу'рулъIа,5.373334,3.370534,3.141429,False
4,akhv1239,а/б/ажу'рулъIа,5.373334,3.370534,3.141429,False
...,...,...,...,...,...,...
89831,tind1238,Э́ЛĀЙЛЪ’А,5.559140,3.364785,1.332971,False
89832,tind1238,Э̄́ЛЪ’А¹,9.817690,2.504750,2.272958,False
89833,tind1238,Э̄́ЛЪ’А¹,9.817690,2.504750,2.272958,False
89834,tind1238,Э̄̀ЛЪ’А²,10.259337,2.838383,2.342888,False
