### Setup libraries

In [9]:
# A dependency of the preprocessing for BERT inputs
# !pip install tf-nightly
# !pip install -q -U tensorflow-text-nightly

In [2]:
# !pip install -q tf-models-official
# !pip install tf-models-official

In [3]:
# !git clone https://github.com/tensorflow/models.git

### Import libraries

In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:72.5% !important; }</style>"))

In [2]:
import os
import shutil
import re

import numpy as np
import pandas as pd

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

import string
from pymorphy2 import MorphAnalyzer
from stop_words import get_stop_words
from tqdm.notebook import tqdm
# from official.nlp import optimization  # to create AdamW optimizer

import matplotlib.pyplot as plt

In [3]:
plt.rcParams.update({'font.size': 14})
pd.set_option('precision', 3)
pd.set_option('max_columns', 100)
pd.set_option('display.float_format', lambda x: '%.5f' % x)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
pd.set_option('max_colwidth', 300)

### Paths to directories and files

In [3]:
DRAFT_DATASET_PATH = 'E:/kaggle/mailru/Otvety.txt'
TRAIN_DATASET_PATH = 'answers.txt'

### Loading data

In [4]:
# Small preprocess of the answers

question = None
written = False

with open(TRAIN_DATASET_PATH, 'w', encoding='utf-8') as fout:
    with open(DRAFT_DATASET_PATH, 'r', encoding='utf-8') as fin:
        for line in tqdm(fin):
            if line.startswith('---'):
                written = False
                continue
            if not written and question is not None:
                fout.write(question.replace('\t', ' ').strip() + '\t' + line.replace('\t', ' '))
                written = True
                question = None
                continue
            if not written:
                question = line.strip()
                continue

0it [00:00, ?it/s]

In [11]:
text = []

with open(TRAIN_DATASET_PATH, 'r', encoding='utf-8') as fin:
    for line in tqdm(fin):
        text.append(line)        

0it [00:00, ?it/s]

In [30]:
from transformers import AutoModel, BertTokenizerFast

In [31]:
# import BERT-base pretrained model
bert = AutoModel.from_pretrained('bert-base-uncased')

# Load the BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

In [None]:
# encode text
sent_id = tokenizer.batch_encode_plus(text, padding=True)

# output
print(sent_id)

In [None]:
# get length of all the messages in the train set
seq_len = [len(i.split()) for i in train_text]

pd.Series(seq_len).hist(bins = 30)

In [26]:
# def preprocess_text(text):
#     text = re.sub('\n', '', text)
#     text = re.sub('---', '', text)
#     text = re.sub(':\)', '', text)
#     text = re.sub('\/+', '', text)
#     text = re.sub('[();"]', '', text)
#     text = re.sub('<[^>]+>', ' ', text)
    
#     text = re.sub('\s*\?\s*\.', '?', text)
#     text = re.sub('\s*\!\s*\.', '!', text)
#     text = re.sub('\s*\.', '.', text)
#     text = re.sub('\.+', '.', text)
    
#     return text

In [27]:
# corpus = preprocess_text(text)

In [28]:
# print(corpus[:100])

In [8]:
def preprocess_txt(line):
    spls = "".join(i for i in line.strip() if i not in exclude).split()
    spls = [morpher.parse(i.lower())[0].normal_form for i in spls]
    spls = [i for i in spls if i not in sw and i != ""]
    return spls

In [9]:
# Preprocess for models fitting

sentences = []

morpher = MorphAnalyzer()
sw = set(get_stop_words('ru'))
exclude = set(string.punctuation)
c = 0

with open(TRAIN_DATASET_PATH, 'r', encoding='utf-8') as fin:
    for line in tqdm(fin):
        spls = preprocess_txt(line)
        sentences.append(spls)
        c += 1
        if c > 5000:
            break

0it [00:00, ?it/s]

In [10]:
sentences

[['вопрос', 'тдв', 'отдыхать', 'лично', 'советовать', 'завести'],
 ['парень',
  'относиться',
  'цветной',
  'линза',
  'девушка',
  'зелёный',
  'глаз',
  'голубой',
  'вобщий',
  'прикалывать',
  'тема'],
 ['делать',
  'найти',
  '2',
  'миллион',
  'рубль',
  'счастие',
  'свалиться',
  'хороший',
  'пойти',
  'милиция',
  'заявить',
  'находка',
  'деньга',
  'тероть',
  'самый',
  'интересный',
  'неприменный',
  'искать',
  'поверьте',
  'найти',
  'видеть',
  'подобный',
  'нарваться',
  'бабушка',
  'помочий',
  'внук',
  'покупка',
  'квартира',
  'бандит',
  'разговаривать',
  'иначе',
  'бабушка',
  'милиция',
  'выбор',
  'шанс',
  'подарок',
  'выше',
  'котрый',
  'никто',
  'спросить',
  'хороший',
  'отдать',
  'хотяб',
  '500',
  'благотворительность',
  'дабы',
  'спугнуть',
  'удача'],
 ['эбу',
  'двенашка',
  'называться',
  'итэлма',
  'эбу',
  'эбу',
  '—',
  'электронный',
  'блок',
  'управление',
  'двигатель',
  'автомобиль',
  'название',
  '—',
  'контроллер

In [8]:
blank = '---'

In [9]:
def detect(tokens):
    return [t for t in tokens if t in valid_forms]
    
def replace_blank(tokens):
    return [blank if t in valid_forms else t for t in tokens]

def create_windows(tokens, window_size=3):
    X = []
    for i, word in enumerate(tokens):
        if word == blank:
            window = tokens[i-window_size:i] + tokens[i+1:i+window_size+1]
            window = ' '.join(window)
            X.append(window)    
    return X

In [11]:
from nltk.tokenize import wordpunct_tokenize

file = open(TRAIN_DATASET_PATH, 'r', errors='ignore')
text = file.read()
text = re.sub('[\n]', '', text)
tokens = wordpunct_tokenize(text)
y = detect(tokens)
tokens = replace_blank(tokens)
X = create_windows(tokens)
file.close()

MemoryError: 

In [None]:
df = pd.DataFrame()
df["Text"] = X
df["Label"] = y

In [None]:
df.head()

TypeError: 'module' object is not subscriptable

In [33]:
train_df.shape

(51660, 2)

https://www.analyticsvidhya.com/blog/2020/07/transfer-learning-for-nlp-fine-tuning-bert-for-text-classification/

https://stackabuse.com/text-classification-with-bert-tokenizer-and-tf-2-0-in-python/

https://www.kaggle.com/nielspace/text-classification-using-bert