In [None]:
import pickle
from IPython.display import display, clear_output

def load_pkl(path):
  """Return the object stored in the pickle file at `path`.

  The file is opened in binary mode and closed before returning.
  Unpickling can execute arbitrary code, so only point this at files
  that come from a trusted source.
  """
  with open(path, mode='rb') as handle:
    obj = pickle.load(handle)
  return obj
  
def dump_pkl(path, data):
  """Pickle `data` and write it to the file at `path`.

  The file is opened in binary write mode, so an existing file at
  `path` is overwritten. Returns None.
  """
  serialized = pickle.dumps(data)
  with open(path, mode='wb') as handle:
    handle.write(serialized)

def display_iteration(func):
  """Decorate `func` so every call first shows a running call counter.

  Before delegating to `func`, the wrapper clears the current cell output
  and displays ``"Iteration --> N"``. N starts at 1 and grows by one per
  call. The counter lives in the closure: each decorated function gets its
  own counter and it is never reset, so every call made through the
  wrapper is counted.

  Args:
    func: Callable to wrap.

  Returns:
    A wrapper that forwards all positional and keyword arguments to `func`
    and returns its result unchanged.
  """
  import functools  # function-scope import keeps this block self-contained

  iteration = 1

  @functools.wraps(func)  # keep func's __name__ / __doc__ on the wrapper
  def wrapper(*args, **kwargs):
    nonlocal iteration
    # wait=True is passed so the previous counter is replaced rather than
    # leaving a blank output between updates.
    clear_output(wait=True)
    display(f"Iteration --> {iteration}")
    iteration += 1
    return func(*args, **kwargs)

  return wrapper

In [None]:
import json
from typing import Dict, List

# Load the parsed reviews. Per the annotation, the file is expected to map a
# label to a list of review texts — TODO confirm against the parser output.
# The encoding is pinned to UTF-8: without it open() falls back to the
# locale's default, which can mangle or reject non-ASCII text on some platforms.
with open('../reviews_parser/generated/sorted_by_2000.json', 'r', encoding='utf-8') as f:
  reviews: Dict[str, List[str]] = json.load(f)
len(reviews['good'])

In [None]:
import pandas as pd

# Flatten the {label: [texts]} mapping into one record per review text.
tagged_reivews: List[Dict[str, str]] = [
  {'label': label, 'text': text}
  for label, texts in reviews.items()
  for text in texts
]

# Shuffle all rows with a fixed seed so the order is reproducible, then
# renumber the index from zero.
df_tagged_reivews = (
  pd.DataFrame(tagged_reivews)
  .sample(frac=1, random_state=10)
  .reset_index(drop=True)
)
df_tagged_reivews.head()

In [None]:
import spacy

# Load the `ru_core_news_sm` spaCy pipeline once at cell level; process_text
# below reads this module-level `nlp` object on every call.
nlp = spacy.load('ru_core_news_sm')

@display_iteration
def process_text(text: str) -> str:
  """Lowercase `text`, run it through `nlp` and return its lemmas.

  Tokens flagged as punctuation or whitespace are dropped; the lemmas of
  the remaining tokens are joined with single spaces.
  """
  lemmas = []
  for token in nlp(text.lower()):
    if token.is_punct or token.is_space:
      continue
    lemmas.append(token.lemma_)
  return ' '.join(lemmas)

# Quick check on a single sentence. process_text is wrapped by
# @display_iteration, so this call also advances its iteration counter.
process_text('Говорят, что для людей думающих - жизнь комедия')

In [None]:
# Build a new frame whose 'text' column holds the processed version of each
# review; df_tagged_reivews itself is left unchanged.
df_processed = df_tagged_reivews.assign(
  text=lambda frame: frame['text'].apply(process_text)
)
df_processed.head()

In [None]:
# Persist the processed frame; its row count is embedded in the file name.
# dump_pkl opens the path directly for writing, so the relative `generated/`
# directory must already exist.
dump_pkl(f"generated/df_processed_reviews_{df_processed.shape[0]}.pkl", df_processed)