In [1]:
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import WhitespaceTokenizer

In [2]:
# Load the labeled comments dataset.
df = pd.read_csv(filepath_or_buffer='data/labeled.csv')

In [3]:
def clean(text):
    """
    Strip URLs, digits and special symbols from a Series of comments.

    Parameters
    ----------
    text : pandas.Series
        Comments to clean; processed through the ``.str`` accessor.

    Returns
    -------
    pandas.Series
        Lower-cased comments where every ``http...`` token is removed and
        each run of digits, each character that is neither a word character
        nor whitespace, and each newline is replaced by one space.
        Consecutive spaces are not collapsed.
    """
    # The patterns are raw strings: '\S', '\d' and '\w' are not valid Python
    # string escapes, so plain literals trigger an invalid-escape warning.
    text = text.str.lower()
    text = text.str.replace(r'http\S+|http.\S+', '', case=False, regex=True)
    text = text.str.replace(r'\d+', ' ', regex=True)
    text = text.str.replace(r'[^\w\s]', ' ', regex=True)
    text = text.str.replace(r'[\n]', ' ', regex=True)
    return text

# Shared helpers used by prepare_text: a Russian Snowball stemmer and a
# tokenizer that splits on whitespace.
stemming = SnowballStemmer(language="russian")
tokenizing = WhitespaceTokenizer()

def prepare_text(text):
    """
    Tokenize every comment, drop stop words and stem the remaining tokens.

    Parameters
    ----------
    text : iterable of str
        Comments, one string per item.

    Returns
    -------
    list of str
        One string per input comment: the stems of its tokens that are not
        Russian stop words, joined with single spaces. A comment with no
        remaining tokens yields an empty string.
    """
    # A set gives constant-time membership checks; the corpus returns a list
    # that would otherwise be scanned linearly for every token.
    sw = set(stopwords.words('russian'))
    # Stop words are matched against the raw token, before stemming.
    return [
        ' '.join(
            stemming.stem(word)
            for word in tokenizing.tokenize(raw)
            if word not in sw
        )
        for raw in text
    ]

In [4]:
# Replace the raw comments with their cleaned, stemmed versions.
df['comment'] = prepare_text(clean(df['comment']))

In [5]:
# Drop rows whose comment is an empty string.
df = df.loc[df['comment'].ne('')]

In [6]:
# Save the cleaned data without the index column.
df.to_csv(path_or_buf='data/labeled_clean.csv', index=False)