## Preparing for preprocessing

### Installing and importing required modules

In [None]:
# Install the third-party packages imported by this notebook (nltk, pymystem3).
# %pip is used instead of !pip so the install targets the running kernel's environment.
# NOTE(review): versions are not pinned — consider pinning for reproducibility.
%pip install nltk
%pip install pymystem3

In [22]:
import sqlite3
import re
import nltk
from collections import Counter, defaultdict
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from pymystem3 import Mystem

In [None]:
# Download the NLTK data used by the preprocessing cell:
#   - 'punkt' / 'punkt_tab': tokenizer models behind word_tokenize. Recent NLTK
#     releases load 'punkt_tab' instead of the pickled 'punkt' models, so both
#     are fetched to keep the notebook working across NLTK versions.
#   - 'stopwords': stop-word lists, including the Russian one used below.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

### Connecting to the database and getting posts

In [19]:
# Location of the SQLite database. This absolute path is machine-specific;
# change it here when running the notebook in another environment.
DB_PATH = '/home/sh1ron/HSE/MLOps_pipeline/database.db'

# Read every row of the `posts` and `reactions` tables into memory as lists
# of tuples. Later cells index these rows positionally (post[2], reaction[2],
# reaction[3]), so the column order returned by SELECT * matters.
conn = sqlite3.connect(DB_PATH)
try:
    cursor = conn.cursor()

    cursor.execute("SELECT * FROM posts")
    posts = cursor.fetchall()

    cursor.execute("SELECT * FROM reactions")
    reactions = cursor.fetchall()
finally:
    # Release the connection even if one of the queries fails.
    conn.close()

## Data preprocessing

In [12]:
# Build `lemmatized_posts`: one list of lemmatized, stop-word-free tokens per
# row of `posts`, in the same order as `posts`.
stop_words = set(stopwords.words('russian'))

m = Mystem()

# Patterns are raw strings (so `\s` is not an invalid string escape) and are
# compiled once instead of on every loop iteration.
NON_CYRILLIC_RE = re.compile(r'[^а-яА-ЯёЁ\s]')  # anything but Cyrillic letters and whitespace
WHITESPACE_RE = re.compile(r'\s+')

lemmatized_posts = []
for post in posts:
    # post[2] is used as the post text; a NULL value is treated as an empty
    # text so the post still gets an (empty) entry instead of raising TypeError.
    raw_text = post[2] or ''

    text = NON_CYRILLIC_RE.sub('', raw_text)
    text = WHITESPACE_RE.sub(' ', text)
    text = text.lower()

    tokens = word_tokenize(text, language='russian')

    lemmatized_tokens = []
    for word in tokens:
        # Stop words are filtered on the surface form, before lemmatization.
        if word in stop_words:
            continue
        # Lemmatize once per token; keep the first returned item, or the
        # original token if nothing is returned.
        lemmas = m.lemmatize(word)
        lemmatized_tokens.append(lemmas[0] if lemmas else word)

    lemmatized_posts.append(lemmatized_tokens)

## Post analysis

### Finding the most popular words

In [None]:
# Concatenate the per-post token lists into a single list of tokens.
flat_list = []
for post_tokens in lemmatized_posts:
    flat_list.extend(post_tokens)

# Count every token and keep the five most frequent (word, count) pairs.
word_counts = Counter(flat_list)
top_words = word_counts.most_common(5)

print(top_words)

### The ten most popular reaction emojis

In [None]:
# Sum the value in column 3 of every reaction row, grouped by the emoji in
# column 2. Rows whose emoji is NULL are counted under '❤'.
total_counts = defaultdict(int)

for row in reactions:
    key = row[2] if row[2] is not None else '❤'
    total_counts[key] += row[3]

# Order the (emoji, total) pairs by total, largest first, and keep ten.
sorted_emojis = sorted(
    total_counts.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
top_emojis = sorted_emojis[:10]

print(top_emojis)