## Preparing for pre-processing

### Installing and importing required modules

In [None]:
%pip install nltk pymystem3 matplotlib

In [25]:
import sqlite3
import re
import nltk
import datetime
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from pymystem3 import Mystem

In [None]:
# Fetch the NLTK resources used later in this notebook: the 'punkt'
# tokenizer data and the stop-word lists.
for resource_name in ('punkt', 'stopwords'):
    nltk.download(resource_name)

### Connecting to the database and getting posts

In [27]:
# Load every row of the `posts` and `reactions` tables into memory.
# NOTE(review): the database path is an absolute, machine-specific location.
conn = sqlite3.connect('/home/sh1ron/HSE/MLOps_pipeline/database.db')
try:
    cursor = conn.cursor()

    # Rows come back as plain tuples; later cells index them by position,
    # so they depend on the column order that SELECT * yields.
    cursor.execute("SELECT * FROM posts")
    posts = cursor.fetchall()

    cursor.execute("SELECT * FROM reactions")
    reactions = cursor.fetchall()
finally:
    # Close the connection even if one of the queries raises.
    conn.close()

## Data preprocessing

In [None]:
# Build one list of lemmatized tokens per post, in the same order as `posts`.

# Russian stop words from NLTK; tokens found in this set are dropped
# before lemmatization.
stop_words = set(stopwords.words('russian'))

m = Mystem()

# Compiled once outside the loop. Raw strings keep "\s" from being read as
# an (invalid) string escape sequence.
non_cyrillic_re = re.compile(r'[^а-яА-ЯёЁ\s]')
whitespace_re = re.compile(r'\s+')

lemmatized_posts = []
for post in posts:
    # post[2] is used as the post text (position comes from SELECT *).
    # A missing text is treated as empty so the result stays index-aligned
    # with `posts` instead of failing inside the regex substitution.
    raw_text = post[2] or ''

    # Keep only Cyrillic letters and whitespace, collapse whitespace runs,
    # then lower-case.
    text = non_cyrillic_re.sub('', raw_text)
    text = whitespace_re.sub(' ', text)
    text = text.lower()

    tokens = word_tokenize(text, language='russian')

    lemmatized_tokens = []
    for word in tokens:
        if word in stop_words:
            continue
        # Lemmatize each token once; fall back to the token itself when
        # the lemmatizer returns nothing.
        lemmas = m.lemmatize(word)
        lemmatized_tokens.append(lemmas[0] if lemmas else word)
    lemmatized_posts.append(lemmatized_tokens)

## Post analysis

### Finding the most popular words

In [None]:
# Concatenate the per-post token lists into a single list of tokens.
flat_list = []
for post_tokens in lemmatized_posts:
    flat_list.extend(post_tokens)

# Count token frequencies and keep the five most frequent ones.
word_counts = Counter(flat_list)
top_words = word_counts.most_common(5)

print(top_words)

### The ten most popular reaction emojis

In [None]:
# Sum the reaction counts per emoji across all reaction rows.
total_counts = defaultdict(int)

for row in reactions:
    # Rows whose emoji field is None are attributed to the heart emoji.
    emoji = '❤' if row[2] is None else row[2]
    total_counts[emoji] += row[3]

# Order the (emoji, total) pairs by total, highest first, and keep ten.
sorted_emojis = list(total_counts.items())
sorted_emojis.sort(key=lambda pair: pair[1], reverse=True)
top_emojis = sorted_emojis[:10]

print(top_emojis)

### The ten most popular posts for each of the most popular reactions

In [None]:
# Per-post reaction counts: post_reactions[post_id][emoji] -> count.
# Reaction rows are read by position (1: post id, 2: emoji, 3: count).
post_reactions = {}
for reaction in reactions:
    post_id = reaction[1]
    emoji = reaction[2]
    count = reaction[3]
    # Rows without an emoji are counted under '❤' when the overall emoji
    # totals are computed; apply the same mapping here, otherwise those
    # posts can never be matched against the '❤' entry of top_emojis.
    if emoji is None:
        emoji = '❤'
    per_post = post_reactions.setdefault(post_id, {})
    # Accumulate instead of overwriting so that several rows for the same
    # (post, emoji) pair are summed, as they are in the overall totals.
    per_post[emoji] = per_post.get(emoji, 0) + count

# For each of the top emojis, print the ten posts that received it most.
for emoji, _total in top_emojis:
    filtered_posts = []
    for post in posts:
        # post[0] is used as the post id to look up its reactions.
        emoji_count = post_reactions.get(post[0], {}).get(emoji)
        if emoji_count is not None:
            filtered_posts.append((post, emoji_count))

    sorted_posts = sorted(filtered_posts, key=lambda x: x[1], reverse=True)

    top_10_posts = sorted_posts[:10]

    print(f"Top 10 posts for reaction '{emoji}':")
    for post, reaction_count in top_10_posts:
        print(f"Post ID: {post[0]}, Reaction Count: {reaction_count}")

### Date grouping

In [62]:
# Parse each post's timestamp string (post[4]) into a datetime object.
dates = []
for post in posts:
    date_str = post[4]
    # `datetime` is imported as a module, so strptime has to be reached
    # through the datetime.datetime class; calling it on the module raises
    # AttributeError.
    date = datetime.datetime.strptime(date_str, '%Y-%m-%d %H:%M:%S')
    dates.append(date)

### Activity graph by month

In [None]:
# Number of posts per calendar month, keyed by a "YYYY-MM" style label.
month_year_counts = defaultdict(int)

# Build the label for every post date first, then tally the labels.
month_labels = [f"{date.year}-{date.month:02}" for date in dates]
for month_label in month_labels:
    month_year_counts[month_label] += 1

In [None]:
# Bar chart of the number of posts per month.
plt.figure(figsize=(10, 6))

# String x values are drawn as categories in the order they are passed.
# The dict's insertion order follows the order of the post rows, which is
# not guaranteed to be chronological, so sort the labels first.
# Assumes the keys are zero-padded "YYYY-MM" strings, for which
# lexicographic order equals chronological order.
months = sorted(month_year_counts)
plt.bar(months, [month_year_counts[month] for month in months])
plt.title('Activity per Month')

plt.xlabel('Month')
plt.ylabel('Number of Posts')

# Rotate the month labels so they do not overlap.
plt.xticks(rotation=90)
plt.show()

### Activity graph by week

In [63]:
# Number of posts per ISO week number, restricted to February 2022.
week_counts = defaultdict(int)

for date in dates:
    # Skip everything outside February 2022.
    if (date.year, date.month) != (2022, 2):
        continue
    iso_week = date.isocalendar()[1]
    week_counts[iso_week] += 1

In [None]:
# Bar chart of the number of posts per week in February 2022,
# drawn through the Axes object instead of the pyplot state machine.
fig, ax = plt.subplots(figsize=(10, 6))

week_numbers = list(week_counts.keys())
posts_per_week = list(week_counts.values())
ax.bar(week_numbers, posts_per_week)

ax.set_title('Activity in February 2022')
ax.set_xlabel('Week')
ax.set_ylabel('Number of Posts')

plt.show()

### Activity graph by day

In [50]:
# Number of posts per day of the month, restricted to September 2022.
day_counts = defaultdict(int)

# Collect the day-of-month for every matching date, then tally them.
september_days = [
    date.day for date in dates
    if date.year == 2022 and date.month == 9
]
for day in september_days:
    day_counts[day] += 1

In [None]:
# Bar chart of the number of posts per day in September 2022.
days = list(day_counts.keys())
posts_per_day = list(day_counts.values())

plt.figure(figsize=(10, 6))
plt.bar(days, posts_per_day)

# Axis labels and title.
plt.xlabel('Day')
plt.ylabel('Number of Posts')
plt.title('Activity in September 2022')

plt.show()