Czy istnieją grupy postów, które są częściej komentowane, polecane, cytowane? Czym te grupy wpisów różnią się od innych?

In [1]:
import dask_mongo
import dask.dataframe as dd
import matplotlib.pyplot as plt
from stop_words import get_stop_words
import re
from wordcloud import WordCloud
from dask.diagnostics import ProgressBar, Profiler
import pyarrow as pa
from textblob import TextBlob
from datetime import datetime
import plotly.graph_objs as go

In [52]:
%%time

b = dask_mongo.read_mongo(
    database='ed23db',
    collection='tweets',
    chunksize=200_000,
    connection_kwargs={
        'host': 'localhost',
        'port': 27017
    }
)

CPU times: user 322 ms, sys: 65.7 ms, total: 388 ms
Wall time: 16min 53s


In [53]:
df = b.to_dataframe(meta={
    # '_id': 'str',
    'userid': 'str',
    'tweetid': 'str',
     'tweetcreatedts': 'str',
     'retweetcount': 'int32',
     'text': 'str',
     'hashtags': 'str',
     'language': 'str',
     'coordinates': 'str',
     'favorite_count': 'int32',
     'is_retweet': 'bool',
     'original_tweet_id': 'str',
     'original_tweet_userid': 'str',
     'original_tweet_username': 'str',
     'in_reply_to_status_id': 'str',
     'in_reply_to_user_id': 'str',
     'in_reply_to_screen_name': 'str',
     'is_quote_status': 'bool',
     'quoted_status_id': 'str',
     'quoted_status_userid': 'str',
     'quoted_status_username': 'str',
     'extractedts': 'str',
})

In [12]:
%%time

with ProgressBar(dt=2.0), Profiler() as prof2:
    df.shape[0].compute(), df.shape[1]

[                                        ] | 0% Completed | 20.37 sus


AutoReconnect: localhost:27017: connection closed

In [None]:
prof2.visualize()

# Parquet

In [1]:
list(df.columns.values)

NameError: name 'df' is not defined

In [55]:
df.head(5)

Unnamed: 0,userid,tweetid,tweetcreatedts,retweetcount,text,hashtags,language,coordinates,favorite_count,is_retweet,...,original_tweet_userid,original_tweet_username,in_reply_to_status_id,in_reply_to_user_id,in_reply_to_screen_name,is_quote_status,quoted_status_id,quoted_status_userid,quoted_status_username,extractedts
0,1237027988287471618,1579622809680035841,2022-10-11 00:00:00,11,Después de profanar la Mezquita de Ibrahim en ...,"[{'text': 'palestina', 'indices': [142, 152]},...",es,,13,True,...,0,,0,0,,True,0,0,,2022-10-11 09:57:02.393489
1,1407899290790473732,1579622809847808000,2022-10-11 00:00:00,1,La voix de son maître 🇺🇸\nAlors que l'#Ukraine...,"[{'text': 'Ukraine', 'indices': [37, 45]}, {'t...",fr,,0,True,...,0,,0,0,,True,0,0,,2022-10-11 02:31:18.024129
2,6509832,1579622810560786432,2022-10-11 00:00:00,1,#WarInUkraine | Putin Confirms Russia Attacked...,"[{'text': 'WarInUkraine', 'indices': [0, 13]},...",en,,5,True,...,0,,0,0,,True,0,0,,2022-10-11 04:37:34.820908
3,1570222426570608641,1579622810619723776,2022-10-11 00:00:00,1,We made a set of heart-shaped pinatas with the...,"[{'text': 'WeStandWithUkraine', 'indices': [16...",en,,4,True,...,0,,0,0,,True,0,0,,2022-10-11 05:41:38.847525
4,20918680,1579622810774749184,2022-10-11 00:00:00,0,Have you listened to @AmandaMcBroom1 &amp; @Mi...,"[{'text': 'GodofWar', 'indices': [80, 89]}, {'...",en,,0,True,...,0,,0,0,,True,0,0,,2022-10-11 02:31:18.004654


In [56]:
df['tweetcreatedts'] = df['tweetcreatedts'].map(lambda dt: dt[:19])
df['tweetcreatedts'] = dd.to_datetime(df['tweetcreatedts'], format='%Y-%m-%d %H:%M:%S')

df['extractedts'] = df['extractedts'].map(lambda dt: dt[:19])
df['extractedts'] = dd.to_datetime(df['extractedts'], format='%Y-%m-%d %H:%M:%S')

In [None]:
df.dtypes

In [58]:
%%time

with ProgressBar(dt=2.0), Profiler() as prof:
    df.to_parquet(
        '../parquet/tweets.parquet',
        engine='pyarrow',
        compression=None,
        overwrite=True,
    )

CPU times: user 49min 44s, sys: 4min 7s, total: 53min 52s
Wall time: 49min 19s


# Start here

## Bakhmut

In [2]:
%%time
dfp = dd.read_parquet('../parquet/tweets.parquet', engine='pyarrow')

CPU times: user 51 ms, sys: 12.5 ms, total: 63.5 ms
Wall time: 650 ms


In [3]:
%%time

with ProgressBar(dt=2.0), Profiler() as prof3:
    print(dfp.shape[0].compute(), df.shape[1])

[                                        ] | 1% Completed | 4.11 s us


KeyboardInterrupt: 

In [13]:
%%time

with ProgressBar(dt=2.0), Profiler() as prof3:
    # Earliest and Latest tweet
    earliest_tweet = dfp["tweetcreatedts"].min().compute()
    latest_tweet = dfp["tweetcreatedts"].max().compute()

print(f"The earliest tweet was at {earliest_tweet}, and the latest was at {latest_tweet}")

[########################################] | 100% Completed | 10.02 s
[########################################] | 100% Completed | 4.02 ss
The earliest tweet was at 2023-01-01 00:00:00, and the latest was at 2023-03-01 00:00:00
CPU times: user 14.1 s, sys: 2.33 s, total: 16.5 s
Wall time: 14.1 s


In [4]:
def preprocessing(data, start_date: datetime, end_date: datetime, topic: str):
    data = data[(dfp['tweetcreatedts'] >= start_date) & (dfp['tweetcreatedts'] <= end_date)]

    # drop rows with missing values in the 'text' column
    data = data.dropna(subset=['text'])

    if topic == "Bakhmut":
        # Filter tweets containing the keyword "Bakhmut" with regex
        data = data[data['text'].str.contains(r'Bakhmut|Бахмут|Арт[её]м[іi]вськ|Artyomovsk|Artemivsk', na=False)]
    elif topic == "Mariupol":
        # Filter tweets containing the keyword "Mariupol" with regex
        data = data[data['text'].str.contains(r'Mariupol|Маріуполь|Мариуполь|Маріупол|Маріу́поль|Мариу́поль', na=False)]

    # Filter the tweets by language
    dfp_en = data[data['language'] == 'en']
    dfp_ru = data[data['language'] == 'ru']
    dfp_uk = data[data['language'] == 'uk']

    return (dfp_en, dfp_ru, dfp_uk)

In [5]:
start_date = datetime(2023, 2, 1)
end_date = datetime(2023, 3, 20)

dfp_en, dfp_ru, dfp_uk = preprocessing(dfp, start_date, end_date, topic="Bakhmut")

In [6]:
# Define a function to get the sentiment of a tweet
def get_sentiment(tweet):
    blob = TextBlob(tweet)
    return blob.sentiment.polarity

In [None]:
# Get the sentiment for each tweet in English
dfp_en['sentiment'] = dfp_en['text'].apply(get_sentiment)

# Get the sentiment for each tweet in Russian
dfp_ru['sentiment'] = dfp_ru['text'].apply(get_sentiment)

# Get the sentiment for each tweet in Ukrainian
dfp_uk['sentiment'] = dfp_uk['text'].apply(get_sentiment)

with ProgressBar(dt=2.0), Profiler() as prof4:
    # Group the tweets by date and get the mean sentiment for each date
    en_sentiment = dfp_en.groupby(dfp_en['tweetcreatedts'].dt.date)['sentiment'].mean().compute()
    ru_sentiment = dfp_ru.groupby(dfp_ru['tweetcreatedts'].dt.date)['sentiment'].mean().compute()
    uk_sentiment = dfp_uk.groupby(dfp_uk['tweetcreatedts'].dt.date)['sentiment'].mean().compute()

[                                        ] | 0% Completed | 169.63 us

You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map or apply function that you are using.
  Before: .apply(func)
  After:  .apply(func, meta=('text', 'float64'))

You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map or apply function that you are using.
  Before: .apply(func)
  After:  .apply(func, meta=('text', 'float64'))

You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, plea

[#                                       ] | 3% Completed | 28.61 s

In [None]:
en_sentiment.head(20)

In [None]:
def draw_sentiment(en_sentiment, ru_sentiment, uk_sentiment, title):
    # Sort the dataframe by datetime
    en_sentiment = en_sentiment.sort_index()
    ru_sentiment = ru_sentiment.sort_index()
    uk_sentiment = uk_sentiment.sort_index()

    # create traces for each language
    en_trace = go.Scatter(x=en_sentiment.index, y=en_sentiment.values, name='English')
    ru_trace = go.Scatter(x=ru_sentiment.index, y=ru_sentiment.values, name='Russian')
    uk_trace = go.Scatter(x=uk_sentiment.index, y=uk_sentiment.values, name='Ukrainian')

    # create the layout
    layout = go.Layout(
        title=title,
        xaxis=dict(title='Date', tickangle=45),
        yaxis=dict(title='Sentiment Score'),
        legend=dict(title='Language')
    )

    # create the figure and add traces and layout
    fig = go.Figure(data=[en_trace, ru_trace, uk_trace], layout=layout)

    # show the plot
    fig.show()

In [None]:
%%time
draw_sentiment(en_sentiment, ru_sentiment, uk_sentiment, 'Sentiment Analysis of Tweets - Bakhmut')

In [None]:
# Removing URLs, stopwords and single token
def preprocess_text(text):
    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Remove single token words
    tokens = text.split()
    tokens = [token for token in tokens if len(token) > 1]
    text = ' '.join(tokens)

    # Remove stop words
    russian_stop_words = get_stop_words('ru')
    english_stop_words = get_stop_words('en')
    ukrainian_stop_words = get_stop_words('uk')
    stop_words = russian_stop_words + english_stop_words + ukrainian_stop_words
    stop_words += ['Bakhmut', 'Бахмут', 'Артемівськ', 'Artyomovsk', 'Artemivsk']
    tokens = text.split()
    tokens = [token for token in tokens if token.lower() not in stop_words]
    text = ' '.join(tokens)

    return text

# Execute the function
dfp_en['text'] = dfp_en['text'].apply(preprocess_text, meta=('text', 'str'))
dfp_ru['text'] = dfp_ru['text'].apply(preprocess_text, meta=('text', 'str'))
dfp_uk['text'] = dfp_uk['text'].apply(preprocess_text, meta=('text', 'str'))

In [None]:
with ProgressBar(dt=2.0), Profiler() as prof7:
    # Group the tweets by date and concatenate the text of all tweets in each group
    en_text = ' '.join(dfp_en.groupby(dfp_en['tweetcreatedts'].dt.date)['text'].apply(' '.join))
    ru_text = ' '.join(dfp_ru.groupby(dfp_ru['tweetcreatedts'].dt.date)['text'].apply(' '.join))
    uk_text = ' '.join(dfp_uk.groupby(dfp_uk['tweetcreatedts'].dt.date)['text'].apply(' '.join))

with ProgressBar(dt=2.0), Profiler() as prof5:
    # Generate a wordcloud for each group of tweets
    en_cloud = WordCloud(background_color='white').generate(en_text)
    ru_cloud = WordCloud(background_color='white').generate(ru_text)
    uk_cloud = WordCloud(background_color='white').generate(uk_text)

# Plot the wordclouds
fig, ax = plt.subplots(1, 3, figsize=(20, 10))
ax[0].imshow(en_cloud)
ax[0].set_title('English')
ax[0].axis('off')
ax[1].imshow(ru_cloud)
ax[1].set_title('Russian')
ax[1].axis('off')
ax[2].imshow(uk_cloud)
ax[2].set_title('Ukrainian')
ax[2].axis('off')

with ProgressBar(dt=2.0), Profiler() as prof6:
    plt.show()

# Mariupol

In [None]:
%%time
dfp = dd.read_parquet('../parquet/tweets.parquet', engine='pyarrow')

In [None]:
mar_start_date = datetime(2022, 4, 1)
mar_end_date = datetime(2022, 5, 30)
dfp_en, dfp_ru, dfp_uk = preprocessing(dfp, mar_start_date, mar_end_date, topic="Mariupol")

In [None]:
# Get the sentiment for each tweet in English
dfp_en['sentiment'] = dfp_en['text'].apply(get_sentiment)

# Get the sentiment for each tweet in Russian
dfp_ru['sentiment'] = dfp_ru['text'].apply(get_sentiment)

# Get the sentiment for each tweet in Ukrainian
dfp_uk['sentiment'] = dfp_uk['text'].apply(get_sentiment)

with ProgressBar(dt=2.0), Profiler() as prof4:
    # Group the tweets by date and get the mean sentiment for each date
    en_sentiment_mar = dfp_en.groupby(dfp_en['tweetcreatedts'].dt.date)['sentiment'].mean().compute()
    ru_sentiment_mar = dfp_ru.groupby(dfp_ru['tweetcreatedts'].dt.date)['sentiment'].mean().compute()
    uk_sentiment_mar = dfp_uk.groupby(dfp_uk['tweetcreatedts'].dt.date)['sentiment'].mean().compute()

In [None]:
draw_sentiment(en_sentiment_mar, ru_sentiment_mar, uk_sentiment_mar, 'Sentiment Analysis of Tweets - Mariupol')