In [3]:
import pandas as pd
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
import numpy as np
import datetime
import collections
import csv
import config

plotly.tools.set_credentials_file(username=config.plotly_username, api_key=config.plotly_api_key)

conversation_path = 'conversations_plachno.tsv'

In [4]:
msg_df = pd.read_csv(conversation_path, sep='\t', header=None, names = ["author", "timestamp", "text", "type"], quoting=csv.QUOTE_NONE)

In [5]:
# clean the data
msg_df['text'] = msg_df['text'].str.lower()
msg_df.dropna()
msg_df = msg_df.drop(msg_df[msg_df['text'].str.contains('.*sent [a photo|an attachement|a sticker].', na=False)].index)
msg_df = msg_df.drop(msg_df[msg_df['text'].str.contains('^http', na=False)].index)
msg_df = msg_df.drop(msg_df[msg_df['text'].str.contains('.*https://9gag.com.*', na=False)].index)
msg_df = msg_df.drop(msg_df[msg_df['text'].str.contains('.* 󰀀.*', na=False)].index)

In [6]:
print('All msgs {}'.format(msg_df.shape[0]))
print('Avg length {}'.format(round(msg_df['text'].str.len().mean(), 2)))

msg_with_xd = msg_df[msg_df['text'].str.contains('[xX] ?[dD]+', na=False)]
print('Msgs with XD {}'.format(msg_with_xd.shape[0]))
print()

def statistics(author_name: str, msg_df: pd.DataFrame):
    author_msgs = msg_df[msg_df['author'] == author_name]
    print('{} msgs: {}, ({}%)'.format(author_name, author_msgs.shape[0], round(100 * author_msgs.shape[0] / msg_df.shape[0], 2)))
    print('{} avg length {}'.format(author_name, round(author_msgs['text'].str.len().mean(), 3)))
    msg_with_xd = author_msgs[msg_df['text'].str.contains('[xX] ?[dD]+', na=False)]
    print('{} msgs with XD {} ({}%)'.format(author_name, msg_with_xd.shape[0], round(100 * msg_with_xd.shape[0] / msg_df.shape[0], 2)))


authors = np.unique(msg_df['author'].values)
for author in authors:
    statistics(author, msg_df)
    print()


All msgs 26755
Avg length 27.37
Msgs with XD 111

My Friend msgs: 14677, (54.86%)
My Friend avg length 25.38
My Friend msgs with XD 1 (0.0%)

Piotr Konsek msgs: 12078, (45.14%)
Piotr Konsek avg length 29.793
Piotr Konsek msgs with XD 110 (0.41%)




Boolean Series key will be reindexed to match DataFrame index.



In [7]:
timestamps = msg_df['timestamp'].values
hours = [int(datetime.datetime.fromtimestamp(timestamp/1000).strftime('%H')) for timestamp in timestamps]
months = [int(datetime.datetime.fromtimestamp(timestamp/1000).strftime('%m')) for timestamp in timestamps]
years = [int(datetime.datetime.fromtimestamp(timestamp/1000).strftime('%Y')) for timestamp in timestamps]
weekdays = [datetime.datetime.fromtimestamp(timestamp/1000).strftime('%A') for timestamp in timestamps]

In [8]:
hour_density_list = np.zeros(24)

for hour in hours:
    hour_density_list[(hour - 6) % 24] += 1
    
x = np.hstack((np.arange(6, 24), np.arange(6)))
data = [go.Bar(x=x, y=hour_density_list)]
layout = go.Layout(
    title='Number of msgs in a specific hour of the day',
    xaxis = dict(
        type='category',
        title='hour',
    ),
    yaxis = dict(
        title='number of msgs'
    )
)

fig = go.Figure(data=data, layout=layout)

py.iplot(fig)


Consider using IPython.display.IFrame instead



In [9]:
weekday_counter = collections.Counter(weekdays)
x = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
y = [weekday_counter.get(key) for key in x]

data = [go.Bar(x=x, y=y)]
layout = go.Layout(
    title='Number of msgs in a specific day of the week',
    xaxis = dict(
        type='category',
        title='day of the week',
    ),
    yaxis = dict(
        title='number of msgs'
    )
)

fig = go.Figure(data=data, layout=layout)

py.iplot(fig)

In [10]:
month_density = collections.Counter(months)
x = np.sort(list(month_density.keys()))
y = [month_density.get(key) for key in x]

data = [go.Bar(x=x, y=y)]
layout = go.Layout(
    title='Number of msgs in a specific month',
    xaxis = dict(
        type='category',
        title='month',
    ),
    yaxis = dict(
        title='number of msgs'
    )
)

fig = go.Figure(data=data, layout=layout)

py.iplot(fig)

In [11]:
year_density = collections.Counter(years)
x = np.sort(list(year_density.keys()))
y = [year_density.get(key) for key in x]

data = [go.Bar(x=x, y=y)]
layout = go.Layout(
    title='Number of msgs in a specific year',
    xaxis = dict(
        type='category',
        title='year',
    ),
    yaxis = dict(
        title='number of msgs'
    )
)

fig = go.Figure(data=data, layout=layout)

py.iplot(fig)


In [12]:
messages_xd = msg_df[msg_df['text'].str.contains('[xX] ?[dD]+', na=False)]

In [13]:
import re

numbers_of_d = []
for message in messages_xd['text']:
    groups = re.findall('[xX] ?[dD]+', message)
    for group in groups:
        numbers_of_d.append(group.lower().count('d'))

In [14]:
numbers_of_d_dict = {}

for number_of_d in numbers_of_d:
    if number_of_d in numbers_of_d_dict.keys():
        numbers_of_d_dict[number_of_d] += 1
    else:
        numbers_of_d_dict[number_of_d] = 1
        
keys = list(numbers_of_d_dict.keys())
keys.sort()

x = keys
y = [numbers_of_d_dict[xx] for xx in x]

In [15]:
data = [go.Bar(x=x, y=y)]
layout = go.Layout(
    title='Number of D in XD',
    xaxis = dict(
        type='category',
        title='count of D'
    ),
    yaxis = dict(
        type='log',
        autorange=True,
        title='frequency (log scale)'
    )
)

fig = go.Figure(data=data, layout=layout)

py.iplot(fig)


In [16]:
import random
from wordcloud import WordCloud
import matplotlib.pyplot as plt

def _red_color(word, font_size, position, orientation, random_state=None, **kwargs):
    return 'hsl({:d}, 80%, {:d}%)'.format(random.randint(0, 35), random.randint(60, 80))

messages = " ".join([str(msg) for msg in msg_df['text'].values])
prepared_stopwords = open('stop-words-polish1.txt').read().split('\n')
custom_stopwords = open('custom_polish_stopwords.txt').read().split('\n')

stopwords = prepared_stopwords + custom_stopwords  
stopwords = set(stopwords)

plt.figure(figsize=(20,10))
wordcloud = WordCloud(background_color='black',stopwords=stopwords, height=800, width=1600).generate(messages)
plt.imshow(wordcloud.recolor(color_func=_red_color, random_state=42), interpolation='bilinear')
plt.axis('off')
plt.savefig('word_cloud.png', dpi=200)
plt.show()

<Figure size 2000x1000 with 1 Axes>

In [17]:
msg_df.head()

Unnamed: 0,author,timestamp,text,type
0,My Friend,1544887084559,(y),Generic
1,Piotr Konsek,1544883352116,może naskrobie artykuł.na medium,Generic
2,Piotr Konsek,1544883339565,why not,Generic
3,Piotr Konsek,1544883328503,w sumie,Generic
4,My Friend,1544881607142,dobry materiał do opisania w blogpoaście,Generic
