In [None]:
!pip3 install unidecode

In [None]:
!pip3 install sentiment-analysis-spanish

In [None]:
!pip3 install nltk

In [None]:
import re
import datetime
import pandas as pd
from sentiment_analysis_spanish import sentiment_analysis
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

from collections import Counter
import nltk
from nltk.corpus import stopwords

In [None]:
# Pattern for the first line of a message entry, matched from the start of the text:
#   group 1 - timestamp: "<n>/<n>/<n> <h>:<m>" followed by a one-letter marker and "m"
#             (e.g. "p. m."); the "." characters are unescaped, so they accept any character
#   group 2 - sender: everything up to the first colon
#   group 3 - message text; "." does not match a newline unless the caller passes
#             re.DOTALL, so without that flag this group ends at the first line break
message_regex = r"(\d+/\d+/\d+ \d+:\d+ \w.\sm.) - ([^\:]+): (.*)"

In [None]:
def build_date(date_str):
    """Parse the calendar day out of a message timestamp.

    Args:
        date_str: Text beginning with a timestamp shaped like
            ``"12/3/20 10:15 p. m."``. The first number is used as the day,
            the second as the month and the third as the year.

    Returns:
        A ``datetime.datetime`` at midnight of that day. The hour, minute and
        a.m./p.m. marker are matched by the pattern but not used, so every
        message sent on the same day maps to the same value.

    Raises:
        ValueError: If ``date_str`` does not start with a timestamp in the
            expected shape, or the numbers do not form a valid date.
    """
    date_regex = r'(\d+)/(\d+)/(\d+) (\d+):(\d+) (\w).\sm.'
    date_match = re.match(date_regex, date_str)
    if date_match is None:
        # Fail with a readable message instead of "'NoneType' is not subscriptable".
        raise ValueError(f"Unrecognized timestamp: {date_str!r}")

    day, month, year = (int(date_match[group]) for group in (1, 2, 3))
    # A two-digit year such as "20" would otherwise be read as year 20 AD;
    # treat it as 20xx so dates sort and plot on a sensible axis.
    if year < 100:
        year += 2000
    return datetime.datetime(year, month, day)

In [None]:
def post_process(str):
    """Split one conversation entry into ``(date, user, message)``.

    Args:
        str: A full entry: a first line matching ``message_regex``, optionally
            followed by continuation lines separated by newlines. (The
            parameter name shadows the builtin; it is kept so existing callers
            are unaffected.)

    Returns:
        A tuple ``(build_date(timestamp), user, message)`` where ``timestamp``,
        ``user`` and ``message`` are groups 1-3 of ``message_regex``.

    Raises:
        ValueError: If the entry does not match ``message_regex``.
    """
    # re.DOTALL lets the trailing "(.*)" run across newlines; without it the
    # message would be cut at the first line break and every continuation
    # line of a multi-line message would be lost.
    match = re.match(message_regex, str, re.DOTALL)
    if match is None:
        raise ValueError(f"Entry does not look like a message: {str!r}")

    date, user, message = match[1], match[2], match[3]
    return (build_date(date), user, message)

In [None]:

conversations = []

# Read as UTF-8 explicitly so accented text does not depend on the platform's
# default encoding.
with open('conversation.txt', encoding='utf-8') as fp:
    for entry in fp:
        # Normalise no-break spaces (U+00A0, and the narrow U+202F) to plain
        # spaces: the timestamp pattern contains literal spaces that would not
        # match them.
        entry = entry.replace("\xa0", " ").replace("\u202f", " ").strip()

        if conversations and not re.match(message_regex, entry):
            # A line that does not start a new message is a continuation of
            # the previous entry.
            conversations[-1] += f"\n {entry}"
        else:
            conversations.append(entry)

# The first entry is only discarded when it is not a real message (e.g. a
# header line at the top of the file). Deleting it unconditionally would drop
# a genuine first message and raise IndexError on an empty file.
if conversations and not re.match(message_regex, conversations[0]):
    del conversations[0]

In [None]:
# Convert every raw entry into its (date, user, message) tuple.
post_process_message = list(map(post_process, conversations))

## Dataframe

In [None]:
# Name the columns at construction time: assigning `df.columns` afterwards
# fails with a length mismatch when there are no parsed messages, because an
# empty frame has zero columns.
df = pd.DataFrame(post_process_message, columns=["date", "user", "message"])

In [None]:
# Show the parsed messages (date, user, message) as this cell's output.
df

In [None]:
# Distinct participants found in the "user" column.
users = set(df["user"])

In [None]:
fig, ax = plt.subplots()

# One line per participant: number of messages they sent on each day.
for user in users:
    messages_per_day = df[df.user == user].groupby('date').message.count()
    ax.plot(messages_per_day)

# Final line: all participants combined.
ax.plot(df.groupby('date').message.count())

# Labels are paired with the lines in the order they were drawn above.
ax.legend(list(users) + ['total'])

ax.set_title("Messages vs Date")
ax.set_xlabel("Date")
ax.set_ylabel("Messages")

# Place a major tick every 30 days.
ax.xaxis.set_major_locator(mdates.DayLocator(interval=30))

plt.show()

In [None]:
# Messages per day, busiest days first
daily_message_counts = df.groupby('date')['message'].count()
daily_message_counts.sort_values(ascending=False)

## Sentiment analysis

In [None]:
# Create the Spanish sentiment analyser once; its `sentiment(text)` method is
# called for every message in the cells below.
sentiment = sentiment_analysis.SentimentAnalysisSpanish()

In [None]:
# Keep only messages that do not contain an angle-bracketed span ("<...>");
# presumably those are media placeholders in the export - confirm against the data.
# .copy() makes this an independent frame, so adding a column below neither
# triggers SettingWithCopyWarning nor depends on whether pandas returned a view.
df_without_images = df[~df.message.str.contains('<.+>')].copy()

# Score every remaining message with the sentiment analyser.
df_without_images['sentiments'] = [sentiment.sentiment(message) for message in df_without_images.message]
df_without_images

In [None]:
# Sum of sentiment scores per day, highest totals first
daily_sentiment_totals = df_without_images.groupby('date')['sentiments'].sum()
emotional_days = daily_sentiment_totals.sort_values(ascending=False)

print("Most emotional days")
print(emotional_days)

In [None]:
# Lowest-scoring messages first, then the top five rows per participant
sad = df_without_images.sort_values(by='sentiments')
for user in users:
    sad_msg_by_users = sad.loc[sad.user == user, ['message', 'sentiments']]
    print(f"Sad messages by {user}")
    print(sad_msg_by_users.head())



In [None]:
# Highest-scoring messages first, then the top five rows per participant
happy = df_without_images.sort_values(by='sentiments', ascending=False)
for user in users:
    happy_msg_by_users = happy.loc[happy.user == user, ['message', 'sentiments']]
    print(f"Happy messages by {user}")
    print(happy_msg_by_users.head())


In [None]:
fig = plt.figure()
ax = fig.add_subplot(1,1,1)

# One line per participant: mean sentiment of their messages on each day.
for user in users:
    # Build the mask from df_without_images itself. A mask taken from `df`
    # has a different length, which forces pandas to reindex the boolean key
    # and emit a warning.
    user_rows = df_without_images[df_without_images.user == user]
    plt.plot(user_rows.groupby('date')['sentiments'].agg("mean"))

# Final line: mean sentiment per day across all participants.
plt.plot(df_without_images.groupby('date')['sentiments'].agg("mean"))
plt.legend(list(users) + ['total'])

plt.title("Sentiment vs Date")
plt.xlabel("Date")
plt.ylabel("Sentiment")

# Place a major tick every 30 days.
ax.xaxis.set_major_locator(mdates.DayLocator(interval=30))

plt.show()

## Most used words

In [None]:
# Fetch the NLTK "stopwords" corpus, which stopwords.words('spanish') reads below.
nltk.download('stopwords')

In [None]:
# Spanish stop words plus a few extra tokens to ignore (unaccented "mas" and laughter).
spanish_stopwords = set(stopwords.words('spanish') + ['mas', 'jaja', 'jajaja', 'jajajaja'])

for user in users:
    # Build the mask from df_without_images itself; a mask taken from `df`
    # has a different length and makes pandas reindex the boolean key.
    query = df_without_images[df_without_images.user == user].message
    long_str = " ".join(query)
    # Multi-line messages contain newlines; flatten them before splitting on whitespace.
    long_str = long_str.replace("\n", " ").lower()
    words = long_str.split()
    important_words = (word for word in words if word not in spanish_stopwords)
    counter = Counter(important_words).most_common(20)

    print(user)
    # `rank` replaces the original loop variable `id`, which shadowed the builtin.
    for rank, (word, _occurrences) in enumerate(counter, start=1):
        print(f"{rank} {word}")

## Who starts the conversation more often

In [None]:
# Sender of the first row recorded for each day (rows keep the file's order).
first_message_of_each_day = df.groupby('date')['user'].first()

for user in users:
    count = first_message_of_each_day.loc[first_message_of_each_day.eq(user)]
    print(f"Conversations started by {user}: {len(count)}")

## Who says "I love you" more often

In [None]:
# Senders of every message containing one of the phrases. case=False makes the
# match case-insensitive, so capitalised forms such as "Te quiero" or "TE AMO"
# are counted as well.
love_messages = df[df.message.str.contains('te quiero|te amo|te adoro', case=False)].user
for user in users:
    count = love_messages[love_messages == user]
    print(f"Love messages by {user}: {len(count)}")


## How many images were sent

In [None]:
# Rows whose message contains an angle-bracketed span ("<...>")
has_placeholder = df['message'].str.contains(f'<.+>')
df.loc[has_placeholder]

In [None]:
# Count, per participant, the messages containing an angle-bracketed span
images_sent = df.loc[df['message'].str.contains(f'<.+>'), 'user']

for user in users:
    count = images_sent.loc[images_sent.eq(user)]
    print(f"Images sended by {user}: {len(count)}")

In [None]:
# Messages containing an angle-bracketed span ("<...>").
images = df[df.message.str.contains('<.+>')]

fig = plt.figure()
ax = fig.add_subplot(1,1,1)

# One line per participant: number of such messages on each day.
for user in users:
    # Two fixes here:
    #  * the mask comes from `images` itself, not from the longer `df`
    #  * a single column is counted; .agg("count") on the whole frame returns
    #    one column per remaining field, so each plot call drew several lines
    #    and the legend labels no longer lined up with them
    plt.plot(images[images.user == user].groupby('date').message.count())

# Final line: all participants combined.
plt.plot(images.groupby('date').message.count())
plt.legend(list(users) + ['total'])

plt.title("Images vs Date")
plt.xlabel("Date")
plt.ylabel("Images")

# Place a major tick every 30 days.
ax.xaxis.set_major_locator(mdates.DayLocator(interval=30))

plt.show()