In [None]:
import re
import regex
import pandas as pd
import numpy as np
import emoji
import datetime
from collections import Counter
import datetime
import matplotlib.pyplot as plt
from os import path
import plotly.express as px
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sentiments = SentimentIntensityAnalyzer()
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

In [None]:
# Check whether a line opens with the date/time stamp that starts a new message
def beginWithDateAndTime(text):
    """Return True when ``text`` starts with a chat date/time stamp.

    The stamp recognised is three slash-separated groups of 1-2 digits,
    a comma and space, a ``h:mm``-style time, a space, AM/PM in any letter
    case, and finally `` -``.  Lines that do not start this way are treated
    by the caller as continuations of the previous message.

    Parameters
    ----------
    text : str
        A single line of the exported chat.

    Returns
    -------
    bool
    """
    # Raw string: keeps \d and \S as regex escapes instead of (invalid)
    # Python string escapes, which newer interpreters warn about.
    pattern = r'^\d{1,2}/\d{1,2}/\d{1,2}, \d{1,2}:\d{1,2}\S [AaPp][Mm] -'
    return re.match(pattern, text) is not None

# Check whether a message body starts with an "<author>:" prefix
def FindAuthor(text):
    """Return True when ``text`` begins with something shaped like an author tag.

    An author tag is one, two or three whitespace-separated words followed
    by a colon, or a single word followed by one or more characters in the
    range U+263A..U+1F999 and then a colon.

    Parameters
    ----------
    text : str
        The part of a chat line that follows the date/time stamp.

    Returns
    -------
    bool
    """
    patterns = [
        r'([\w]+):',                        # First name
        r'([\w]+[\s]+[\w]+):',              # First name + last name
        r'([\w]+[\s]+[\w]+[\s]+[\w]+):',    # First name + middle name + last name
        r'([\w]+)[\u263a-\U0001f999]+:',    # Name followed by emoji
    ]
    # Group the alternatives so the start anchor visibly applies to all of
    # them rather than only to the first one.
    pattern = '^(?:' + '|'.join(patterns) + ')'
    return re.match(pattern, text) is not None


In [None]:
# Split one stamped chat line into its timestamp, author and message parts
def getDataFromText(text):
    """Break a stamped chat line into ``(dateTime, author, message)``.

    The line is cut at the first ``' - '``: the left side is the timestamp
    text, the right side is the rest.  If the rest starts with an author
    tag (per ``FindAuthor``) it is cut once more at the first ``': '``.

    Only the first occurrence of each separator is used, so a message that
    itself contains ``' - '`` or ``': '`` is returned unchanged.

    Parameters
    ----------
    text : str
        A chat line that starts with a date/time stamp.

    Returns
    -------
    tuple
        ``(dateTime, author, message)``; ``author`` is ``None`` when no
        author tag is found.
    """
    dateTime, _, message = text.partition(' - ')
    author = None
    if FindAuthor(message):
        candidate, separator, remainder = message.partition(': ')
        # Without a ': ' there is nothing to cut off, so keep the text as
        # the message instead of mislabelling all of it as the author.
        if separator:
            author, message = candidate, remainder
    return dateTime, author, message

In [None]:
# Read the exported chat and collect one [dateTime, author, message] row per
# message; lines without a date/time stamp are folded into the previous message.
parsedData = []
chatfilePath = 'whatsapp chat file path'
with open(chatfilePath, encoding="utf-8") as fp:
    # Discard the first line of the export (presumably a header/notice line;
    # confirm against the actual file).
    fp.readline()
    messageBuffer = []
    # Named dateTime, not datetime, so the imported datetime module is not
    # shadowed and the name used in the loop is always defined.
    dateTime, author = None, None
    for line in fp:
        line = line.strip()
        if beginWithDateAndTime(line):
            # A new stamped line closes the message collected so far.
            if messageBuffer:
                parsedData.append([dateTime, author, ' '.join(messageBuffer)])
            messageBuffer.clear()
            dateTime, author, message = getDataFromText(line)
            messageBuffer.append(message)
        elif dateTime is not None:
            messageBuffer.append(line)
        # else: continuation of the discarded first line, so nothing to attach to.

    # Flush the final message; no later stamped line will do it for us.
    if messageBuffer:
        parsedData.append([dateTime, author, ' '.join(messageBuffer)])
   


In [None]:
# Turn the parsed rows into a three-column frame
chat_columns = ['DateTime', 'Author', 'Message']
chat = pd.DataFrame(data=parsedData, columns=chat_columns)

In [None]:
# Preview the first rows of the parsed chat frame
chat.head()

In [None]:
# Distinct values of the Author column (None marks lines without an author tag)
chat_memebers = chat['Author'].unique().tolist()
chat_memebers

In [None]:
# Convert the timestamp text into real datetimes.
# NOTE(review): no explicit format is given, so day/month order is inferred
# by pandas. Confirm it matches the locale of the export.
parsed_timestamps = pd.to_datetime(chat["DateTime"])
chat["DateTime"] = parsed_timestamps
chat.info()

In [None]:
# Calendar features derived from DateTime, computed with the vectorised .dt
# accessor rather than one Python call per row.
# new column weekday: day name, e.g. 'Monday'
chat['weekday'] = chat['DateTime'].dt.day_name()
# new column month_sent: month name, e.g. 'January'
chat['month_sent'] = chat['DateTime'].dt.month_name()
# column date: calendar date without the time part
chat['date'] = chat['DateTime'].dt.date
# column hour: hour of day, 0-23
chat['hour'] = chat['DateTime'].dt.hour

In [None]:
# column urlcount: number of http(s) links in each message
URLPATTERN = r'(https?://\S+)'
chat['urlcount'] = chat['Message'].str.count(URLPATTERN)
# column Letter_Count: total number of characters in the message
# (every character counts, not only letters)
chat['Letter_Count'] = chat['Message'].str.len()
# column Word_Count: whitespace-separated tokens. Splitting on any run of
# whitespace means an empty message counts 0 words and repeated spaces do
# not create phantom empty "words".
chat['Word_Count'] = chat['Message'].str.split().str.len()

In [None]:
def count_of_emojis(text):
    """Return the emoji found in ``text`` as a list of grapheme clusters.

    Despite the name, the result is the list of matches (with repeats, in
    order of appearance), not a number.  ``text`` is split into extended
    grapheme clusters with ``regex``'s ``\\X`` and a cluster is kept when
    any of its characters is a key of the ``emoji`` package's lookup table.

    Parameters
    ----------
    text : str
        Message text to scan.

    Returns
    -------
    list of str
    """
    # emoji >= 2.0 dropped UNICODE_EMOJI in favour of EMOJI_DATA; prefer the
    # new table and fall back to the old one on older installations.
    known_emojis = getattr(emoji, 'EMOJI_DATA', None)
    if known_emojis is None:
        known_emojis = emoji.UNICODE_EMOJI['en']

    return [
        cluster
        for cluster in regex.findall(r'\X', text)
        if any(char in known_emojis for char in cluster)
    ]

In [None]:
# Per-message list of emoji, as returned by count_of_emojis
chat["emoji"] = chat["Message"].map(count_of_emojis)

In [None]:
# Line chart of how many messages were sent on each date
messages_per_day = chat.groupby('date')['Message'].count()
date_grouped = messages_per_day.plot(kind='line', figsize=(20, 10), color='orange')

In [None]:
# Message volume per weekday, drawn as a closed radar chart
WEEKDAY_ORDER = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                 'Friday', 'Saturday', 'Sunday']
weekday_grouped_msg = (
    chat.groupby('weekday')['Message']
    .count()
    # groupby returns the day names sorted alphabetically; reindexing puts
    # them in calendar order (the chart follows row order) and gives days
    # with no messages an explicit zero.
    .reindex(WEEKDAY_ORDER, fill_value=0)
    .rename_axis('weekday')
    .reset_index(name='count')
)

fig = px.line_polar(weekday_grouped_msg, r='count', theta='weekday', line_close=True)
fig.update_traces(fill='toself')
fig.update_layout(
  polar=dict(
    radialaxis=dict(
      visible=True,
    )),
  showlegend=True
)
fig.show()

In [None]:
# Which hours of the day see the most messages in this group
hour_grouped_msg = (
    chat.groupby('hour')['Message']
    .count()
    .reset_index(name='count')
)
fig = px.bar(
    hour_grouped_msg,
    x='hour',
    y='count',
    labels={'hour':'24 hour period'},
    height=500,
)
fig.update_traces(
    marker_color='#EDCC8B',
    marker_line_color='#D4A29C',
    marker_line_width=1.5,
    opacity=0.6,
)
fig.update_layout(title_text='Total Messages by Hour of the Day')
fig.show()


In [None]:
# Headline numbers for the whole chat
is_media_placeholder = chat['Message'] == '<Multimedia omitido>'
messages_per_date = chat.groupby('date')['Message'].count()

total_messages = len(chat)
media_messges = int(is_media_placeholder.sum())
average_msg_words = chat['Word_Count'].mean()
# Note: despite the name, this is the mean of Letter_Count (characters per message)
average_msg_word = chat['Letter_Count'].mean()
average_message_day = messages_per_date.mean()

In [None]:
# Print the headline numbers, one "label value" pair per line
summary_lines = [
    ('Total Messages ', total_messages),
    ('Media Message', media_messges),
    ('Average Words by Messages', round(average_msg_words, 2)),
    ('Average Letters by Messages', round(average_msg_word, 2)),
    ('Average Message Per Day', round(average_message_day, 2)),
]
for label, value in summary_lines:
    print(label, value)

In [None]:
# Bar chart comparing the three averages computed for the chat
mylabels = ["Average Message Words", "Average letters", "Average message per day"]
total_data = [average_msg_words, average_msg_word, average_message_day]
print(total_data)
plt.bar(mylabels, total_data)
plt.show()

In [None]:
# Number of messages sent by each author, as a horizontal bar chart
author_bar_colors = ['#D4A29C', '#E8B298', '#EDCC8B', '#BDD1C5', '#9DAAA2']
qty_message_author = chat['Author'].value_counts()
qty_message_author.plot.barh(figsize=(20, 10), color=author_bar_colors)
qty_message_author

In [None]:
# Load the small English spaCy pipeline and take a list copy of its stop words
nlp = spacy.load('en_core_web_sm')
stopwords = [*STOP_WORDS]

In [None]:
# Independent copy of the two columns needed for the word-frequency analysis
common_words = chat.loc[:, ['Author', 'Message']].copy()

In [None]:
# Word frequencies across all messages, excluding stop words
extra = ["<multimedia", "omitido>", "k", "d","si","multimedia", "omitido"]
# Append only the missing entries so re-running the cell does not keep
# growing the stop-word list.
stopwords = stopwords + [word for word in extra if word not in stopwords]
stopword_set = set(stopwords)  # constant-time membership tests

# One row per token. This is kept as a standalone Series: writing it back
# into common_words would realign it on the frame's index and silently drop
# every token past len(common_words).
words = (
    common_words['Message']
    .str.lower()
    .str.split()
    .explode()
    .dropna()  # messages with no tokens explode to NaN, which is not a word
)
words = words[~words.isin(stopword_set)]

# most_common() orders by count, highest first; ties keep first-seen order.
word_dict = pd.DataFrame(Counter(words).most_common(), columns=['Word', 'Count'])

In [None]:
# Display the word-frequency table (columns: Word, Count)
word_dict

In [None]:
# Bar chart of the ten most frequent words in the chat
fig = px.bar(
    word_dict.head(10),
    x='Word',
    y='Count',
    # The labels key must be the column name ('Word') to take effect.
    labels={'Word': 'Common words'},
    height=500,
)
fig.update_traces(marker_color='#EDCC8B', marker_line_color='#D4A29C',
                  marker_line_width=1.5, opacity=0.6)
fig.update_layout(title_text='Most Common words in Chat')
fig.show()

In [None]:
# Add Positive / Negative / Neutral columns holding each message's sentiment scores.
# The analyzer is run once per message and all three scores are read from
# that single result, instead of scoring every message three times.
polarity = pd.DataFrame(
    [sentiments.polarity_scores(message) for message in chat["Message"]],
    index=chat.index,                       # align row-for-row with chat
    columns=["neg", "neu", "pos", "compound"],
)
chat["Positive"] = polarity["pos"]
chat["Negative"] = polarity["neg"]
chat["Neutral"] = polarity["neu"]
print(chat.head())

In [None]:
# Average of each sentiment score over all messages
mean_scores = {
    column: chat[column].mean()
    for column in ("Positive", "Neutral", "Negative")
}
postive_msgs = mean_scores["Positive"]
neutral_msgs = mean_scores["Neutral"]
negative_msgs = mean_scores["Negative"]

print("Positive messages score:", postive_msgs)
print("Neutral messages score:", neutral_msgs)
print("Negative messages score:", negative_msgs)

In [None]:
# Number of distinct emoji that appear anywhere in the chat
distinct_emojis = {
    symbol
    for message_emojis in chat.emoji
    for symbol in message_emojis
}
total_emojis_list = list(distinct_emojis)
total_emojis = len(total_emojis_list)
print('Sum of all used Emojis', total_emojis)

In [None]:
# Frequency table of every emoji used, most frequent first
total_emojis_list = [
    symbol
    for message_emojis in chat.emoji
    for symbol in message_emojis
]
# most_common() yields (emoji, count) pairs ordered by count, highest first
emoji_dict = Counter(total_emojis_list).most_common()
emoji_df = pd.DataFrame(emoji_dict, columns=['emoji', 'count'])
emoji_df.head(10)

In [None]:
# Treemap in which each emoji's tile is sized by how often it was used
emoji_counts = emoji_df['count'].tolist()
fig = px.treemap(emoji_df, path=['emoji'], values=emoji_counts)
fig.show()