##### Setting up the Environment

In [22]:
import pandas as pd
import nltk
# nltk.download('vader_lexicon')
from nltk.sentiment import SentimentIntensityAnalyzer
import re
import matplotlib.pyplot as plt

### Add Interactivity to the Visualizations

In [23]:
# Standard data science helpers
import numpy as np
import pandas as pd
import scipy

# Instantiate the Plotly charting library.
import chart_studio.plotly as py
import plotly.graph_objs as go
import plotly.express as px
# We use plotly.offline as this allows us to create interactive 
# visualisations without the use of an internet connection, 
# making our notebook more distributable to others. 
from plotly.offline import iplot, init_notebook_mode
init_notebook_mode(connected=True)

# The Cufflinks library allows us to directly bind 
# Pandas DataFrames to Plotly charts. 
import cufflinks as cf
# Once again, we use the Cufflinks library in offline mode. 
cf.go_offline(connected=True)
cf.set_config_file(colorscale='plotly', world_readable=True)

# Extra options. We use these to make our interactive 
# visualisations more aesthetically appealing. 
from IPython.core.display import HTML
pd.options.display.max_rows = 30
pd.options.display.max_columns = 25

# Show all code cells outputs.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

> Since pandas reads data assuming its in tabular format, I had to read the data line from the txt file and convert it into pandas DataFrame for manipulation.
Otherwise I encountered the Parsing errors like `Expected 4 fields in line 621, saw 6`

In [24]:
with open("WhatsApp Chat with ALX Founder Academy Kenya(Peer Community).txt",'r',encoding='utf-8')as file:
    lines=file.readlines()
    
df=pd.DataFrame(lines,columns=["text"])
df.head()

Unnamed: 0,text
0,"18/11/2024, 10:05 - Messages and calls are end..."
1,"04/11/2024, 14:23 - ~ Calvin created group ""AL..."
2,"18/11/2024, 10:05 - You joined using this grou..."
3,"18/11/2024, 10:06 - +254 750 122958 joined usi..."
4,"18/11/2024, 10:06 - +254 794 107755 joined usi..."


## Preprocessing the Data

In [25]:
# read the data and remove irrelevant system generate messages
file_path='./WhatsApp Chat with ALX Founder Academy Kenya(Peer Community).txt'
def parse_whatsApp_data(filepath):
#    Regex pattern to define messages start with datetime 
    time_stamp_pattern=r'^\d{1,2}/\d{1,2}/\d{4}, \d{1,2}:\d{2} - '
#     regex pattern for system messages
    system_message_patterns=[
        r"Messages and calls are end-to-end encrypted",  # Encryption notice
        r"joined using this group's invite link",        # Join notices
        r"created group",                                # Group creation
        r"pinned a message",                             # Pinned messages
        r"changed this group's icon",                   # Group icon changes
        r"changed the subject",                          #Group names changes
        r' was added',
        r' . left',
        r' . added .'
        
    ]
#     read file line by 
    with open(file_path,'r',encoding='utf-8') as file:
        lines=file.readlines()
    
    messages=[]
    current_message=""
    
    for line in lines:
        #         check if line matches datetime pattern for new messages
        if re.match(time_stamp_pattern,line):
            # save current message it it's not empty and not system generated
            if current_message:
                if not any(re.search(pattern,current_message) for pattern in system_message_patterns):
                    messages.append(current_message.strip())
            #start new message-if its empty
            current_message=line.strip()
        else:
            #append contnuation lines to the current message
            current_message+=" "+line.strip()
    # add the last message (if valid)
    if current_message and not any(re.search(pattern,current_message) for pattern in system_message_patterns):
        messages.append(current_message.strip())
        
    return messages


            
parse_messages=parse_whatsApp_data(file_path)            

# for msg in parse_messages[:5]:
#     print(msg)
                

df=pd.DataFrame(parse_messages,columns=['messages'])
df.head()

Unnamed: 0,messages
0,"18/11/2024, 10:08 - +254 715 965881: Hi, The l..."
1,"18/11/2024, 10:09 - +254 111 734764: Hello, I ..."
2,"18/11/2024, 10:10 - +254 741 507177: Hey guys ..."
3,"18/11/2024, 10:11 - +254 768 270556: Group ya ..."
4,"18/11/2024, 10:12 - +254 791 186943: where can..."


In [26]:
# separate the timestamp,sender and message
def split_message_details(messages):
    pattern=r'^(\d{1,2}/\d{1,2}/\d{4}, \d{1,2}:\d{2}) - ([^:]+): (.+)$'
    
    timestamp,sender,message_only=[],[],[]
    
    for message in messages:
        match=re.match(pattern,message)
        if match:
        #if match is either timestamp,sender or message append appropriately
            timestamp.append(match.group(1))
            sender.append(match.group(2))
            message_only.append(match.group(3))
        else:
            #handle cases where message does not match pattern
            timestamp.append(None)
            sender.append(None)
            message_only.append(message)
            
        #create df with Extracted Components
        df=pd.DataFrame({
            "timestamp":timestamp,
            "sender":sender,
            "message":message_only
        })
        
    return df

result=split_message_details(df['messages'].tolist())

df1=pd.concat([df,result],axis=1)
df1.drop(columns='messages',inplace=True)

In [27]:
# drop the NAN columns that will contain missed system messages
df1.dropna(subset=['sender'],inplace=True)

df1[df1['timestamp'].isna()]

Unnamed: 0,timestamp,sender,message


## Performing Sentiment Analyis

In [28]:
# initialize the sentiment analyzer
sia=SentimentIntensityAnalyzer()

# Apply sentiment analysis to each message
df1['sentiment_score']=df1['message'].apply(lambda text:sia.polarity_scores(text))

`NLTK's VADER` provides sentiment scores such as `negative`,`neutral`,`positive`and`compound`

In [29]:
df1['sentiment_score']

0       {'neg': 0.0, 'neu': 0.815, 'pos': 0.185, 'comp...
1       {'neg': 0.094, 'neu': 0.673, 'pos': 0.233, 'co...
2       {'neg': 0.0, 'neu': 0.769, 'pos': 0.231, 'comp...
3       {'neg': 0.0, 'neu': 0.722, 'pos': 0.278, 'comp...
4       {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...
                              ...                        
1598    {'neg': 0.0, 'neu': 0.635, 'pos': 0.365, 'comp...
1599    {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...
1600    {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...
1601    {'neg': 0.0, 'neu': 0.756, 'pos': 0.244, 'comp...
1602    {'neg': 0.022, 'neu': 0.834, 'pos': 0.145, 'co...
Name: sentiment_score, Length: 1595, dtype: object

In [30]:
df1['compound_score']=df1['sentiment_score'].apply(lambda score:score['compound'])
df1['positive_score']=df1['sentiment_score'].apply(lambda score:score['pos'])
df1['negative_score']=df1['sentiment_score'].apply(lambda score:score['neg'])
df1['neutral_score']=df1['sentiment_score'].apply(lambda score:score['neu'])

In [31]:
df1.head()

Unnamed: 0,timestamp,sender,message,sentiment_score,compound_score,positive_score,negative_score,neutral_score
0,"18/11/2024, 10:08",+254 715 965881,"Hi, The lady who's doing web development and a...","{'neg': 0.0, 'neu': 0.815, 'pos': 0.185, 'comp...",0.34,0.185,0.0,0.815
1,"18/11/2024, 10:09",+254 111 734764,"Hello, I joined the group last week and i feel...","{'neg': 0.094, 'neu': 0.673, 'pos': 0.233, 'co...",0.4767,0.233,0.094,0.673
2,"18/11/2024, 10:10",+254 741 507177,Hey guys who here is interested in books or a ...,"{'neg': 0.0, 'neu': 0.769, 'pos': 0.231, 'comp...",0.4019,0.231,0.0,0.769
3,"18/11/2024, 10:11",+254 768 270556,Group ya Healthy snacks its me Erick 😂😂.,"{'neg': 0.0, 'neu': 0.722, 'pos': 0.278, 'comp...",0.4019,0.278,0.0,0.722
4,"18/11/2024, 10:12",+254 791 186943,where can we get the recordings for previous c...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,0.0,0.0,1.0


### Visualizing  the Sentiment Analysis Scores

### Interactive Date Time Filter

In [32]:
from ipywidgets import interact, interact_manual, widgets

In [33]:
# # set the time stamp as the index
# df1['timestamp']=pd.to_datetime(df1['timestamp'],dayfirst=True)
# df1.set_index("timestamp",inplace=True)

In [34]:
df1

Unnamed: 0,timestamp,sender,message,sentiment_score,compound_score,positive_score,negative_score,neutral_score
0,"18/11/2024, 10:08",+254 715 965881,"Hi, The lady who's doing web development and a...","{'neg': 0.0, 'neu': 0.815, 'pos': 0.185, 'comp...",0.3400,0.185,0.000,0.815
1,"18/11/2024, 10:09",+254 111 734764,"Hello, I joined the group last week and i feel...","{'neg': 0.094, 'neu': 0.673, 'pos': 0.233, 'co...",0.4767,0.233,0.094,0.673
2,"18/11/2024, 10:10",+254 741 507177,Hey guys who here is interested in books or a ...,"{'neg': 0.0, 'neu': 0.769, 'pos': 0.231, 'comp...",0.4019,0.231,0.000,0.769
3,"18/11/2024, 10:11",+254 768 270556,Group ya Healthy snacks its me Erick 😂😂.,"{'neg': 0.0, 'neu': 0.722, 'pos': 0.278, 'comp...",0.4019,0.278,0.000,0.722
4,"18/11/2024, 10:12",+254 791 186943,where can we get the recordings for previous c...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0000,0.000,0.000,1.000
...,...,...,...,...,...,...,...,...
1598,"02/12/2024, 14:01",+254 717 966489,thank you for the feedback I will create an of...,"{'neg': 0.0, 'neu': 0.635, 'pos': 0.365, 'comp...",0.5574,0.365,0.000,0.635
1599,"02/12/2024, 14:01",+254 748 759695,Exploitation left right and centre 😂,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0000,0.000,0.000,1.000
1600,"02/12/2024, 14:14",+254 111 711713,STK-20240826-WA0021.webp (file attached),"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0000,0.000,0.000,1.000
1601,"02/12/2024, 14:20",+254 717 966489,not really I would still answer with or withou...,"{'neg': 0.0, 'neu': 0.756, 'pos': 0.244, 'comp...",0.7003,0.244,0.000,0.756


In [35]:
type(pd.to_datetime('2019-01-01'))
type(df1.index)

pandas._libs.tslibs.timestamps.Timestamp

pandas.core.indexes.base.Index

In [36]:
import pandas as pd
import plotly.express as px
import ipywidgets as widgets
from IPython.display import display, clear_output

# Ensure the timestamp is a datetime index
df1['timestamp'] = pd.to_datetime(df1['timestamp'], dayfirst=True)
df1.set_index("timestamp", inplace=True)

# Define the function to update the chart
def range_of_messages(start_date, end_date):
    # Convert start_date and end_date to Timestamps to match the DataFrame index
    start_date = pd.Timestamp(start_date)
    end_date = pd.Timestamp(end_date)
    
    # Use range-based slicing
    stat_df = df1.loc[start_date:end_date]
    
    sentiment_avg = {
        'Positive': stat_df['positive_score'].mean(),
        'Negative': stat_df['negative_score'].mean(),
        'Neutral': stat_df['neutral_score'].mean()
    }
    
    clear_output(wait=True)
    fig = px.pie(
        values=list(sentiment_avg.values()),
        names=list(sentiment_avg.keys()),
        title='Sentiment Distribution'
    )
    display(fig)

# Create interactive widgets
start_date_picker = widgets.DatePicker(value=df1.index.min().date(), description='Start Date')
end_date_picker = widgets.DatePicker(value=df1.index.max().date(), description='End Date')

# Attach event listeners to update the chart on change
def on_date_change(change):
    """Function to handle the change in date selection and replot the chart."""
    range_of_messages(start_date_picker.value, end_date_picker.value)

# Attach listeners to both DatePickers
start_date_picker.observe(on_date_change, names='value')
end_date_picker.observe(on_date_change, names='value')

# Display the date pickers and the initial chart
display(start_date_picker, end_date_picker)
range_of_messages(start_date_picker.value, end_date_picker.value)


DatePicker(value=datetime.date(2024, 11, 18), description='Start Date', step=1)

DatePicker(value=datetime.date(2024, 12, 2), description='End Date', step=1)

KeyError: Timestamp('2024-11-18 00:00:00')