#### Restart the kernel after running the cell below, then run the subsequent cells

In [10]:
# Install required packages
!pip install pandas numpy scipy nltk plotly chart_studio cufflinks ipywidgets



In [11]:
# from IPython.core.display import HTML
# HTML("<script>Jupyter.notebook.kernel.restart(); setTimeout(function() { \
#     var current_cell = Jupyter.notebook.get_selected_cell(); \
#     var current_index = Jupyter.notebook.get_cell_elements().index(current_cell.element.get(0)); \
#     for (var i = current_index + 1; i < Jupyter.notebook.ncells(); i++) { \
#         Jupyter.notebook.select(i); \
#         Jupyter.notebook.execute_cell(); \
#     } \
# }, 2000);</script>")

In [12]:
# Data manipulation and analysis
import pandas as pd
import numpy as np
import scipy

# Natural Language Processing
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment import SentimentIntensityAnalyzer
import re

# Visualization
import plotly.express as px
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
import chart_studio.plotly as py
import cufflinks as cf

# Interactive widgets
import ipywidgets as widgets
from IPython.display import display, clear_output, HTML

# Configure plotting settings
# Both Plotly and cufflinks are switched to notebook/offline rendering.
# NOTE(review): connected=True presumably loads plotly.js from a CDN instead of
# embedding it, which would make rendering depend on network access — confirm.
init_notebook_mode(connected=True)
cf.go_offline(connected=True)
# NOTE(review): world_readable=True looks like a Chart Studio sharing default; this
# notebook processes a private group chat, so confirm the flag is intended.
cf.set_config_file(colorscale='plotly', world_readable=True)

# Jupyter display settings
# Cap the number of rows/columns pandas renders when a DataFrame is displayed.
pd.options.display.max_rows = 30
pd.options.display.max_columns = 25
from IPython.core.interactiveshell import InteractiveShell
# 'all' echoes every bare expression in a cell, not only the last one; this is why
# the value returned by nltk.download(...) shows up in this cell's output.
InteractiveShell.ast_node_interactivity = 'all'

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/irungu/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [13]:
def parse_whatsApp_data(filepath):
    """Read a WhatsApp chat export and return its messages as one-line strings.

    A message starts on a line beginning with ``D/M/YYYY, H:MM - `` and runs until
    the next such line; continuation lines are stripped and joined to it with a
    single space.

    Group/system notices (joins, pins, subject changes, ...) are removed. The
    system patterns are only searched in the part of the message that precedes
    the first ``": "`` after the timestamp (the sender position). Messages
    without a ``": "`` are searched in full. This keeps user messages whose
    *text* merely mentions e.g. "pinned a message" from being discarded.

    Lines that appear before the first timestamped line do not belong to any
    message and are ignored. A leading UTF-8 byte-order mark is tolerated.

    Args:
        filepath: Path of the exported ``.txt`` chat file (UTF-8).

    Returns:
        list[str]: The retained messages, in file order, each still prefixed
        with its timestamp.
    """
    time_stamp_pattern = re.compile(r'^\d{1,2}/\d{1,2}/\d{4}, \d{1,2}:\d{2} - ')
    # NOTE(review): the '.' in the last two patterns is an unescaped regex wildcard
    # (any single character); confirm that is what was intended.
    system_message_patterns = [
        r"Messages and calls are end-to-end encrypted",
        r"joined using this group's invite link",
        r"created group",
        r"pinned a message",
        r"changed this group's icon",
        r"changed the subject",
        r' was added',
        r' . left',
        r' . added .'
    ]
    # One alternation compiled once, instead of re-evaluating every pattern per message.
    system_message_regex = re.compile(
        '|'.join(f'(?:{pattern})' for pattern in system_message_patterns)
    )

    def _is_system_message(message):
        """Return True when a system pattern occurs outside the user-written text."""
        prefix = time_stamp_pattern.match(message)
        # The prefix can fail to match when stripping removed the trailing space of
        # a timestamp-only line; search from the start in that case.
        search_from = prefix.end() if prefix else 0
        sender_end = message.find(': ', search_from)
        header = message if sender_end == -1 else message[:sender_end]
        return system_message_regex.search(header) is not None

    # Each entry is the list of stripped lines that make up one message.
    grouped_lines = []
    # utf-8-sig decodes plain UTF-8 too, and drops a BOM that would otherwise stop
    # the first line from matching the timestamp pattern.
    with open(filepath, 'r', encoding='utf-8-sig') as file:
        for line in file:
            if time_stamp_pattern.match(line):
                grouped_lines.append([line.strip()])
            elif grouped_lines:
                # Continuation of the current message; text before the first
                # timestamped line is skipped by the `elif` guard.
                grouped_lines[-1].append(line.strip())

    messages = (" ".join(parts).strip() for parts in grouped_lines)
    return [message for message in messages if not _is_system_message(message)]

# Load and parse the data
# The path is relative, so the exported chat file is expected next to the notebook
# (more precisely: in the kernel's working directory).
file_path = './WhatsApp Chat with ALX Founder Academy Kenya(Peer Community).txt'
# parse_messages: list of one-line message strings returned by parse_whatsApp_data.
parse_messages = parse_whatsApp_data(file_path)

In [14]:
def split_message_details(messages):
    """Break raw chat lines into a three-column DataFrame.

    Every input string is matched against ``<timestamp> - <sender>: <text>``.
    On a match the three captured parts fill the ``timestamp``, ``sender`` and
    ``message`` columns. Otherwise ``timestamp`` and ``sender`` are ``None`` and
    the untouched input string is stored in ``message``.

    Args:
        messages: Iterable of one-line message strings.

    Returns:
        pandas.DataFrame with columns ``timestamp``, ``sender``, ``message``
        (all still plain strings / ``None``), one row per input string.
    """
    line_regex = re.compile(r'^(\d{1,2}/\d{1,2}/\d{4}, \d{1,2}:\d{2}) - ([^:]+): (.+)$')

    timestamps, senders, bodies = [], [], []
    for raw in messages:
        hit = line_regex.match(raw)
        # Unparseable lines keep their full text so nothing is lost at this stage.
        stamp, sender, body = hit.groups() if hit else (None, None, raw)
        timestamps.append(stamp)
        senders.append(sender)
        bodies.append(body)

    return pd.DataFrame({
        'timestamp': timestamps,
        'sender': senders,
        'message': bodies,
    })

# Process messages and create DataFrame in one chain:
#   1. split every raw line into timestamp / sender / message,
#   2. discard rows for which no sender could be parsed,
#   3. convert the timestamp strings to datetimes.
df = (
    split_message_details(parse_messages)
    .dropna(subset=['sender'])
    .assign(
        timestamp=lambda frame: pd.to_datetime(
            frame['timestamp'], format='%d/%m/%Y, %H:%M', dayfirst=True
        )
    )
)

df.head()

Unnamed: 0,timestamp,sender,message
0,2024-11-18 10:08:00,+254 715 965881,"Hi, The lady who's doing web development and a..."
1,2024-11-18 10:09:00,+254 111 734764,"Hello, I joined the group last week and i feel..."
2,2024-11-18 10:10:00,+254 741 507177,Hey guys who here is interested in books or a ...
3,2024-11-18 10:11:00,+254 768 270556,Group ya Healthy snacks its me Erick 😂😂.
4,2024-11-18 10:12:00,+254 791 186943,where can we get the recordings for previous c...


In [15]:
# Initialize sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Calculate sentiment scores.
# polarity_scores is applied once per message; the resulting dict is kept in
# 'sentiment_scores' and its 'compound', 'pos', 'neg' and 'neu' entries are then
# copied into their own numeric columns.
df['sentiment_scores'] = df['message'].apply(sia.polarity_scores)
df['compound_score'] = df['sentiment_scores'].apply(lambda x: x['compound'])
df['positive_score'] = df['sentiment_scores'].apply(lambda x: x['pos'])
df['negative_score'] = df['sentiment_scores'].apply(lambda x: x['neg'])
df['neutral_score'] = df['sentiment_scores'].apply(lambda x: x['neu'])

# Set timestamp as index.
# Guarded so the cell can be re-run: after the first execution 'timestamp' is the
# index rather than a column, and an unguarded set_index would raise KeyError.
if 'timestamp' in df.columns:
    df.set_index('timestamp', inplace=True)

In [16]:
from IPython.display import display, HTML

def show_loading():
    """Display a centred "Loading visualization..." message with a CSS spinner.

    Returns whatever ``display`` returns for the rendered HTML snippet.
    """
    # Inline markup + keyframes so the spinner needs no external stylesheet.
    spinner_markup = """
        <div style='text-align: center; margin: 20px;'>
            <p>Loading visualization...</p>
            <div class="loader" style='
                border: 4px solid #f3f3f3;
                border-radius: 50%;
                border-top: 4px solid #3498db;
                width: 40px;
                height: 40px;
                animation: spin 1s linear infinite;
                margin: auto;'>
            </div>
        </div>
        <style>
            @keyframes spin {
                0% { transform: rotate(0deg); }
                100% { transform: rotate(360deg); }
            }
        </style>
    """
    return display(HTML(spinner_markup))

In [17]:
def range_of_messages(start_date, end_date):
    """Draw a pie chart of the average sentiment shares within a date range.

    The module-level ``df`` (indexed by message timestamp) is filtered to the
    inclusive range from ``start_date`` 00:00:00 to ``end_date`` 23:59:59. The
    means of ``positive_score``, ``negative_score`` and ``neutral_score`` over
    the selected rows become the three pie slices.

    Args:
        start_date: First day to include; anything ``pd.Timestamp`` accepts.
        end_date: Last day to include; anything ``pd.Timestamp`` accepts.

    Returns:
        None. Output is produced through display/print side effects. Any
        exception is caught and reported with debugging details instead of
        being raised.
    """
    try:
        # Spinner first, then schedule the output area to be cleared.
        # NOTE(review): with wait=True the clear happens as soon as the next output
        # arrives, i.e. on the HBox display below, so the spinner seems to be
        # replaced before the filtering/plotting starts — confirm this is intended.
        show_loading()
        clear_output(wait=True)

        # The date pickers were part of the cleared output, so put them back.
        picker_row = widgets.HBox([start_date_picker, end_date_picker])
        display(picker_row)

        # Expand the two days to whole-day bounds.
        start_date = pd.Timestamp(start_date).replace(hour=0, minute=0, second=0)
        end_date = pd.Timestamp(end_date).replace(hour=23, minute=59, second=59)

        # Rows whose timestamp falls inside the inclusive range.
        in_range = (df.index >= start_date) & (df.index <= end_date)
        stat_df = df[in_range]

        message_count = len(stat_df)
        if message_count == 0:
            print("No data available for selected date range")
            return

        # Slice label -> score column whose mean feeds that slice.
        score_columns = {
            'Positive': 'positive_score',
            'Negative': 'negative_score',
            'Neutral': 'neutral_score',
        }
        plot_df = pd.DataFrame({
            'Sentiment': list(score_columns),
            'Value': [stat_df[column].mean() for column in score_columns.values()],
        })

        # Title carries the period and the number of messages it covers.
        date_range_str = f'{start_date.date()} to {end_date.date()}'
        title = f'Sentiment Distribution<br>Period: {date_range_str}<br>Total Messages: {message_count}'

        fig = px.pie(plot_df, values='Value', names='Sentiment', title=title)
        fig.update_traces(textinfo='percent+label')

        legend_settings = {
            'orientation': "v",
            'yanchor': "bottom",
            'y': 0.5,
            'xanchor': "right",
            'x': 1.,
        }
        fig.update_layout(
            title_x=0.5,  # Center the title
            showlegend=True,
            legend=legend_settings,
        )

        fig.show()

    except Exception as e:
        # Report instead of raising so a failing callback does not break the widgets.
        print(f"An error occurred: {str(e)}")
        print(f"Debug info: start_date={start_date}, end_date={end_date}")
        print(f"DataFrame date range: {df.index.min()} to {df.index.max()}")

# Create date pickers with default values from the DataFrame
# df is indexed by message timestamp at this point, so min()/max() of the index
# give the first and last message; .date() drops the time of day.
default_start = df.index.min().date()
default_end = df.index.max().date()

# Both pickers are module-level names on purpose: range_of_messages() and
# on_date_change() look them up as globals.
start_date_picker = widgets.DatePicker(
    value=default_start,
    description='Start Date',
    style={'description_width': 'initial'}
)

end_date_picker = widgets.DatePicker(
    value=default_end,
    description='End Date',
    style={'description_width': 'initial'}
)

def on_date_change(change):
    """Observer callback: redraw the chart once both pickers hold a date.

    Args:
        change: Change notification passed by the widget; not inspected here.
    """
    if not (start_date_picker.value and end_date_picker.value):
        # A picker is empty (e.g. it was cleared); nothing to draw yet.
        return
    range_of_messages(start_date_picker.value, end_date_picker.value)

# Attach listeners
# The same callback serves both pickers; names='value' registers it for changes
# of each picker's 'value' attribute.
start_date_picker.observe(on_date_change, names='value')
end_date_picker.observe(on_date_change, names='value')

# Display widgets and initial chart
# NOTE(review): range_of_messages() calls clear_output(wait=True) and then displays
# the pickers itself, so the print and the HBox below appear to be replaced as soon
# as the first chart renders — confirm that is the intended behaviour.
print(f"Data available from {default_start} to {default_end}")
display(widgets.HBox([start_date_picker, end_date_picker]))
range_of_messages(default_start, default_end)

HBox(children=(DatePicker(value=datetime.date(2024, 11, 18), description='Start Date', step=1, style=Descripti…

In [18]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from IPython.display import clear_output

def analyze_contributors(top_n=5, min_messages=1):
    """Show a 2x2 dashboard of the most active and most positive/negative senders.

    Reads the module-level ``df`` and expects the columns ``sender``,
    ``positive_score``, ``negative_score`` and ``compound_score``. The panels are:

    * bar:  ``top_n`` senders by message count
    * bar:  ``top_n`` senders by mean ``positive_score``
    * pie:  message-count share among the ``top_n`` most active senders
    * bar:  ``top_n`` senders by mean ``negative_score``

    The same three rankings are printed below the figure.

    Args:
        top_n: How many senders each ranking shows. Defaults to 5.
        min_messages: Minimum number of messages a sender needs to be eligible
            for the positive/negative rankings. The default of 1 keeps every
            sender; a higher value stops senders with one or two messages from
            dominating rankings that are based on a mean.

    Returns:
        None. Output is produced through display/print side effects. Any
        exception is caught and printed instead of being raised.
    """
    try:
        # Show loading indicator
        show_loading()

        # Create figure first
        fig = make_subplots(
            rows=2, cols=2,
            subplot_titles=(
                f'Top {top_n} Contributors by Message Count',
                f'Top {top_n} Contributors by Positive Sentiment',
                'Message Count Distribution',
                f'Top {top_n} Contributors by Negative Sentiment'
            ),
            specs=[[{"type": "bar"}, {"type": "bar"}],
                  [{"type": "pie"}, {"type": "bar"}]]
        )

        # Calculate all data first
        top_contributors = df['sender'].value_counts().head(top_n)
        # Per-sender mean scores plus the number of messages behind each mean.
        sender_sentiments = df.groupby('sender').agg({
            'positive_score': 'mean',
            'negative_score': 'mean',
            'compound_score': 'mean',
            'sender': 'count'
        }).rename(columns={'sender': 'message_count'})

        # With the default min_messages=1 every sender stays eligible.
        eligible = sender_sentiments[sender_sentiments['message_count'] >= min_messages]
        top_positive = eligible.nlargest(top_n, 'positive_score')
        top_negative = eligible.nlargest(top_n, 'negative_score')

        # Clear previous output including loading indicator
        clear_output(wait=True)

        # Add all traces; row/col positions line up with subplot_titles above.
        fig.add_trace(
            go.Bar(x=top_contributors.index, y=top_contributors.values, name="Message Count"),
            row=1, col=1
        )

        fig.add_trace(
            go.Bar(x=top_positive.index, y=top_positive['positive_score'], name="Positive Score"),
            row=1, col=2
        )

        fig.add_trace(
            go.Pie(labels=top_contributors.index, values=top_contributors.values, name="Distribution"),
            row=2, col=1
        )

        fig.add_trace(
            go.Bar(x=top_negative.index, y=top_negative['negative_score'], name="Negative Score"),
            row=2, col=2
        )

        # Update layout
        fig.update_layout(
            height=800,
            showlegend=False,
            title_text="Chat Analysis Dashboard",
            title_x=0.5
        )

        # Update axes labels and traces (the pie panel has no axes to label)
        fig.update_xaxes(title_text="Contributor", row=1, col=1)
        fig.update_yaxes(title_text="Number of Messages", row=1, col=1)
        fig.update_xaxes(title_text="Contributor", row=1, col=2)
        fig.update_yaxes(title_text="Positive Score", row=1, col=2)
        fig.update_xaxes(title_text="Contributor", row=2, col=2)
        fig.update_yaxes(title_text="Negative Score", row=2, col=2)
        fig.update_traces(textinfo='percent+label', selector=dict(type='pie'))

        # Display everything at once
        # NOTE(review): sender values are printed verbatim; if they are phone numbers
        # or names, consider anonymising them before sharing the notebook.
        with pd.option_context('display.max_rows', None):
            fig.show()
            print("\nDetailed Statistics:")
            print(f"\nTop {top_n} Contributors by Message Count:")
            print(top_contributors)
            print(f"\nTop {top_n} Positive Contributors (Average Positive Score):")
            print(top_positive[['positive_score', 'message_count']])
            print(f"\nTop {top_n} Negative Contributors (Average Negative Score):")
            print(top_negative[['negative_score', 'message_count']])

    except Exception as e:
        print(f"An error occurred: {str(e)}")
# Run the analysis
analyze_contributors()


Detailed Statistics:

Top 5 Contributors by Message Count:
sender
+254 708 849873    199
+254 799 745234    112
+254 793 993063     74
+254 741 507177     55
+254 740 803454     52
Name: count, dtype: int64

Top 5 Positive Contributors (Average Positive Score):
                 positive_score  message_count
sender                                        
+254 727 059195           1.000              1
+254 724 877628           0.714              1
+254 113 554443           0.556              1
+254 701 349694           0.519              2
+254 743 286071           0.500              2

Top 5 Negative Contributors (Average Negative Score):
                 negative_score  message_count
sender                                        
+254 700 768251        0.423000              1
+254 715 405593        0.355000              1
+254 740 858839        0.333333              3
+254 742 724993        0.315000              1
+254 769 800274        0.274000              1
