<a href="https://colab.research.google.com/github/Lindronics/WhatsApp_analysis/blob/master/WhatsApp_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# WhatsApp chat protocol analysis

This notebook demonstrates a model for author classification of WhatsApp text messages.

First, if necessary, install all dependencies.

In [0]:
# !pip install pandas
# !pip install spacy
# !pip install nltk
# !pip install scikit-learn
!pip install eli5

In [0]:
import time
import pandas as pd

## Load WhatsApp chat protocol

Load a WhatsApp chat protocol (i.e. the exported chat log) into the notebook.
This is the raw text file that WhatsApp creates when you export a conversation.

From local file system...

In [0]:
# TODO

# from google.colab import files
# uploaded_files = files.upload()

# for name, file in uploaded_files.items():
#     print(name, file)

... or from Google Drive.

In [0]:
import re

from google.colab import drive
drive.mount('/content/gdrive')

path = "/content/gdrive/My Drive/Analysis/WhatsApp/"
filename = input("Name of file: ")

# A parsable line has the form "<timestamp> - <author>: <body>".
# The raw string avoids the invalid "\-" escape sequence, and compiling the
# pattern once keeps that work out of the per-line loop.
line_pattern = re.compile(r"(.+) - (.+?): (.*)")

# Open file with specified name
raw_protocol = list()
# Explicit encoding so that reading does not depend on the platform default
# (assumes the export is UTF-8 encoded -- TODO confirm for your export).
with open(path + filename, 'r', encoding='utf-8') as f:

    # For each line, split into timestamp, author and message body
    for line in f:
        match = line_pattern.search(line)
        # Lines that do not match the pattern contribute an empty row, which
        # becomes an all-missing row once loaded into a DataFrame.
        fields = list(match.groups()) if match else []
        raw_protocol.append(fields)

### Process into Pandas DataFrame

In [0]:
# One row per parsed line; rows with missing fields are discarded.
protocol = pd.DataFrame(raw_protocol)
protocol.columns = ["timestamp", "author", "body"]
protocol = protocol.dropna()

# Shuffle the rows. The fixed random_state makes the shuffle (and therefore any
# positional train/test split derived from it) reproducible between runs.
protocol = protocol.sample(frac=1, random_state=42).reset_index(drop=True)

# Print some information about the data.
# len() gives the number of messages (rows); DataFrame.size would instead
# count every cell, i.e. rows multiplied by the three columns.
print("Size: ", len(protocol))
protocol.head(5)

In [0]:
# Show how many messages each author contributed
protocol["author"].value_counts()

### Split into train and test

In [0]:
# Row position separating the first 90% (training) from the remaining 10% (test)
train_test_split = int(0.9 * len(protocol))

# Positional slices on either side of the boundary
train_protocol = protocol.iloc[:train_test_split]
test_protocol = protocol.iloc[train_test_split:]

# The author column provides the class labels
train_labels = train_protocol["author"].tolist()
test_labels = test_protocol["author"].tolist()

## Tokenizer

In [0]:
import spacy
# Load the small English model, then drop the pipeline components listed below
nlp = spacy.load("en_core_web_sm")
for component in ("tagger", "parser"):
    nlp.remove_pipe(component)

def tokenize(s):
    """Run the module-level ``nlp`` pipeline on ``s`` and return its result."""
    doc = nlp(s)
    return doc

def normalize(tokens):
    """Return the lower-cased, stripped text of every token worth keeping.

    A token is kept when it is not a stop word and is either purely
    alphabetic or purely numeric, judged by its ``is_stop``, ``is_alpha``
    and ``is_digit`` attributes. Order of the kept tokens is preserved.
    """
    return [
        token.text.lower().strip()
        for token in tokens
        if not token.is_stop and (token.is_alpha or token.is_digit)
    ]

def tokenize_normalize(s):
    """Tokenize ``s`` with ``tokenize`` and pass the tokens through ``normalize``."""
    tokens = tokenize(s)
    return normalize(tokens)

## Vectorization and Classification

In [0]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

class ItemSelector(BaseEstimator, TransformerMixin):
    """Transformer that picks a single entry (e.g. a DataFrame column) by key.

    Parameters
    ----------
    key : hashable
        Key used to index the data passed to ``transform``.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, data_dict):
        """Return ``data_dict[self.key]``."""
        selected = data_dict[self.key]
        return selected
    
class LengthSelector(BaseEstimator, TransformerMixin):
    """Transformer that yields the string length of each value under a key.

    Parameters
    ----------
    key : hashable
        Key used to index the data passed to ``transform``; the selected
        object must offer a ``.str.len()`` accessor (as pandas string
        columns do).
    """

    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, data_dict):
        """Return the per-value lengths as a single-column 2-D array."""
        per_value_lengths = data_dict[self.key].str.len()
        # reshape(-1, 1) turns the flat vector into one feature column
        return np.array(per_value_lengths).reshape(-1, 1)

In [0]:
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import KBinsDiscretizer

# Keyword arguments forwarded to TfidfVectorizer: n-grams of length 1 to 3.
# The custom tokenizer is currently switched off; uncomment its entry to use it.
vectorizer_params = dict(
    # tokenizer=tokenize_normalize,
    ngram_range=(1, 3),
)

# Stage 1: pull the "body" entry out of the incoming data
selector = Pipeline(steps=[("selector", ItemSelector("body"))])

# Stage 2: TF-IDF features followed by a logistic-regression classifier
model = Pipeline(
    steps=[
        ("vectorizer", TfidfVectorizer(**vectorizer_params)),
        ("classifier", LogisticRegression()),
    ]
)

# Full pipeline: selection, then vectorization and classification
pipeline = make_pipeline(selector, model)

In [0]:
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# durations; time.time() follows the wall clock, which may be adjusted mid-run.
t1 = time.perf_counter()

# Train the whole pipeline on the training rows and their author labels
pipeline.fit(train_protocol, train_labels)

print("Elapsed: %f seconds" % (time.perf_counter() - t1))

## Evaluation

In [0]:
from sklearn.metrics import classification_report

# Predict authors for the held-out rows and compare them with the true labels
predicted_labels = pipeline.predict(test_protocol)
report = classification_report(test_labels, predicted_labels)
print(report)

In [0]:
import eli5

fitted_vectorizer = model.named_steps["vectorizer"]

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use the new method when it exists and fall back to
# the old one on older installations. The result is converted to a list so
# that `features` has the same type either way.
if hasattr(fitted_vectorizer, "get_feature_names_out"):
    features = list(fitted_vectorizer.get_feature_names_out())
else:
    features = fitted_vectorizer.get_feature_names()

# Display the 30 highest-weighted features of the trained classifier
eli5.show_weights(model.named_steps["classifier"], vec=fitted_vectorizer, feature_names=features, top=30)

## Playground (predict an author)

In [0]:
from eli5.lime import TextExplainer
# Explainer created with a fixed random_state
te = TextExplainer(random_state=42)

# Ask for the message whose author should be guessed
input_message = input("Message to predict: ")

# The full pipeline indexes its input by "body", so wrap the text in a one-row DataFrame
input_df = pd.DataFrame({"body": [input_message]})

# Most likely author and the highest class probability for this message
predicted_author = pipeline.predict(input_df)[0]
top_probability = max(pipeline.predict_proba(input_df)[0])
print("This message is by %s with a probability of %f.\n" % (predicted_author, top_probability))

# The explainer is given the raw string and the inner model's predict_proba
te.fit(input_message, model.predict_proba)
te.show_prediction()