<a href="https://colab.research.google.com/github/Lindronics/WhatsApp_analysis/blob/master/WhatsApp_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# WhatsApp chat protocol analysis

This notebook demonstrates a model for author classification of WhatsApp text messages.

First, if necessary, install all dependencies.

In [0]:
# !pip install pandas
# !pip install spacy
# !pip install nltk
# !pip install sklearn
!pip install eli5
!pip install emoji

In [0]:
import time
import pandas as pd
from emoji import demojize

## Load WhatsApp chat protocol

Load a WhatsApp chat protocol into the notebook.
This is the raw file that gets created when exporting a conversation in WhatsApp.

From local file system...

In [0]:
# TODO: loading the export from the local file system is not implemented yet.
# The disabled sketch below uploads files through Colab and only prints them;
# nothing here builds the `raw_protocol` list that the following cells use.

# from google.colab import files
# uploaded_files = files.upload()

# for name, file in uploaded_files.items():
#     print(name, file)

... or from Google Drive.

In [0]:
import re


from google.colab import drive
drive.mount('/content/gdrive')


path = "/content/gdrive/My Drive/Analysis/WhatsApp/"
filename = input("Name of file: ")


# Expected line layout: "<timestamp> - <author>: <body>".
# * Raw string literal, so the pattern contains no invalid string escapes.
# * Compiled once instead of once per line.
# * The timestamp group is lazy: with a greedy group, a body that itself
#   contains " - something: " would be split at the LAST such occurrence and
#   most of the line would end up in the timestamp column.
line_pattern = re.compile(r"(.+?) - (.+?): (.*)")


# Parse the export into [timestamp, author, body] rows
raw_protocol = list()
# Explicit UTF-8: message bodies contain emoji, so do not depend on the
# platform's default encoding.
with open(path + filename, 'r', encoding='utf-8') as f:

    for line in f:
        match = line_pattern.match(line)
        if match is None:
            # Lines without a "<timestamp> - <author>: " header are skipped
            # (e.g. notices without an author, or what appear to be
            # continuation lines of multi-line messages).
            continue

        timestamp, author, body = match.groups()
        # Replace emoji characters by their textual ":name:" form
        raw_protocol.append([timestamp, author, demojize(body)])

### Process into Pandas DataFrame

In [0]:
# Build the frame with its column labels in a single step. Assigning
# `.columns` afterwards fails with a length mismatch when `raw_protocol` is
# empty (no line matched); this way an empty three-column frame is produced
# and the later cells fail with a clearer message.
protocol = (
    pd.DataFrame(raw_protocol, columns=["timestamp", "author", "body"])
    .dropna()
    .reset_index(drop=True)
)


# Print some information about the data
# (`DataFrame.size` is the number of cells, i.e. rows x columns)
print("Size: ", protocol.size)
protocol.head(5)

In [0]:
# Number of messages per author, which also shows the class balance
protocol["author"].value_counts()

### Split into train and test

In [0]:
from sklearn.model_selection import train_test_split


# Fraction of the messages held out for evaluation
split = 0.15

# Message bodies are the inputs, authors are the labels. The split is
# shuffled, so a fixed `random_state` is needed to get the same train/test
# partition (and therefore comparable scores) on every run.
X_train, X_test, y_train, y_test = train_test_split(protocol["body"],
                                                    protocol["author"],
                                                    test_size=split,
                                                    shuffle=True,
                                                    random_state=42)

## Vectorization and Classification

### Helper class for extracting POS tags

In [0]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import spacy
    
    
class POSSelector(BaseEstimator, TransformerMixin):
    """Replace every text by the sequence of its part-of-speech tags.

    Each input text is run through the spaCy model and turned into a single
    string holding the `token.pos_` value of every token, separated by
    spaces, so that a text vectorizer can be applied to the result.

    Attributes:
        feature_names: set of all tags seen by `transform` so far.
        nlp: the loaded spaCy pipeline (without the 'parser' component).
    """

    def __init__(self):
        self.feature_names = set()
        self.nlp = spacy.load("en_core_web_sm")
        # The parser component is removed; only token.pos_ is read below.
        self.nlp.remove_pipe('parser')
        print("Spacy model loaded.")

    def fit(self, x, y=None):
        """No-op; present so the class can be used as a pipeline step.

        Returns:
            self
        """
        return self

    def transform(self, df):
        """Map each text to a space-separated string of its POS tags.

        Args:
            df: the texts to tag. Objects offering `.apply` (such as a
                pandas Series) are mapped with it, as before; any other
                iterable of strings (list, NumPy array, ...) is accepted
                as well.

        Returns:
            The result of `df.apply(...)` when `df` offers `.apply`,
            otherwise a list with one tag string per input text.

        Side effects:
            Every tag encountered is added to `self.feature_names`.
        """

        def get_pos(doc):
            tags = [token.pos_ for token in self.nlp(doc)]
            self.feature_names.update(tags)
            return " ".join(tags)

        if hasattr(df, "apply"):
            return df.apply(get_pos)
        # Plain sequences have no `.apply`; tag them one by one instead of
        # raising an AttributeError.
        return [get_pos(doc) for doc in df]

    def get_feature_names(self):
        """Return the tags seen so far as a list.

        The list is sorted so that its order is the same on every call and
        every run (iteration order of a set of strings is not).
        """
        return sorted(self.feature_names)
    

### Define classification pipelines

In [0]:
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.dummy import DummyClassifier
from sklearn.svm import LinearSVC


vectorizer_params = {
    "ngram_range": (1, 3),
}

# Kept as module-level names because their vocabularies are read again when
# the learned weights are inspected.
token_vec = TfidfVectorizer(**vectorizer_params)
pos_vec = TfidfVectorizer()


def build_pipeline(classifier, token_vectorizer, pos_vectorizer):
    """Assemble the feature extraction + classification pipeline.

    The features are the union of
      * "tokens":   `token_vectorizer` applied to the raw text, and
      * "pos_tags": `pos_vectorizer` applied to the POS-tag strings
                    produced by a fresh `POSSelector`.

    Args:
        classifier: estimator used as the final "cla" step.
        token_vectorizer: vectorizer for the raw text.
        pos_vectorizer: vectorizer for the POS-tag strings.

    Returns:
        An unfitted `Pipeline` with the steps "features" and "cla".
    """
    return Pipeline([
        ("features", FeatureUnion([
            ("tokens", Pipeline([
                ("vec", token_vectorizer),
            ])),
            ("pos_tags", Pipeline([
                ("select", POSSelector()),
                ("vec", pos_vectorizer),
            ]))
        ])),
        ("cla", classifier),
    ])


# Define classification pipeline
# (seeded so that repeated runs give the same model)
model = build_pipeline(LinearSVC(random_state=42), token_vec, pos_vec)


# Dummy model as baseline. The "stratified" strategy draws random labels, so
# it is seeded as well; otherwise the baseline scores change on every run.
dummy_model = build_pipeline(
    DummyClassifier(strategy="stratified", random_state=42),
    TfidfVectorizer(**vectorizer_params),
    TfidfVectorizer(),
)

In [0]:
# Wall-clock timestamp taken before training starts
t1 = time.time()


# Train the real classifier first, then the baseline, on the same data
for estimator in (model, dummy_model):
    estimator.fit(X_train, y_train)


# Report how long both fits took together
elapsed = time.time() - t1
print("Elapsed: %f seconds" % elapsed)

## Evaluation

In [0]:
from sklearn.metrics import classification_report


# Score the trained classifier on the held-out messages
print("Test data")
y_pred = model.predict(X_test)
test_report = classification_report(y_test, y_pred)
print(test_report)


# Disabled: the same report for the training split
# print("Train data")
# train_predicted_labels = model.predict(X_train)
# print(classification_report(y_train, train_predicted_labels))

In [0]:
from sklearn.metrics import classification_report


# Same report for the baseline, as a reference point for the scores above
print("Test data")
y_pred = dummy_model.predict(X_test)
baseline_report = classification_report(y_test, y_pred)
print(baseline_report)

In [0]:
import eli5


def _vocabulary(vectorizer):
    """Return the fitted vectorizer's feature names as a plain list.

    Newer scikit-learn releases provide `get_feature_names_out()` and no
    longer have `get_feature_names()`; older releases only have the latter.
    The result is converted to a list because `get_feature_names_out()`
    returns an array, and `+` on arrays would not concatenate.
    """
    if hasattr(vectorizer, "get_feature_names_out"):
        return list(vectorizer.get_feature_names_out())
    return list(vectorizer.get_feature_names())


# Token features come first in the feature union, followed by the POS-tag
# features, so the names are concatenated in that order.
features = _vocabulary(token_vec) + _vocabulary(pos_vec)
eli5.show_weights(model.named_steps["cla"], feature_names=features, top=40)

## Playground (predict an author)

In [0]:
from eli5.lime import TextExplainer
te = TextExplainer(random_state=42)


def predict_author_proba(texts):
    """Return one row of per-author scores in [0, 1] for every text.

    The final step of `model` is a `LinearSVC`, which has no
    `predict_proba`. The decision-function margins are therefore pushed
    through a softmax. The rows sum to 1 and keep the ranking of the
    margins, but they are NOT calibrated probabilities.

    Args:
        texts: iterable of message strings (list, array or Series).

    Returns:
        Array of shape (number of texts, number of authors); columns follow
        the order of the classifier's `classes_`.
    """
    # The POS step of the pipeline is fed a Series, whatever sequence type
    # the caller hands in.
    scores = np.asarray(model.decision_function(pd.Series(list(texts))))
    if scores.ndim == 1:
        # Two authors: a single margin per text, positive meaning the second
        # class. Expand it to one column per class.
        scores = np.column_stack([-scores, scores])
    # Subtract the row maximum before exponentiating to avoid overflow
    exp_scores = np.exp(scores - scores.max(axis=1, keepdims=True))
    return exp_scores / exp_scores.sum(axis=1, keepdims=True)


# Get message to predict
input_message = input("Message to predict: ")


# Convert to Series, so it can be input into the pipeline
input_df = pd.Series([input_message])


# `model` is the fitted pipeline (the name `pipeline` is not defined anywhere
# in this notebook). The printed value is the softmax score described above.
print("This message is by %s with a probability of %f.\n" % (
    model.predict(input_df)[0],
    max(predict_author_proba(input_df)[0])
))


# Explain the prediction with a local surrogate model
te.fit(input_message, predict_author_proba)
te.show_prediction()