<a href="https://colab.research.google.com/github/Lindronics/WhatsApp_analysis/blob/master/WhatsApp_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# WhatsApp chat log ("protocol") analysis


In [0]:
import time

## Load WhatsApp chat protocol

In [0]:
# from google.colab import files
# uploaded_files = files.upload()

### Load from Google Drive

In [0]:
from google.colab import drive
# Mount Google Drive at /content/gdrive; the loading cell below reads the chat
# export from a path underneath this mount point.
drive.mount('/content/gdrive')

In [0]:
import re

# One exported message per line, shaped "<timestamp> - <author>: <body>".
# Both the timestamp and the author group are non-greedy, so the line is cut at
# the FIRST " - " and the first ": " after it. With a greedy timestamp group a
# body such as "see - note: x" would be split at the last " - " and the message
# would be attributed to a non-existent author "note".
# Compiled once (raw string) instead of once per line.
MESSAGE_PATTERN = re.compile(r"(.+?) - (.+?): (.*)")

filename = input("Name of file: ")
raw_protocol = list()
# 'utf-8-sig' decodes UTF-8 and strips a leading byte-order mark if present, so
# a BOM cannot end up inside the first timestamp.
# NOTE(review): assumes the export is UTF-8 encoded - confirm for your device.
with open('/content/gdrive/My Drive/Analysis/WhatsApp/%s' % filename, 'r', encoding='utf-8-sig') as f:
    for line in f:
        match = MESSAGE_PATTERN.match(line)
        if match is None:
            # Lines without the "<timestamp> - <author>: " prefix are skipped;
            # they carry no author label to learn from.
            continue
        # [timestamp, author, body] - the order the DataFrame cell expects.
        raw_protocol.append(list(match.groups()))

### Process into Pandas DataFrame

In [0]:
import pandas as pd

# Build the frame from the parsed rows and discard every row that contains a
# missing value, then label the three columns.
protocol = pd.DataFrame(raw_protocol).dropna()
protocol.columns = ["timestamp", "author", "body"]

# DataFrame.size is the number of cells (rows x columns), not the row count.
print("Size: ", protocol.size)
protocol.head()

In [0]:
# Number of messages per author, most frequent first.
protocol["author"].value_counts()

### Split into train and test sets

In [0]:
# Row position at which the frame is cut: the first 80% of the rows (in file
# order, no shuffling) are used for training, the remainder for testing.
train_test_split = int(0.8 * len(protocol))

train_protocol = protocol.iloc[:train_test_split]
test_protocol = protocol.iloc[train_test_split:]

# The author column is the classification target.
train_labels = train_protocol["author"].tolist()
test_labels = test_protocol["author"].tolist()

## Tokenize

In [0]:
import spacy
# Load the small English spaCy model and remove the pipeline components that
# the functions below do not call on.
nlp = spacy.load("en_core_web_sm")
for _unused_component in ("tagger", "parser"):
    nlp.remove_pipe(_unused_component)

def tokenize(s):
    """Run the string ``s`` through the module-level ``nlp`` pipeline.

    Returns whatever ``nlp(s)`` returns (the tokenized document).
    """
    doc = nlp(s)
    return doc

def normalize(tokens):
    """Return the lower-cased text of every alphabetic or numeric token.

    Args:
        tokens: Iterable of token objects exposing ``is_alpha``, ``is_digit``
            and ``text`` (for example the document produced by ``tokenize``).

    Returns:
        list of str: ``token.text.lower().strip()`` for each token whose
        ``is_alpha`` or ``is_digit`` flag is true, in input order. Every other
        token (punctuation, whitespace, mixed symbols) is left out.
    """
    # No try/except here: lower()/strip() cannot fail on a string, and the
    # former bare ``except:`` also swallowed KeyboardInterrupt / SystemExit,
    # which made a long-running tokenization impossible to interrupt.
    return [
        token.text.lower().strip()
        for token in tokens
        if token.is_alpha or token.is_digit
    ]

def tokenize_normalize(s):
    """Tokenize the string ``s`` and return its normalized token texts.

    Equivalent to calling ``tokenize`` and feeding the result to ``normalize``.
    """
    tokens = tokenize(s)
    return normalize(tokens)

## Pipeline

In [0]:
from sklearn.base import BaseEstimator, TransformerMixin

class ItemSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that picks one entry out of keyed data.

    ``transform`` returns ``data[key]``, so the step can be placed at the start
    of a pipeline to hand a single column (or dict entry) to the next step.
    """

    def __init__(self, key):
        # Key looked up in the data passed to transform().
        self.key = key

    def fit(self, x, y=None):
        """Nothing is learned; returns the selector itself."""
        return self

    def transform(self, data_dict):
        """Return the entry of ``data_dict`` stored under ``self.key``."""
        selected = data_dict[self.key]
        return selected

In [0]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Three stages: pick the "body" column, turn the text into TF-IDF features,
# then classify with logistic regression.
# NOTE(review): TfidfVectorizer is created with its default settings, so the
# spaCy-based `tokenize_normalize` defined earlier is not used - confirm that
# this is intended.
pipeline = Pipeline(steps=[
    ("selector", ItemSelector(key="body")),
    ("vectorizer", TfidfVectorizer()),
    ("classifier", LogisticRegression()),
])

In [0]:
# Fit the whole pipeline on the training rows and report how long it took.
t1 = time.time()
pipeline.fit(train_protocol, train_labels)
elapsed_seconds = time.time() - t1

print("Elapsed: %f seconds" % elapsed_seconds)

## Evaluation

In [0]:
from sklearn.metrics import classification_report

# Predict an author for every held-out message and print the per-author
# classification report against the true authors.
predicted_labels = pipeline.predict(test_protocol)
report = classification_report(test_labels, predicted_labels)
print(report)

## Playground (predict an author)

In [0]:
# Ask for a message to classify
input_message = input("Message to predict: ")

# The pipeline's first step selects the "body" column, so wrap the message in
# a one-row DataFrame that has exactly that column.
input_df = pd.DataFrame({"body": [input_message]})

# Most likely author and the highest class probability for this message
predicted_author = pipeline.predict(input_df)[0]
top_probability = max(pipeline.predict_proba(input_df)[0])

print("This message is by %s with a probability of %f." % (predicted_author, top_probability))