In [1]:
import re
import pandas as pd
import datetime
import numpy as np
import matplotlib.pyplot as plt

def clean_whatsapp(whatsapp_txt):
    with open(whatsapp_txt,"r") as f:
        file = f.read()
    separation_pattern = re.compile(r'\d{2}/\d{2}/\d{2}, \d{2}:\d{2}')
    messages = pd.DataFrame(data = [re.split(separation_pattern,file)[1:],re.findall(separation_pattern,file)],index = ["text","date"])
    messages = messages.T
    prev = 0
    authors = []
    text = []
    date = []
    for i,item in messages.iterrows():
        #print(item)
        if ":" in item["text"]:
            author = item["text"].split(":")[0][3:]
            authors.append(author)
            text.append(item["text"].split(":")[1][1:-1])
            date.append(item["date"])
        else:
            authors.append("Server")
            text.append(item["text"][3:-1])
            date.append(item["date"])
    messages_cleaned = pd.DataFrame({"text":text,"author":authors,"date":pd.to_datetime(date)})
    return messages_cleaned

### NLTK cleaning

In [2]:
import nltk
import string
from nltk.corpus import stopwords

In [3]:
def text_process(mess):
    """
    Takes in a string of text, then performs the following:
    1. Remove all punctuation
    2. Remove all stopwords
    3. Returns a list of the cleaned text
    """
    # Check characters to see if they are in punctuation
    nopunc = [char.lower() for char in mess if char not in string.punctuation]

    # Join the characters again to form the string.
    nopunc = ''.join(nopunc)
    
    # Now just remove any stopwords
    return [word for word in nopunc.split() if word.lower() not in stopwords.words('spanish')]

In [4]:
df =  clean_whatsapp("WhatsApp Chat with Charlotte.txt")
df.head()

Unnamed: 0,text,author,date
0,Messages and calls are end-to-end encrypted. N...,Server,2020-07-24 00:06:00
1,"Comesa単a created group ""Charlotte""",Server,2019-01-04 00:15:00
2,You were added,Server,2019-01-04 00:15:00
3,Vais a ir a rivela ma単ana al final?,Evita,2020-07-23 21:35:00
4,Si tal puedo adelantar el curro y salir sobre ...,Evita,2020-07-23 21:35:00


In [5]:
#Guess author 
from sklearn.model_selection import train_test_split
from tensorflow.data import Dataset

df_guess = df[["text","author"]]
X_train, X_test, y_train, y_test = train_test_split(
    df_guess["text"].to_numpy(), df_guess["author"].to_numpy(), test_size=0.33, random_state=42)

labels_dic = {j:i for i,j in enumerate(y_train.unique())}
y_train.replace(labels_dic,inplace=True)
y_test.replace(labels_dic,inplace=True)
#train_dataset = Dataset.from_tensor_slices()

2022-07-13 17:54:39.623712: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-07-13 17:54:39.623747: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


AttributeError: 'numpy.ndarray' object has no attribute 'unique'

In [None]:
#Text vectorizer 
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization,Embedding
from tensorflow.keras import layers,models

text_vectorizer = TextVectorization(max_tokens = None,#how many words
                                    standardize = "lower_and_strip_punctuation",
                                    split = "whitespace",
                                    ngrams = None,
                                    output_mode = "int",
                                    output_sequence_length = None,
                                    pad_to_max_tokens = None)
text_vectorizer.adapt(df_guess["text"])
#Embedding
embedding = Embedding(input_dim = len(text_vectorizer.get_vocabulary()),
                            output_dim = 128,
                            input_length = 15)
#Model 
token_input = layers.Input(shape=(1,),dtype=tf.string)
token_vectors = text_vectorizer(token_input)
token_embeddings = embedding(token_vectors)
x = layers.Dense(1,activation="softmax")(token_embeddings)

model_guess = models.Model(inputs = token_input, outputs = x)

model_guess.compile(loss=tf.keras.losses.CategoricalCrossentropy(),
                   optimizer= tf.keras.optimizers.Adam(),
                   metrics = ["accuracy"])

In [None]:
X_train.values

In [None]:
model_guess.fit(x = X_train,
               y = y_train,
               epochs = 100)

### Vectorization

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
bow_transformer = CountVectorizer(analyzer=text_process).fit(messages_cleaned["text"])

In [None]:
messages_bow = bow_transformer.transform(messages_cleaned["text"])

In [None]:
sparsity = (100.0 * messages_bow.nnz / (messages_bow.shape[0] * messages_bow.shape[1]))
print('sparsity: {}'.format(round(sparsity,3)))

### TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

In [None]:
tfidf_transformer = TfidfTransformer().fit(messages_bow)
messages_tfidf = tfidf_transformer.transform(messages_bow)

In [None]:
print(tfidf_transformer.idf_[bow_transformer.vocabulary_['hierba']])
print(tfidf_transformer.idf_[bow_transformer.vocabulary_['bbeas']])

### Analysis pipeline

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

In [None]:
msg_train, msg_test, auth_train, auth_test = train_test_split(messages_cleaned["text"], messages_cleaned["author"], test_size=0.33, random_state=42)
text_pipeline = Pipeline([
    ("Vectorize",CountVectorizer(analyzer=text_process)),
    ("tfidf",TfidfTransformer()),
    ("classifier",MLPClassifier())
])

In [None]:
text_pipeline.fit(msg_train,auth_train)

In [None]:
predictions = text_pipeline.predict(msg_test)

In [None]:
from sklearn.metrics import classification_report
print (classification_report(auth_test, predictions))