# AutoCorrectusing Facebook Data

## Creating vocabulary

### Import Libraries

In [1]:
import json
import re
import os
from collections import Counter

BASE_DIR = "inbox"

#### Define text cleaner
Perform following tasks:
- Strip special escape characters
- remove most common punctuations
- Remove exra spaces
- Remove links and emojis (done in next function)

In [2]:
def clean_text(text):
    text = text.strip().lower()
    text = text.encode('ascii', 'ignore').decode('ascii')
    text = re.sub(r"https?://.+", r"", text)
    text = re.sub("([.,!?():;])", r' ', text)
    text = text.replace("\n", "")
    return re.sub(r' +', r' ', text)

In [3]:
chat_folders = os.listdir(BASE_DIR)

all_sentences = []

for i, chat in enumerate(chat_folders):
    files = os.listdir(os.path.join(BASE_DIR, chat))
    msg_files = list(filter(lambda x: x.startswith("message"), files))
    for msg_file in msg_files:
        with open(os.path.join(BASE_DIR, chat, msg_file), "r") as f:
            data = json.load(f)
        if len(data['participants']) == 2:

            msgs = data['messages']

            for msg in msgs:
                if msg.get('content', None) and msg['sender_name'] == 'Sahil Aggrawal':
                    sentence = clean_text(msg['content']).split()
                    if len(sentence) > 3:
                        all_sentences.append(sentence)

    if (i + 1) % 60 == 0 :
        print(f"Processed {i+1} chats")

Processed 60 chats
Processed 120 chats
Processed 180 chats
Processed 240 chats
Processed 300 chats
Processed 360 chats
Processed 420 chats
Processed 480 chats
Processed 540 chats
Processed 600 chats
Processed 660 chats


In [4]:
len(all_sentences)

53222

In [5]:
import random
random.seed(101)

random.shuffle(all_sentences)

train_size = int(len(all_sentences) * 0.8)

train_data = all_sentences[:train_size]
test_data = all_sentences[train_size:]

In [6]:
print(f"Length of Train set : {len(train_data)}\nLength of Test set : {len(test_data)}")

Length of Train set : 42577
Length of Test set : 10645


In [7]:
word_count = Counter()

for sent in train_data:
    word_count.update(sent)

In [8]:
vocab = dict(word_count.most_common(5000))

In [9]:
def preprocess(data, vocab):
    processed_sent = []
    for i, sent in enumerate(data):
        s = []
        for j, token in enumerate(sent):
            if token not in vocab.keys():
                s.append("<UNK>")
            else:
                s.append(token)

        processed_sent.append(s)

    return processed_sent

In [10]:
train = preprocess(train_data, vocab)
test = preprocess(test_data, vocab)

In [11]:
import pickle

with open("data_and_vocab.p", "wb") as f:
    pickle.dump([train, test, vocab], f)