# AutoCorrectusing Facebook Data

## Creating vocabulary

### Import Libraries

In [1]:
import json
import re
import os
from collections import Counter

BASE_DIR = "inbox"

#### Initialize word counter

In [2]:
word_count = Counter()

#### Define text cleaner
Perform following tasks:
- Strip special escape characters
- remove most common punctuations
- Remove exra spaces
- Remove links and emojis (done in next function)

In [3]:
def clean_text(text):
    text = text.strip()
    text = re.sub("([.,!?():;])", r' ', text)
    text = text.replace("\n", "")
    return re.sub(r' +', r' ', text)

#### Read files function
Reads all the chat in the json files using the json module. After reading the file, updates the word counter for each word using the global counter.update method.

In [4]:
def read_file_content(content):
    temp_contents = []
    for m in content:
        content = m.get("content", None)
        sender = "Sahil Aggrawal"
        s = m.get("sender_name", "Default")

        if content and s == sender:
            text = content.encode('ascii', 'ignore').decode('ascii')
            html = re.sub(r"https?://.+", r"", text)
            if len(html.split()) > 2:
                temp_contents.append(clean_text(html.lower().strip()))

    all_words = [word for sent in temp_contents for word in sent.split()]
    word_count.update(all_words)

#### Create Vocabulary
Loops through all the files in the folder and read all chats creating vocabulary.

In [5]:
chat_folders = os.listdir(BASE_DIR)
for i, chat in enumerate(chat_folders):
    files = os.listdir(os.path.join(BASE_DIR, chat))
    msg_files = list(filter(lambda x: x.startswith("message"), files))
    for msg_file in msg_files:
        with open(os.path.join(BASE_DIR, chat, msg_file), "r") as f:
            data = json.load(f)
        if len(data['participants']) == 2:
            read_file_content(data['messages'])

Calculate total number of words in whole of the corpus (Not Counting Unique Occurences)

In [6]:
total_words = sum(word_count.values())

In [7]:
print("Total number of words : {}\nTotal Number of words in Vocabulary : {}".format(total_words, len(word_count)))

Total number of words : 466274
Total Number of words in Vocabulary : 20772


Calculate probability of occurrence of each word in the given corpus.

In [8]:
prob_dict = {}
for word, count in word_count.items():
    prob_dict[word] = count / total_words

In [9]:
print("Total Number of words in Probability Dictionary : {}".format(len(prob_dict)))

Total Number of words in Probability Dictionary : 20772
