# Email category prediction

In [None]:
# import supporting functions from python file
import warnings
warnings.filterwarnings("ignore", category=FutureWarning, module="pandas")

import seaborn as sns
import pandas as pd
import numpy as np

import supporting_modules.supp_functions as sfn
import string

In [None]:
# import dataset

from sklearn.datasets import fetch_20newsgroups

## Fetch the training and test data

In [None]:
# Load the predefined "train" and "test" subsets of the 20 newsgroups dataset.
train_data = fetch_20newsgroups(subset="train")
test_data = fetch_20newsgroups(subset="test")

### Checking the size of each category

In [None]:
# Show which categories the dataset contains and how many there are.
# The cell's value is a 3-tuple: the names, a separator line, and the count.

f"{train_data.target_names}", "_" * 30, f"Number of categories: {len(train_data.target_names)}"

In [None]:
# Show a count plot to compare the number of emails in each category.

# One row per training email, holding its numeric category id.
labels = pd.DataFrame(train_data.target, columns=["label"])
sns.set_theme(style="whitegrid")

# Horizontal bars: one bar per category id, bar length = number of emails.
sns.countplot(data = labels, y = "label");

In [None]:
# Differences between the categories:
# compare the most numerous and the least numerous groups.

# Count the emails per category id. np.bincount sizes the result from the
# dataset itself instead of relying on a hard-coded list of 20 category ids.
category_counts = np.bincount(train_data.target, minlength=len(train_data.target_names))

# Plain-int dict {category id: number of emails}; later cells read `cnt_dict`.
cnt_dict = {category_id: int(count) for category_id, count in enumerate(category_counts)}
as_list = list(cnt_dict.values())

# Sort once and reuse the result for both ends of the ranking.
sorted_counts = sorted(as_list)

print("count of each category:", cnt_dict)
print("--"*50)
print("smallest categories count: ", sorted_counts[:4])
print("highest categories: ", sorted_counts[-4:])
print("median = ", np.median(as_list), "| mean = ", np.mean(as_list))

In [None]:
# Differences between the categories:
# compare the most numerous and the least numerous groups.

# Count the emails per category id. np.bincount sizes the result from the
# dataset itself instead of relying on a hard-coded list of 20 category ids.
category_counts = np.bincount(train_data.target, minlength=len(train_data.target_names))

# Plain-int dict {category id: number of emails}; later cells read `cnt_dict`.
cnt_dict = {category_id: int(count) for category_id, count in enumerate(category_counts)}

# Counts as an array, position i holding the count of category id i.
as_array = np.array(list(cnt_dict.values()))

print("count of each category:\n", cnt_dict)
print("*-" * 70)
# Use the array built in this cell (not `as_list` from the previous cell),
# so the cell does not depend on another cell having been run first.
print("median = ", np.median(as_array), "| mean = ", np.mean(as_array))

### Differences between quantities per each category

As the table below shows, some categories contain noticeably fewer emails than the mean. These categories could be under-represented during the training of the models, so a model may focus more on the other, more numerous categories.

The "religion" category has the smallest representation in the training dataset: it is almost 200 emails below the mean quantity. The next smallest groups are "politics" and "atheism". It may be worth considering methods that increase the size of these classes (oversampling) or reduce the size of the remaining ones (undersampling).

In [None]:
# Category ids ordered from the least to the most numerous category.
quantities_sorted = np.argsort(as_array)

# Name and size of every category, listed in that same order.
category_names = [train_data.target_names[category_id] for category_id in quantities_sorted]
category_sizes = [cnt_dict[category_id] for category_id in quantities_sorted]

# Table indexed by category id, sorted by ascending category size.
pd.DataFrame(
    {"category_name": category_names, "quantity": category_sizes},
    columns=["category_name", "quantity"],
    index=quantities_sorted,
)

Let's look at how many emails each dataset (train, test) contains. The split is roughly 60% train / 40% test.

In [None]:
# Number of emails in each subset and the share of all emails in the test set.
n_train = len(train_data.data)
n_test = len(test_data.data)

n_train, n_test, n_test / (n_train + n_test)

## step 1: Text data cleanup

Let's look at an example email. As can be seen, it contains many elements that would add noise if they were not removed.

In [None]:
# Let's look at what a raw email looks like.
import random

# Randomly pick one training email and show it.
# random.randrange() excludes its upper bound, so the drawn index is always
# valid for the list, whatever its length. random.randint() includes its upper
# bound, so a hard-coded bound equal to the number of emails could produce an
# index one past the end of the list.

email_no = random.randrange(len(train_data.data))
new_txt = train_data.data[email_no]

print(email_no)
print(new_txt)

## Remove header info

Some elements repeat in a huge number of messages. They are header fields such as "from: ...", "subject: ...", etc. They carry no informative value and should be removed. This can be done, for example, like this:

In [None]:
# Start from the raw text of the sampled email (`new_txt`).
email_text = new_txt

# remove most common header components

def is_header_tag_in(tags: tuple, txt: str) -> bool:
    """Return True if any of *tags* occurs in *txt*, ignoring the case of *txt*.

    Args:
        tags: Substrings to look for. They are compared as given against the
            lower-cased text, so they should already be lower case.
        txt: Text (for example one line of an email) to inspect.

    Returns:
        True if at least one tag is a substring of the lower-cased text,
        otherwise False (also when *tags* is empty).

    Note:
        This is a substring test, not a prefix test: a tag such as "re:"
        also matches in the middle of a line (e.g. "they are: ...").
    """
    # Lower-case once instead of once per tag.
    lowered = txt.lower()
    return any(tag in lowered for tag in tags)

# Tags that mark a line as header noise; matched against lower-cased lines.
header_tags = (
    "from:", "subject:", "lines:", "organization:", "distribution",
    "nntp-posting-host:", "reply-to:", "keywords:", "summary:", "re:", "expires:",
    "article-i.d.:", "newsreader:",
)

# Drop every line that contains one of the header tags, then join the
# remaining lines with single spaces and trim both ends.
body_lines = [line for line in email_text.split("\n") if not is_header_tag_in(header_tags, line)]
email_text = " ".join(body_lines).strip()

In [None]:
# Display the email text after the header lines were removed.
email_text

## Email addresses inside the email content

Such "words" carry no informative value, so they can be removed. Some addresses may appear in several emails, and the model could then treat them as useful information, which of course they are not.

In [None]:
import re
# [re.sub("<.*?>", "", article) for article in articles]

# A token containing "@", plus at most one whitespace character after it.
_EMAIL_TOKEN_RE = re.compile(r"\S*@\S*\s?")


def clear_emails(txt: str) -> str:
    """Return *txt* with every token that contains "@" removed.

    A token is a run of non-whitespace characters. At most one whitespace
    character directly following the token is removed together with it.
    """
    return _EMAIL_TOKEN_RE.sub("", txt)
    
# Remove e-mail addresses from the sampled email and lower-case the result.
# NOTE(review): this starts from the raw `new_txt`, not from the
# header-stripped `email_text` - confirm that this is intended.
cleaner_txt = clear_emails(new_txt).lower()
cleaner_txt

In [None]:
# remove tokens that contain digits

# A token containing at least one digit, plus at most one whitespace
# character after it.
_DIGIT_TOKEN_RE = re.compile(r"\S*[0-9]+\S*\s?")


def clear_txt_with_digits(txt: str) -> str:
    """Return *txt* with every token that contains a digit removed.

    The whole token (run of non-whitespace characters) is dropped, not only
    the digits. At most one whitespace character directly following the token
    is removed together with it.
    """
    return _DIGIT_TOKEN_RE.sub("", txt)

# Remove digit-containing tokens from the already address-free, lower-cased
# text and display the result.
cleaner_txt = clear_txt_with_digits(cleaner_txt)
cleaner_txt

## Combine these substeps into a single function

In [None]:
# remove most common header components

def is_header_tag_in(tags: tuple, txt: str) -> bool:
    """Return True if any of *tags* occurs in *txt*, ignoring the case of *txt*.

    Args:
        tags: Substrings to look for. They are compared as given against the
            lower-cased text, so they should already be lower case.
        txt: Text (for example one line of an email) to inspect.

    Returns:
        True if at least one tag is a substring of the lower-cased text,
        otherwise False (also when *tags* is empty).

    Note:
        This is a substring test, not a prefix test: a tag such as "re:"
        also matches in the middle of a line (e.g. "they are: ...").
    """
    # Lower-case once instead of once per tag.
    lowered = txt.lower()
    return any(tag in lowered for tag in tags)

# A token containing "@", plus at most one whitespace character after it.
_EMAIL_TOKEN_RE = re.compile(r"\S*@\S*\s?")


def clear_emails(txt: str) -> str:
    """Return *txt* with every token that contains "@" removed.

    A token is a run of non-whitespace characters. At most one whitespace
    character directly following the token is removed together with it.
    """
    return _EMAIL_TOKEN_RE.sub("", txt)

# Translation table that maps every character of string.punctuation
# ('!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~') to None, so that str.translate()
# deletes those characters.
table = str.maketrans("", "", string.punctuation)

def clean_email_content(email_text: str, table: dict = table) -> str:
    """Clean the raw text of one email.

    Steps, in order:
      1. drop every line that contains one of the header tags and join the
         remaining lines with single spaces,
      2. remove e-mail addresses and lower-case the text,
      3. remove tokens that contain digits,
      4. replace every run of non-space whitespace (tabs, carriage returns,
         ...) with a single space.

    Args:
        email_text: Raw email text whose lines are separated by newline
            characters.
        table: Translation table for stripping punctuation. Currently unused,
            because the punctuation step is disabled; the parameter is kept so
            that existing calls keep working.

    Returns:
        The cleaned text.
    """
    header_tags = (
        "from:", "subject:", "lines:", "organization:", "distribution",
        "nntp-posting-host:", "reply-to:", "keywords:", "summary:", "re:", "expires:",
        "article-i.d.:", "newsreader:",
    )

    # Keep only the lines that contain none of the header tags.
    kept_lines = [line for line in email_text.split("\n") if not is_header_tag_in(header_tags, line)]
    email_text = " ".join(kept_lines).strip()

    # Remove e-mail addresses, then change all characters to lower case.
    email_text = clear_emails(email_text).lower()

    # Remove words with digits inside.
    email_text = clear_txt_with_digits(email_text)

    # Replace whitespace other than the plain space with a space. Deleting it
    # outright would glue neighbouring words together ("foo\tbar" -> "foobar").
    email_text = re.sub(r"[^\S ]+", " ", email_text)

    # NOTE: punctuation stripping (email_text.translate(table)) is disabled.

    return email_text

In [None]:
# Try the combined cleaning function on the sample text.
# NOTE(review): `cleaner_txt` already had e-mail addresses and digit tokens
# removed by the cells above; passing the raw `new_txt` would exercise every
# step of the function - confirm which input is intended.
clean_email_content(cleaner_txt)