# Email category prediction

In [1]:
# import supporting functions from python file

import supporting_modules.supp_functions as sfn
import string

In [2]:
# import dataset

from sklearn.datasets import fetch_20newsgroups

### Fetch the training and test data

In [3]:
# Download (or load from the local scikit-learn cache) both predefined
# splits of the 20 newsgroups corpus: training first, then test.
train_data, test_data = (
    fetch_20newsgroups(subset="train"),
    fetch_20newsgroups(subset="test"),
)

In [4]:
# Show which categories the dataset contains and how many of them there are.

(
    str(train_data.target_names),
    "_" * 30,
    f"Number of categories: {len(train_data.target_names)}",
)

("['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']",
 '______________________________',
 'Number of categories: 20')

In [5]:
# Class balance of the training set.
# Count the emails per category id (0..19), then compare the most numerous
# category with the least numerous one and with the second least numerous one.

cnt_dict = dict.fromkeys(range(20), 0)

for label in train_data.target:
    cnt_dict[label] += 1
as_list = list(cnt_dict.values())
print(cnt_dict)

# Sort once and reuse: [-1] is the largest count, [0] the smallest,
# [1] the second smallest.
ascending = sorted(as_list)
sum(as_list), ascending[-1] - ascending[0], ascending[-1] - ascending[1]

{0: 480, 1: 584, 2: 591, 3: 590, 4: 578, 5: 593, 6: 585, 7: 594, 8: 598, 9: 597, 10: 600, 11: 595, 12: 591, 13: 594, 14: 593, 15: 599, 16: 546, 17: 564, 18: 465, 19: 377}


(11314, 223, 135)

In [6]:
# How many emails are in each split (train, test), and which share of the
# whole corpus is the test split? The proportions are roughly 60% / 40%.

(
    len(train_data.data),
    len(test_data.data),
    len(test_data.data) / (len(train_data.data) + len(test_data.data)),
)

(11314, 7532, 0.3996604053910644)

In [7]:
# Let's look at what the emails look like.
import random

# Randomly pick one training email and show it.
# random.randint(a, b) includes b, so the former randint(0, 11314) could
# return 11314 -- one past the last valid index of the 11314-item list --
# and raise IndexError. randrange(n) yields 0..n-1 and follows the real
# dataset size instead of a hard-coded number.
email_no = random.randrange(len(train_data.data))
print(email_no)
new_txt = train_data.data[email_no]
print(new_txt)

10193
From: noring@netcom.com (Jon Noring)
Subject: Re: Should Christians fight? / Justifiable war
Organization: Netcom Online Communications Services (408-241-9760 login: guest)
Lines: 35

In article gideon@otago.ac.nz (Gideon King) writes:

>I posted this a couple of weeks ago, and it doesn't seem to have appeared  
>on the newsgroup, and I haven't had a reply from the moderator. We were  
>having intermittent problems with our mail at the time. Please excuse me  
>if you have seen this before...
>
>Should Christians fight?
>
>Last week Alastair posted some questions about fighting, and whether there  
>are such things as "justifiable wars". I have started looking into these  
>things and have jotted down my findings as I go. I haven't answered all  
>his questions yet, and I know what I have here is on a slightly different  
>tack, but possibly I'll be able to get into it more deeply later, and post  
>some more info soon.

May I suggest the book: "Ethics" by Dr. Norm Geisler, of Da

In [8]:
import re
# [re.sub("<.*?>", "", article) for article in articles]

def clear_emails(txt: str):
    """Return `txt` with every token containing '@' removed.

    A match is the run of non-whitespace characters around an '@' plus at
    most one whitespace character directly after it.
    """
    address_pattern = re.compile(r"\S*@\S*\s?")
    return address_pattern.sub("", txt)
    
# Try the helper on the sampled email: drop e-mail addresses first,
# then lower-case whatever text remains.
cleaner_txt = clear_emails(new_txt).lower()

In [9]:
# remove digits

def clear_txt_with_digits(txt: str):
    """Return `txt` with every token containing at least one digit removed.

    A match is the whole run of non-whitespace characters around the
    digit(s) plus at most one whitespace character directly after it.
    """
    digit_token = re.compile(r"\S*[0-9]+\S*\s?")
    return digit_token.sub("", txt)

# Apply the digit filter to the sample text produced by the previous cell
# (addresses removed, lower-cased) and display the result.
cleaner_txt = clear_txt_with_digits(cleaner_txt)
cleaner_txt



In [10]:
# remove most common header components

def is_header_tag_in(tags: tuple, txt: str): 
    """Tell whether any of `tags` occurs anywhere in the lower-cased `txt`.

    This is a substring test, not a prefix test: a tag found in the middle
    of `txt` counts as a hit. Tags are compared as given, so they are
    expected to be lower case already.
    """
    return any(tag in txt.lower() for tag in tags)

# NOTE: this repeats the clear_emails definition from an earlier cell with
# the same pattern; the later definition is the one in effect afterwards.
def clear_emails(txt: str):
    """Strip every '@'-containing token (and one trailing whitespace) from `txt`."""
    return re.compile(r"\S*@\S*\s?").sub("", txt)

# Translation table for str.translate that deletes punctuation: every
# character of string.punctuation ('!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~')
# is mapped to None, i.e. removed from the text.
table = str.maketrans("", "", string.punctuation)

def clean_email_content(email_text: str, table: dict = table): 
    """Reduce a raw newsgroup message to plain lower-case words.

    Steps, in order:
      1. drop every row that starts with one of the known header tags and
         join the remaining rows with single spaces;
      2. remove e-mail addresses (clear_emails) and lower-case the text;
      3. remove tokens that contain digits (clear_txt_with_digits);
      4. delete whitespace characters other than the plain space;
      5. delete punctuation using `table`.

    Parameters
    ----------
    email_text : str
        Raw message text, rows separated by "\n".
    table : dict
        Mapping passed to str.translate; defaults to the module-level
        punctuation-removal table.

    Returns
    -------
    str
        The cleaned text.
    """
    header_tags = ("from:", "subject:", "lines:", "organization:", "distribution",
                   "nntp-posting-host:", "reply-to:", "keywords:", "summary:", "re:",
                   "expires:", "article-i.d.:", "newsreader:")

    # Keep only the rows that do NOT start with a header tag.
    # The previous substring test discarded any row that merely contained a
    # tag, so body text such as "the options are: ..." (contains "re:") or a
    # sentence mentioning "distribution" was silently thrown away.
    # str.startswith accepts a tuple and returns True if any prefix matches.
    body_rows = [row for row in email_text.split("\n")
                 if not row.lstrip().lower().startswith(header_tags)]
    email_text = " ".join(body_rows).strip()

    # Remove e-mail addresses from the content,
    # then change all characters to lower case.
    email_text = clear_emails(email_text).lower()

    # Remove words with digits inside.
    email_text = clear_txt_with_digits(email_text)

    # Delete whitespace characters other than the plain space.
    # NOTE(review): they are deleted, not replaced by a space, so words
    # separated only by a tab end up glued together -- confirm this is intended.
    email_text = re.sub(r'[^\S ]+', '', email_text)

    # Delete punctuation characters.
    email_text = email_text.translate(table)

    return email_text