In [25]:
from typing import List, Union
from tqdm import tqdm
import string
import datetime
from dateutil import parser
import multiprocessing as mp

import pandas as pd
import numpy as np
import spacy
import nltk
from nltk.corpus import stopwords
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.model_selection import train_test_split


### Data Cleaning Utilities

In [35]:
def change_date_type(dates: Union[pd.DataFrame, pd.Series]) -> List:
    """
    Formats string column into datetime object
    """
    column = []
    
    for date in dates:
        column.append(parser.parse(date).strftime("%d-%m-%Y %H:%M:%S"))
    
    series = pd.Series(column)
    return pd.to_datetime(series)


def str_to_list(row):
    """convert a string List into a List"""
    row = str(row).strip("[]").replace("'","")
    return row


def parsed_email_processing(df: pd.DataFrame) -> pd.DataFrame:
    """
    Basic email formatting and cleaning
    """
    
    df['Date'] = change_date_type(df['Date'])
    
    df['body'] = df['body'].str.replace('\n','').str.replace('\t','')
    
    df['To'] = df['To'].astype('str')\
        .str.replace('b','')\
        .apply(str_to_list)
        
    df['From'] = df['From'].astype('str')\
        .str.replace('b','')\
        .apply(str_to_list)
    
    return df

### Text Normalization Utilities

In [None]:
def spacy_normalization_process(text):
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(text)
    
    lemma_list = []
    for token in doc:
        lemma_list.append(token.lemma_)
    print("Tokenize+Lemmatize:")
    print(lemma_list)
    
    filtered_sentence =[] 
    for word in lemma_list:
        lexeme = nlp.vocab[word]
        if lexeme.is_stop == False:
            filtered_sentence.append(word) 
    
    #Remove punctuation
    punctuations="?:!.,;"
    for word in filtered_sentence:
        if word in punctuations:
            filtered_sentence.remove(word)
    
    return filtered_sentence

In [None]:
nlp = spacy.load("en_core_web_sm")
stops = stopwords.words("english")

def _normalize(comment, lowercase, remove_stopwords):
    if lowercase:
        comment = comment.lower()
    comment = nlp(comment)
    lemmatized = list()
    for word in comment:
        lemma = word.lemma_.strip()
        if lemma:
            if not remove_stopwords or (remove_stopwords and lemma not in stops):
                lemmatized.append(lemma)
    return " ".join(lemmatized)


# result = test['body'].apply(_normalize, lowercase=True, remove_stopwords=True).to_frame()

In [9]:
nlp = spacy.load("en_core_web_sm")

class TextPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self,
                 nlp = nlp,
                 n_jobs=1):
        """
        Text preprocessing transformer includes steps:
            1. Punctuation removal
            2. Stop words removal
            3. Lemmatization

        nlp  - spacy model
        n_jobs - parallel jobs to run
        """
        self.nlp = nlp
        self.n_jobs = n_jobs

    def fit(self, X, y=None):
        return self
    

    def transform(self, X, *_):
        X_copy = X.copy()

        partitions = 1
        cores = mp.cpu_count()
        if self.n_jobs <= -1:
            partitions = cores
        elif self.n_jobs <= 0:
            return X_copy.apply(self._preprocess_text)
        else:
            partitions = min(self.n_jobs, cores)

        data_split = np.array_split(X_copy, partitions)
        pool = mp.Pool(cores)
        data = pd.concat(pool.map(self._preprocess_part, data_split))
        pool.close()
        pool.join()

        return data
    
    
    def _remove_punct(self, doc):
        return (t for t in doc if t.text not in string.punctuation)
    

    def _remove_stop_words(self, doc):
        return (t for t in doc if not t.is_stop)
    

    def _lemmatize(self, doc):
        return ' '.join(t.lemma_ for t in doc)
    

    def _preprocess_text(self, text):
        doc = self.nlp(text)
        removed_punct = self._remove_punct(doc)
        removed_stop_words = self._remove_stop_words(removed_punct)
        return self._lemmatize(removed_stop_words)
    
    
    def _preprocess_part(self, part):
        return part.apply(self._preprocess_text)

### Text features

In [39]:
not_spam_df = pd.read_parquet('../data/labeled_data/nonspam.parquet.gzip')
spam_df = pd.read_parquet('../data/labeled_data/spam.parquet.gzip')

In [40]:
complete_df = pd.concat([spam_df, not_spam_df]).reset_index(drop=True)

In [41]:
complete_df = parsed_email_processing(complete_df)

In [42]:
nlp = spacy.load("en_core_web_sm")
Normalizer = TextPreprocessor(nlp, -1)
complete_df['body_transformed'] = Normalizer.transform(complete_df['body'])

In [44]:
complete_df

Unnamed: 0,Message-ID,Date,From,To,Subject,Cc,Mime-Version,Content-Type,Content-Transfer-Encoding,Bcc,X-From,X-To,X-cc,X-bcc,X-Folder,X-Origin,X-FileName,body,spam,body_transformed
0,<8307461.1075860887277.JavaMail.evans@thyme>,2004-03-02 18:14:47,"[""hotwecash@lists.adversend.com""]","[""m..presto@enron.com""]",Free Grants For Those In Need!,,1.0,text/plain; charset=us-ascii,7bit,,HotWebCash Newsletter <hotwebcash@lists.advers...,"Presto, Kevin M. </O=ENRON/OU=NA/CN=RECIPIENTS...",,,"\Kevin_Presto_Mar2002_1\Presto, Kevin M.\Junk ...",Presto-K,kpresto (Non-Privileged).pst,$ * $ * $ * $ * HOTWEBCASH * $ * $ * $ * $Have...,1,hotwebcash turn receive loan bank poor credit ...
1,<31088330.1075860887438.JavaMail.evans@thyme>,2004-03-02 18:14:47,"[""hotwecash@lists.adversend.com""]","[""m..presto@enron.com""]",Your Opinion Counts...Win $1000!,,1.0,text/plain; charset=us-ascii,7bit,,HotWebCash Newsletter <hotwebcash@lists.advers...,"Presto, Kevin M. </O=ENRON/OU=NA/CN=RECIPIENTS...",,,"\Kevin_Presto_Mar2002_1\Presto, Kevin M.\Junk ...",Presto-K,kpresto (Non-Privileged).pst,$ * $ * $ * $ * HOTWEBCASH * $ * $ * $ * $Your...,1,hotwebcash opinion count ... reward It!Join NP...
2,<8829953.1075860887541.JavaMail.evans@thyme>,2004-03-02 18:14:47,"[""hotwecash@lists.adversend.com""]","[""m..presto@enron.com""]",LOSE POUNDS GUARANTEED!,,1.0,text/plain; charset=us-ascii,7bit,,HotWebCash Newsletter <hotwebcash@lists.advers...,"Presto, Kevin M. </O=ENRON/OU=NA/CN=RECIPIENTS...",,,"\Kevin_Presto_Mar2002_1\Presto, Kevin M.\Junk ...",Presto-K,kpresto (Non-Privileged).pst,$ * $ * $ * $ * $ HOTWEBCASH $ * $ * $ * $ * $...,1,hotwebcash program tailor specific needs!eat F...
3,<25503194.1075860887462.JavaMail.evans@thyme>,2004-03-02 18:14:47,"[""hotwecash@lists.adversend.com""]","[""m..presto@enron.com""]",Someone is searching for YOU!,,1.0,text/plain; charset=us-ascii,7bit,,HotWebCash Newsletter <hotwebcash@lists.advers...,"Presto, Kevin M. </O=ENRON/OU=NA/CN=RECIPIENTS...",,,"\Kevin_Presto_Mar2002_1\Presto, Kevin M.\Junk ...",Presto-K,kpresto (Non-Privileged).pst,$ * $ * $ * $ * HOTWEBCASH * $ * $ * $ * $Find...,1,hotwebcash find match ... Click http://r1.adve...
4,<3520246.1075860887301.JavaMail.evans@thyme>,2004-03-02 18:14:47,"[""hotwecash@lists.adversend.com""]","[""m..presto@enron.com""]","Take a Survey, Win a FREE New Computer!!",,1.0,text/plain; charset=us-ascii,7bit,,HotWebCash Newsletter <hotwebcash@lists.advers...,"Presto, Kevin M. </O=ENRON/OU=NA/CN=RECIPIENTS...",,,"\Kevin_Presto_Mar2002_1\Presto, Kevin M.\Junk ...",Presto-K,kpresto (Non-Privileged).pst,$ * $ * $ * $ * HOTWEBCASH * $ * $ * $ * $Comp...,1,hotwebcash complete short survey automatically...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1481,<24512566.1075858195244.JavaMail.evans@thyme>,2000-05-15 00:56:00,"""mike.carson@enron.com""","""mjmoreland@aep.com""","Re: Hey, DOG",,1,text/plain; charset=us-ascii,7bit,,Mike Carson,mjmoreland@aep.com,,,\Mike_Carson_Dec2000\Notes Folders\Sent,Carson-M,mcarson2.nsf,"not much ,,,, just got back from chicago (visi...",0,get chicago visit come drink kill mule town ...
1482,<14280509.1075852831215.JavaMail.evans@thyme>,2001-10-29 17:51:00,"""arsystem@mailman.enron.com""","""m..presto@enron.com""",Your Approval is Overdue: Access Request for t...,,1,text/plain; charset=us-ascii,7bit,,ARSystem <ARSystem@mailman.enron.com>@ENRON,"Presto, Kevin M. </O=ENRON/OU=NA/CN=RECIPIENTS...",,,\KPRESTO (Non-Privileged)\Inbox,Presto-K,KPRESTO (Non-Privileged).pst,This request has been pending your approval fo...,0,request pende approval 3 day click http://...
1483,<3806755.1075861627055.JavaMail.evans@thyme>,2001-09-11 05:55:00,"""d..steffes@enron.com""","""richard.shapiro@enron.com""",FW: Visa,,1,text/plain; charset=us-ascii,7bit,,"Steffes, James D. </O=ENRON/OU=NA/CN=RECIPIENT...","Shapiro, Richard </O=ENRON/OU=NA/CN=RECIPIENTS...",,,"\JSTEFFE (Non-Privileged)\Steffes, James D.\Se...",Steffes-J,JSTEFFE (Non-Privileged).pst,"FYI -----Original Message-----From: Daffin, Ma...",0,FYI -----original Message-----From Daffin Marg...
1484,<22003901.1075862106346.JavaMail.evans@thyme>,2001-11-19 12:32:00,"""e..carter@enron.com""","""monika.causholli@enron.com""",RE: Class,,1,text/plain; charset=us-ascii,7bit,,"Carter, Karen E. </O=ENRON/OU=NA/CN=RECIPIENTS...","Causholli, Monika </O=ENRON/OU=NA/CN=RECIPIENT...",,,"\MCAUSHOL (Non-Privileged)\Causholli, Monika\D...",Causholli-M,MCAUSHOL (Non-Privileged).pst,"Monika,Was this already approved by Andrea? K...",0,Monika approve Andrea Karen-----Original Mes...


For more reference about the methodology we are going to use take a look into: 
- https://towardsdatascience.com/sentence-transformer-fine-tuning-setfit-outperforms-gpt-3-on-few-shot-text-classification-while-d9a3788f0b4e
- https://github.com/pmbaumgartner/setfit

- ### Setfit Classification

- `1` stands for *spam* mails
- `0` stands for *non spam* mails

In [53]:
! pip install git+https://github.com/pmbaumgartner/setfit -q

In [54]:
from setfit import SetFitClassifier

In [45]:
complete_df['spam'].value_counts()

1    1092
0     394
Name: spam, dtype: int64

In [46]:
train_df, test_df = train_test_split(complete_df, test_size=0.2)

In [55]:
classifier = SetFitClassifier('paraphrase-MiniLM-L3-v2')

NotADirectoryError: [Errno 20] Not a directory: '/Users/luis.morales/.cache/torch'

In [51]:
! pip install -U sentence-transformers

