In [1]:
import pickle
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import pandas as pd
import csv
import sys
from datetime import datetime
sys.path.append('../ops_modules')
from parse_email import ParseMailData
from content_scanner import Scanner 

### Load the best ML model

In [2]:
# Knn
with open('../ML_algorithms/Models/best_knn.pickle', 'rb') as data:
    knn = pickle.load(data)

https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


In [3]:
knn

KNeighborsClassifier(n_neighbors=1)

In [4]:
# features_test
with open('../Pickles/features_test.pickle', 'rb') as data:
    features_test = pickle.load(data)

### Load TF-IDF model

In [5]:
with open('../Pickles/tfidf.pickle', 'rb') as data:
    tfidf = pickle.load(data)

https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


### Loading the test data

In [6]:
with open('Predictions/test_data/ML_data.pickle',  'rb') as data:
    ML_test_data = pickle.load(data)

[Test Data](#test)


In [7]:
ML_test_data.iloc[:1]

Unnamed: 0,Subject,Body,Day,Month,Year,Date,Content_Parsed_1,parsed_lemmatized_text,stop_words_parsed
0,sa direct enrolled ||stage||health service con...,"Hi Team,\r\n\r\n \r\n\r\nCould you please make...",14,Aug,2020,2020-08-14,hi team could you please make the health co...,hi team could you please make the health co...,hi team could please make health configur...


In [8]:
df = pd.read_csv('test_data/test_data.CSV', encoding="ISO-8859-1")
df = df.fillna('dummy')

<a id="test"></a>
### Test Data

In [9]:
parse_obj = ParseMailData('test_data/test_data.CSV')
parse_df = parse_obj.parse()
current_year = datetime.now().year
current_month = datetime.now().strftime('%h')
parse_df.iloc[:1]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Subject'][index] = subject + " " + str(random())


Unnamed: 0,Subject,Body,Day,Month,Year,Date
0,re: question about api standards,Thanks John.\r\n\r\n \r\n\r\nWe will review th...,14,Sep,2021,2021-09-14


In [13]:
parse_df_current = parse_df.loc[(parse_df['Month'] == (pd.Period(datetime.now(), 'M') - 1).strftime('%b')) & (parse_df['Year'] == current_year)]
parse_df_current = parse_df_current.reset_index(drop=True)
parse_df_current.iloc[:2]

Unnamed: 0,Subject,Body,Day,Month,Year,Date
0,re: supported cards vs supported payments,"Yes Thatâs correct Charlie, he wants to chan...",1,Sep,2022,2022-09-01
1,"creditor management, creditor retrieval - refe...","Hi, \r\n\r\n \r\n\r\nOn the FAQs of the Self-A...",1,Sep,2022,2022-09-01


[Predictions](#predictions)

### Feature engineering: Data pre-processing

In [14]:
def feature_creation(content_parsed):
    # Removing \r \n and extra spaces
    content_parsed['Content_Parsed_1'] = content_parsed['Body'].str.replace("\r", " ")
    content_parsed['Content_Parsed_1'] = content_parsed['Content_Parsed_1'].str.replace("\n", " ")
    content_parsed['Content_Parsed_1'] = content_parsed['Content_Parsed_1'].str.replace("    ", " ")
    # content_parsed['Category'] = category_subject_df['Category']

    # Removing " when quoting text
    content_parsed['Content_Parsed_1'] = content_parsed['Content_Parsed_1'].str.replace('"', '')

    # Lowercasing the text
    content_parsed['Content_Parsed_1'] = content_parsed['Content_Parsed_1'].str.lower()

    # Removing common non-relevant occuring words
    ignore_words = ['mastercard', 'com', 'senior', 'software', 'engineer', 'mountainview', 'central', 'park', 'leopardstown',
                    'dublin', '18', 'ireland', 'cc', 'subject', 'mailto', 'api_consultancy_and_standards', 'api_onboarding']
    for ig_word in ignore_words:
        content_parsed['Content_Parsed_1'] = content_parsed['Content_Parsed_1'].str.replace(ig_word, ' ')

    # Removing punctuation signs and other unwanted symbols
    punctuation_signs = list("?:!.,;<>|@")

    for punct_sign in punctuation_signs:
        content_parsed['Content_Parsed_1'] = content_parsed['Content_Parsed_1'].str.replace(punct_sign, ' ')

    # Removing possessive nouns
    content_parsed['Content_Parsed_1'] = content_parsed['Content_Parsed_1'].str.replace("'s", " ")
    
    ##### Lemmatization #####
    # Saving the lemmatizer into an object
    wordnet_lemmatizer = WordNetLemmatizer()
    lemma_text_list = []

    for row in range(0, len(content_parsed)):

        # Create an empty list containing lemmatized words
        lemma_list = []

        # Save the text and its words into an object
        text = content_parsed.loc[row]['Content_Parsed_1']
        text_words = text.split(" ")

        # Iterate through every word to lemmatize
        for word in text_words:
            lemma_list.append(wordnet_lemmatizer.lemmatize(word, pos="v"))

        # Join the list
        lemma_text = " ".join(lemma_list)

        # Append to the list containing the texts
        lemma_text_list.append(lemma_text)

    content_parsed['parsed_lemmatized_text'] = lemma_text_list
    
    ##### Stop words removal #####
    stop_words = list(stopwords.words('english'))
    # Adding the stopwords from the TO: From: CC: column
    # which is all the names

    stop_words_df = pd.DataFrame({})
    stop_words_df['From: (Name)'] = df['From: (Name)']
    stop_words_df['To: (Name)'] = df['To: (Name)']
    stop_words_df['CC: (Name)'] = df['CC: (Name)']
    #stop_words_df = df_distinct.iloc[:, :2]

    for column in stop_words_df:
        # Lowercasing the text
        stop_words_df[column] = stop_words_df[column].str.lower()

        # Removing punctuation signs and other unwanted symbols
        stop_words_df[column] = stop_words_df[column].str.replace('dummy', '')

        # Removing punctuation signs and other unwanted symbols
        stop_words_punctuation_signs = list(",;)(")

        for stop_words_punct_sign in stop_words_punctuation_signs:
            stop_words_df[column] = stop_words_df[column].str.replace(stop_words_punct_sign, ' ')
            
    word = list()
    for i in range(0, len(stop_words_df)):
        for j in stop_words_df.loc[i].values:
            word.append(j.split())
    new_stop_words = [item for sublist in word for item in sublist]


    stop_words_unique = set(new_stop_words) # To get unique values
    
    stop_words.extend(stop_words_unique)
    remove_words = ['jamstack', 'onboarding', 'support', 'product', 'operations', 'api', 'apis', 'project', 
                    'architecture', 'security', 'development', 'key', 'jenkins', 'dev', 'external', 'team', 'digital',
                    'helpdesk', 'axon', 'gateway', 'xmlgw', 'access', 'ping', 'strategic', 'developers', 'postgres',
                    'management', 'xml', 'gw', 'service', 'dba', 'standards']

    for w in remove_words:
        if w in stop_words:
            stop_words.remove(w)
        
    # Takes time to process 5-6 mins.
    # This is to remove all the stopwords from the Body.
    content_parsed['stop_words_parsed'] = content_parsed['parsed_lemmatized_text']
    for stop_word in stop_words:
        if (stop_word == '?ukasz'):
            stop_word = '\?ukasz'
        regex_stopword = r"\b" + stop_word + r"\b"
        content_parsed['stop_words_parsed'] = content_parsed['stop_words_parsed'].str.replace(regex_stopword, '')

    # Removing the unwanted columns
    content_parsed = content_parsed.drop(['Content_Parsed_1', 'parsed_lemmatized_text'], axis=1)
    
    # Renaming the parsed column
    content_parsed = content_parsed.rename(columns={'stop_words_parsed': 'Content_Parsed'})
    
    # TF-IDF
    features = tfidf.transform(content_parsed['Content_Parsed']).toarray()
    
    return features, content_parsed


In [15]:
def get_category_name(category_id):
    category_codes = {'1' : 'Service Proxy troubleshooting / APIGW', 
                      '2' : 'Onboarding generic queries',
                      '3' : 'Assessment/rescore queries/early spec/exception requests',
                      '4' : 'Access to Tool queries', 
                      '5' : 'API Standards queries',
                      '6' : 'zally',
                      '7' : 'Client libs', 
                      '8' : 'Jamstack content reviewer',
                      '9' : 'Axon Queries',
                      '10': 'Mastercard Developers Notification'
                     }
    for cid, cname  in category_codes.items():    
        if cid == category_id:
            return cname

### Prediction from features

In [16]:
def predict_from_features(features):
        
    # Obtain the highest probability of the predictions for each mail
    predictions_proba = knn.predict_proba(features).max(axis=1)    
    
    # Predict using the input model
    predictions_pre = knn.predict(features)

    # Replace prediction with 6 if associated cond. probability less than threshold
    predictions = []

    for prob, cat in zip(predictions_proba, predictions_pre):
        if prob > .65:
            predictions.append(cat)
        else:
            predictions.append(5)

    # Return result
    categories = [get_category_name(x) for x in predictions]
    
    return categories

In [17]:
def complete_df(df, categories):
    df['Prediction'] = categories
    return df

In [18]:
# Features creation
features, df_show_info = feature_creation(parse_df_current)

  content_parsed['Content_Parsed_1'] = content_parsed['Content_Parsed_1'].str.replace(punct_sign, ' ')
  stop_words_df[column] = stop_words_df[column].str.replace(stop_words_punct_sign, ' ')
  content_parsed['stop_words_parsed'] = content_parsed['stop_words_parsed'].str.replace(regex_stopword, '')


In [19]:
df_show_info.iloc[:2]

Unnamed: 0,Subject,Body,Day,Month,Year,Date,Content_Parsed
0,re: supported cards vs supported payments,"Yes Thatâs correct Charlie, he wants to chan...",1,Sep,2022,2022-09-01,yes thatâ correct want change header s...
1,"creditor management, creditor retrieval - refe...","Hi, \r\n\r\n \r\n\r\nOn the FAQs of the Self-A...",1,Sep,2022,2022-09-01,hi faqs selfassessment tutorial confl...


In [20]:
ML_test_data = pd.concat([ML_test_data, parse_df_current], ignore_index=True)
ML_test_data = ML_test_data.drop_duplicates(subset=['Subject']).reset_index(drop=True)
ML_test_data.iloc[:2]

Unnamed: 0,Subject,Body,Day,Month,Year,Date,Content_Parsed_1,parsed_lemmatized_text,stop_words_parsed
0,sa direct enrolled ||stage||health service con...,"Hi Team,\r\n\r\n \r\n\r\nCould you please make...",14,Aug,2020,2020-08-14,hi team could you please make the health co...,hi team could you please make the health co...,hi team could please make health configur...
1,re: track bps : axon topic creation request fo...,[Attaching Splunk output]\r\n\r\n \r\n\r\nRoss...,17,Aug,2020,2020-08-17,[attaching splunk output] ross phelan ...,[attaching splunk output] ross phelan ...,[attaching splunk output] api pl...


<a id="predictions"></a>
### Predictions

In [21]:
# Predict
predictions = predict_from_features(features)

In [22]:
predictions

['API Standards queries',
 'API Standards queries',
 'API Standards queries',
 'Assessment/rescore queries/early spec/exception requests',
 'API Standards queries']

In [23]:
# Put into dataset
df_predictions_current = complete_df(df_show_info, predictions)
df_predictions_current.iloc[:2]

Unnamed: 0,Subject,Body,Day,Month,Year,Date,Content_Parsed,Prediction
0,re: supported cards vs supported payments,"Yes Thatâs correct Charlie, he wants to chan...",1,Sep,2022,2022-09-01,yes thatâ correct want change header s...,API Standards queries
1,"creditor management, creditor retrieval - refe...","Hi, \r\n\r\n \r\n\r\nOn the FAQs of the Self-A...",1,Sep,2022,2022-09-01,hi faqs selfassessment tutorial confl...,API Standards queries


In [24]:
# Appending the current predictions with the previous data
with open('Predictions/test_data/knn_test.pickle', 'rb') as data:
    previous_data = pickle.load(data)

In [25]:
previous_data.iloc[:2]

Unnamed: 0,Subject,Body,Day,Month,Year,Date,Content_Parsed,Prediction
0,sa direct enrolled ||stage||health service con...,"Hi Team,\r\n\r\n \r\n\r\nCould you please make...",14,Aug,2020,2020-08-14,hi team could please make health configur...,API Standards queries
1,re: track bps : axon topic creation request fo...,[Attaching Splunk output]\r\n\r\n \r\n\r\nRoss...,17,Aug,2020,2020-08-17,[attaching splunk output] api pl...,Onboarding generic queries


In [26]:
# Appending the current predictions with the previous data
total_data = pd.concat([previous_data, df_predictions_current], ignore_index=True)
total_data = total_data.drop_duplicates(subset=['Subject'])
total_data = total_data.reset_index(drop=True)
total_data.iloc[:2]

Unnamed: 0,Subject,Body,Day,Month,Year,Date,Content_Parsed,Prediction
0,sa direct enrolled ||stage||health service con...,"Hi Team,\r\n\r\n \r\n\r\nCould you please make...",14,Aug,2020,2020-08-14,hi team could please make health configur...,API Standards queries
1,re: track bps : axon topic creation request fo...,[Attaching Splunk output]\r\n\r\n \r\n\r\nRoss...,17,Aug,2020,2020-08-17,[attaching splunk output] api pl...,Onboarding generic queries


In [27]:
# Saving ML_test_data with updated values in df pickle file
with open('Predictions/test_data/ML_data.pickle', 'wb') as output:
    pickle.dump(ML_test_data, output)

In [28]:
# Saving predicted values in df pickle file    
with open('Predictions/test_data/knn_test.pickle', 'wb') as output:
    pickle.dump(total_data, output)