# Global Configurations

In [1]:
# -*- coding: utf-8 -*-

In [2]:
# enable functionalities
%config IPCompleter.greedy=True  # auto-complete

In [3]:
# import the dependencies
import os

In [4]:
# define the global variables
BASE_DIR = os.path.abspath('')
DATA_DIR = os.path.join(BASE_DIR, 'data')
REQUESTS_PATH = os.path.join(DATA_DIR5, 'General Request Data.csv')
AMAZON_REVIEW_PATH = os.path.join(DATA_DIR, 'AmazonReview.csv')
SPACY_PROCESSED_PATH = os.path.join(DATA_DIR, 'spacy_processed.pickle')
REQUEST_DATA_PATH = os.path.join(DATA_DIR, 'clean_requests.pickle')

# Text Cleaning

In [6]:
# import the dependencies
import pandas as pd
import re
import copy
from urlextract import URLExtract
import numpy as np
import string
import pickle

In [8]:
# read the data from the CSV file
data = pd.read_csv(REQUESTS_PATH)

# view the stats of the dataframe
print("There are", data.shape[0], "rows and", data.shape[1], "coloumns in the data set.")
print("The columns are:", list(data.columns))
data.head()

There are 92541 rows and 12 coloumns in the data set.
The columns are: ['closed_at', 'opened_at', 'parent', 'number', 'assignment_group', 'assigned_to', 'priority', 'request_item.cat_item', 'state', 'short_description', 'u_description', 'description']


Unnamed: 0,closed_at,opened_at,parent,number,assignment_group,assigned_to,priority,request_item.cat_item,state,short_description,u_description,description
0,1/11/19 10:43,1/7/19 3:43,RITM0601256,TASK0732614,PSG.CTD.IS.FCS Service Desk,Ivan Dos Santos,4 - Low,General Request,Closed Complete,General Request: Acceso a Internet - Estudio C...,,Se necesita acceso a internet para el siguient...
1,6/26/17 23:50,6/22/17 0:45,RITM0226606,TASK0252360,ETO.EES.SD.ATS Support,Zhang Chunlin,4 - Low,General Request,Closed Complete,General Request: request to forward email to s...,,"hi, \r\nwould like to request to foward CheeCh..."
2,5/16/16 4:21,5/2/16 0:52,RITM0050286,TASK0051491,LSG.BID.DS.LT Vilnius,Andrius Stirblys,4 - Low,General Request,Closed Complete,General Request,,Prašau priskirti šią užduotį Lithuania.Vilnius...
3,5/3/19 14:22,5/3/19 12:01,RITM0715674,TASK0887223,Service Desk,,4 - Low,General Request,Closed Incomplete,General Request: Materila should be removed fr...,,The material PN 76157 - Receipt ID DR320474 - ...
4,7/8/19 5:50,6/29/19 19:08,RITM0772088,TASK0949305,LATAM.APPLICATION,Marcos Betto,4 - Low,General Request,Closed Complete,General Request: INCLUIR LOTES EM PEDIDOS FATU...,,"Boa noite Andrei, poderia por favor alocar man..."


In [300]:
# # use spacy nlp libary to process the raw review texts
# nlp = spacy.load('en')
# precessed_request_texts = [nlp(text) for text in request_texts]

# # save the results
# with open(SPACY_PROCESSED_PATH, 'wb') as f:
#     pickle.dump(precessed_request_texts, f)
# f.close()

In [9]:
# get all the request texts from the data set
request_short_descriptions = []
request_long_descriptions = []
request_full_descriptions = []
invalid_count = 0
for index, row in data.iterrows():
    row_text = ''
    # if the description section is NaN, then record an empty string
    if type(row['short_description']) == str:
        row_text += row['short_description'] + ' '
        request_short_descriptions.append(row['short_description'])
    else:
        request_short_descriptions.append('')
    if type(row['description']) == str:
        row_text += row['description']
        request_long_descriptions.append(row['description'])
    else:
        request_short_descriptions.append('')    
    request_full_descriptions.append(row_text)
    
# preview the request_texts
display(pd.DataFrame({'full description':request_full_descriptions[:15]}))

# count the number of characters
def count_chars(string_array, array_name):
    character_count = 0
    for text in string_array:
        character_count += len(text)
    print("The character number of", array_name, "is", character_count)
    
count_chars(request_short_descriptions, "short descriptions")
count_chars(request_long_descriptions, "long descriptions")
count_chars(request_full_descriptions, "full descriptions")

Unnamed: 0,full description
0,General Request: Acceso a Internet - Estudio C...
1,General Request: request to forward email to s...
2,General Request Prašau priskirti šią užduotį L...
3,General Request: Materila should be removed fr...
4,General Request: INCLUIR LOTES EM PEDIDOS FATU...
5,Request to remove SNOW Subcategory Request Gen...
6,WIN 10 REIMAGE - Junwei Su Need to upgrade my ...
7,General Request: can you please extend GL code...
8,General Request: Unipoint uploads Good afterno...
9,General Request: need computer connected to la...


The character number of short descriptions is 5501687
The character number of long descriptions is 20766382
The character number of full descriptions is 26360593


In [10]:
# erase non English requests
def removeNonEnglish(string_array):
    english_strings = []
    for string in string_array:
        try:
            string.encode(encoding='utf-8').decode('ascii')
        except UnicodeDecodeError:
            english_strings.append('')
        else:
            english_strings.append(string)
    display(pd.DataFrame({'before':string_array, 'after':english_strings})[:15])
    return english_strings

#short_descriptions_Engish = removeNonEnglish(request_short_descriptions)
#long_descriptions_Engish = removeNonEnglish(request_long_descriptions)
full_descriptions_Engish = removeNonEnglish(request_full_descriptions)

Unnamed: 0,before,after
0,General Request: Acceso a Internet - Estudio C...,General Request: Acceso a Internet - Estudio C...
1,General Request: request to forward email to s...,General Request: request to forward email to s...
2,General Request Prašau priskirti šią užduotį L...,
3,General Request: Materila should be removed fr...,General Request: Materila should be removed fr...
4,General Request: INCLUIR LOTES EM PEDIDOS FATU...,General Request: INCLUIR LOTES EM PEDIDOS FATU...
5,Request to remove SNOW Subcategory Request Gen...,Request to remove SNOW Subcategory Request Gen...
6,WIN 10 REIMAGE - Junwei Su Need to upgrade my ...,WIN 10 REIMAGE - Junwei Su Need to upgrade my ...
7,General Request: can you please extend GL code...,General Request: can you please extend GL code...
8,General Request: Unipoint uploads Good afterno...,General Request: Unipoint uploads Good afterno...
9,General Request: need computer connected to la...,General Request: need computer connected to la...


In [11]:
# drop the empty rows
def drop_empty(string_array, array_name):
    count = 0
    no_empty_string = []
    for text in string_array:
        if len(text):
            no_empty_string.append(text)
        else:
            count += 1
    print("lost", count, '(', count/len(string_array)*100, "%)", "non English rows in", array_name)
    display(pd.DataFrame({'empty strings are removed':no_empty_string})[:15])
    return no_empty_string

#short_descriptions_Engish = drop_empty(short_descriptions_Engish, "short descriptions")
#long_descriptions_Engish = drop_empty(long_descriptions_Engish, "long descriptions")
full_descriptions_Engish = drop_empty(full_descriptions_Engish, "full descriptions")

# # compare with Amazon review data
# Amazon_review_data = pd.read_csv(AMAZON_REVIEW_PATH, usecols=[16])
# Amazon_review_data.head()

lost 14899 ( 16.09989085918674 %) non English rows in full descriptions


Unnamed: 0,empty strings are removed
0,General Request: Acceso a Internet - Estudio C...
1,General Request: request to forward email to s...
2,General Request: Materila should be removed fr...
3,General Request: INCLUIR LOTES EM PEDIDOS FATU...
4,Request to remove SNOW Subcategory Request Gen...
5,WIN 10 REIMAGE - Junwei Su Need to upgrade my ...
6,General Request: can you please extend GL code...
7,General Request: Unipoint uploads Good afterno...
8,General Request: need computer connected to la...
9,General Request: Report extraction from TRAX P...


In [12]:
regex = re.compile(r"()")
re.sub(regex, ' # ', "''")
# for text in full_descriptions_ready:
#     if 'ustew-ftn8n02' in text:
#         print(text)


" # ' # ' # "

In [13]:
# remove unwanted characters
# ejm: email
# ljk: URL/link
# ajn: alphanumerics
# ljg: long word
# njm: number
# djt: date
# pjt: path
def clean(string_array, array_name):
    regex_email = re.compile(r'[\w\.-]+@[\w\.-]+')
    regex_general = re.compile(r"^general request:?\s*", re.IGNORECASE)
    regex_hashcode = re.compile(r'( [a-zA-z-]+[0-9]+[a-zA-z0-9-]* | [0-9]+[a-zA-z-]+[a-zA-z0-9-]* )')
    regex_long = re.compile(r'[a-zA-z-]{20,}')
    regex_num = re.compile(r'[0-9]+')
    regex_date = re.compile(r'( [0-9]{4}[./-]+[0-9]{2}[./-]+[0-9]{2} | [0-9]{2}[./-]+[0-9]{2}[./-]+[0-9]{4} )')
    regex_path = re.compile(r" [a-zA-Z_.:']*(\\|/){1,2}[a-zA-Z_.]+(\\|/){1,2}[a-zA-Z_.]+(\\|/){1,2}[a-zA-Z_.\\/']* ")
    
    regex_punc = re.compile(r'[#~`^!"*,;?()<>{}\[\]]')
    regex_space = re.compile(r'\s\s+|^\s+|\s+$')
    regex_dash = re.compile(r'--+')
    regex_slash = re.compile(r'[-+=_%.:\\/@\$\|]')
    regex_quote = re.compile(r"('')")
    regex_ejm = re.compile(r'( ejm )(ejm )+')
    regex_ljk = re.compile(r'( ljk )(ljk )+')
    regex_ajn = re.compile(r'( ajn )(ajn )+')
    regex_ljg = re.compile(r'( ljg )(ljg )+')
    regex_njm = re.compile(r'( njm )(njm )+')
    regex_djt = re.compile(r'( djt )(djt )+')
    regex_pjt = re.compile(r'( pjt )(pjt )+')
    
    url_extractor = URLExtract()
    string_array_cleaned = copy.deepcopy(string_array)
    for index, string in enumerate(string_array_cleaned):
        # get rid of tabs(\t), returns(\r), and newlines(\n)
        string_array_cleaned[index] = string.translate({ord(i): ' ' for i in '\n\t\r'})
        
        # delete "General Request: "
        string_array_cleaned[index] = re.sub(regex_general, '', string_array_cleaned[index])
        
        # replace all the email address with ejm
        string_array_cleaned[index] = re.sub(regex_email, ' ejm ', string_array_cleaned[index])
        
        # substitute urls with 'ljk'
        urls = url_extractor.find_urls(string_array_cleaned[index])
        for url in urls:
            string_array_cleaned[index] = string_array_cleaned[index].replace(url, ' ljk ')
            
        # drop the punctuations
        string_array_cleaned[index] = re.sub(regex_punc, ' ', string_array_cleaned[index])
        
        # convert to lower case
        string_array_cleaned[index] = string_array_cleaned[index].lower()
        
        # substitute date code with 'djt'
        string_array_cleaned[index] = re.sub(regex_date, ' djt ', string_array_cleaned[index])
        
        # substitute path with 'pjt'
        string_array_cleaned[index] = re.sub(regex_path, ' pjt ', string_array_cleaned[index])
        
        # substitute alphanumerical words(e.g. ab0xec) with ajn
        string_array_cleaned[index] = re.sub(regex_hashcode, ' ajn ', string_array_cleaned[index])
        
        # substitute numbers with 'njm'
        string_array_cleaned[index] = re.sub(regex_num, ' njm ', string_array_cleaned[index])
        
        # substitute long words with 'ljg'
        string_array_cleaned[index] = re.sub(regex_long, ' ljg ', string_array_cleaned[index])
        
        # drop slash
        string_array_cleaned[index] = re.sub(regex_slash, ' ', string_array_cleaned[index])
        
        # drop all the trailing whitespaces( ) and dashes(-)
        string_array_cleaned[index] = re.sub(regex_space, ' ', string_array_cleaned[index])
        string_array_cleaned[index] = re.sub(regex_dash, '-', string_array_cleaned[index])
        
        # drop ''
        string_array_cleaned[index] = re.sub(regex_quote, ' ', string_array_cleaned[index])
        
        # reduce the number of tokens
        string_array_cleaned[index] = re.sub(regex_ejm, ' ejm ', string_array_cleaned[index])
        string_array_cleaned[index] = re.sub(regex_ljk, ' ljk ', string_array_cleaned[index])
        string_array_cleaned[index] = re.sub(regex_ajn, ' ajn ', string_array_cleaned[index])
        string_array_cleaned[index] = re.sub(regex_ljg, ' ljg ', string_array_cleaned[index])
        string_array_cleaned[index] = re.sub(regex_njm, ' njm ', string_array_cleaned[index])
        string_array_cleaned[index] = re.sub(regex_djt, ' djt ', string_array_cleaned[index])
        string_array_cleaned[index] = re.sub(regex_pjt, ' pjt ', string_array_cleaned[index])
    #     # replace \u3000 with whitespace
    #     request_short_descriptions_cleaned[index] = request_short_descriptions_cleaned[index].replace('\u3000', ' ')
    # display the processed data
    print('The character number of cleaned', array_name, 'is', sum([len(text) for text in string_array_cleaned]))
    display(pd.DataFrame({'before':string_array, 'after':string_array_cleaned})[:15])
    return string_array_cleaned
        
# call the clean function to clean the texts for times
#request_short_descriptions_cleaned = copy.deepcopy(request_short_descriptions)
#request_long_descriptions_cleaned = copy.deepcopy(request_long_descriptions)
request_full_descriptions_cleaned = copy.deepcopy(full_descriptions_Engish)
for i in range(5):
    #request_short_descriptions_cleaned = clean(request_short_descriptions_cleaned, "short descriptions")
    #request_long_descriptions_cleaned = clean(request_long_descriptions_cleaned, "long descriptions")
    request_full_descriptions_cleaned = clean(request_full_descriptions_cleaned, "full descriptions")

The character number of cleaned full descriptions is 16835957


Unnamed: 0,before,after
0,General Request: Acceso a Internet - Estudio C...,acceso a internet estudio contable argentina s...
1,General Request: request to forward email to s...,request to forward email to shared mailbox hi ...
2,General Request: Materila should be removed fr...,materila should be removed from gpm the materi...
3,General Request: INCLUIR LOTES EM PEDIDOS FATU...,incluir lotes em pedidos faturados boa noite a...
4,Request to remove SNOW Subcategory Request Gen...,request to remove snow subcategory request gen...
5,WIN 10 REIMAGE - Junwei Su Need to upgrade my ...,win njm reimage junwei su need to upgrade my l...
6,General Request: can you please extend GL code...,can you please extend gl code njm to plant njm...
7,General Request: Unipoint uploads Good afterno...,unipoint uploads good afternoon when i attach ...
8,General Request: need computer connected to la...,need computer connected to landline can you pl...
9,General Request: Report extraction from TRAX P...,report extraction from trax pr screen please a...


The character number of cleaned full descriptions is 16835137


Unnamed: 0,before,after
0,acceso a internet estudio contable argentina s...,acceso a internet estudio contable argentina s...
1,request to forward email to shared mailbox hi ...,request to forward email to shared mailbox hi ...
2,materila should be removed from gpm the materi...,materila should be removed from gpm the materi...
3,incluir lotes em pedidos faturados boa noite a...,incluir lotes em pedidos faturados boa noite a...
4,request to remove snow subcategory request gen...,request to remove snow subcategory request gen...
5,win njm reimage junwei su need to upgrade my l...,win njm reimage junwei su need to upgrade my l...
6,can you please extend gl code njm to plant njm...,can you please extend gl code njm to plant njm...
7,unipoint uploads good afternoon when i attach ...,unipoint uploads good afternoon when i attach ...
8,need computer connected to landline can you pl...,need computer connected to landline can you pl...
9,report extraction from trax pr screen please a...,report extraction from trax pr screen please a...


The character number of cleaned full descriptions is 16835105


Unnamed: 0,before,after
0,acceso a internet estudio contable argentina s...,acceso a internet estudio contable argentina s...
1,request to forward email to shared mailbox hi ...,request to forward email to shared mailbox hi ...
2,materila should be removed from gpm the materi...,materila should be removed from gpm the materi...
3,incluir lotes em pedidos faturados boa noite a...,incluir lotes em pedidos faturados boa noite a...
4,request to remove snow subcategory request gen...,request to remove snow subcategory request gen...
5,win njm reimage junwei su need to upgrade my l...,win njm reimage junwei su need to upgrade my l...
6,can you please extend gl code njm to plant njm...,can you please extend gl code njm to plant njm...
7,unipoint uploads good afternoon when i attach ...,unipoint uploads good afternoon when i attach ...
8,need computer connected to landline can you pl...,need computer connected to landline can you pl...
9,report extraction from trax pr screen please a...,report extraction from trax pr screen please a...


The character number of cleaned full descriptions is 16835089


Unnamed: 0,before,after
0,acceso a internet estudio contable argentina s...,acceso a internet estudio contable argentina s...
1,request to forward email to shared mailbox hi ...,request to forward email to shared mailbox hi ...
2,materila should be removed from gpm the materi...,materila should be removed from gpm the materi...
3,incluir lotes em pedidos faturados boa noite a...,incluir lotes em pedidos faturados boa noite a...
4,request to remove snow subcategory request gen...,request to remove snow subcategory request gen...
5,win njm reimage junwei su need to upgrade my l...,win njm reimage junwei su need to upgrade my l...
6,can you please extend gl code njm to plant njm...,can you please extend gl code njm to plant njm...
7,unipoint uploads good afternoon when i attach ...,unipoint uploads good afternoon when i attach ...
8,need computer connected to landline can you pl...,need computer connected to landline can you pl...
9,report extraction from trax pr screen please a...,report extraction from trax pr screen please a...


The character number of cleaned full descriptions is 16835089


Unnamed: 0,before,after
0,acceso a internet estudio contable argentina s...,acceso a internet estudio contable argentina s...
1,request to forward email to shared mailbox hi ...,request to forward email to shared mailbox hi ...
2,materila should be removed from gpm the materi...,materila should be removed from gpm the materi...
3,incluir lotes em pedidos faturados boa noite a...,incluir lotes em pedidos faturados boa noite a...
4,request to remove snow subcategory request gen...,request to remove snow subcategory request gen...
5,win njm reimage junwei su need to upgrade my l...,win njm reimage junwei su need to upgrade my l...
6,can you please extend gl code njm to plant njm...,can you please extend gl code njm to plant njm...
7,unipoint uploads good afternoon when i attach ...,unipoint uploads good afternoon when i attach ...
8,need computer connected to landline can you pl...,need computer connected to landline can you pl...
9,report extraction from trax pr screen please a...,report extraction from trax pr screen please a...


In [14]:
# # translate the non-English sentences
# translate_urls = ["translate.google.com", "translate.google.co.kr",
#                       "translate.google.at", "translate.google.de",
#                       "translate.google.ru", "translate.google.ch",
#                       "translate.google.fr", "translate.google.es"]
# sample_japanese_request = request_texts[10]
# translated_requests = []

# for index, request_raw in enumerate(request_texts):
#     translator = Translator(service_urls=translate_urls)
#     try:
#         request_tanslated = translator.translate(request_raw, dest='en')
#         translated_requests.append(request_tanslated.text)
#     except:
#         translated_requests.append('')
#         print(index)

In [15]:
# compare with Amazon review data
Amazon_review_data = pd.read_csv(AMAZON_REVIEW_PATH, usecols=[16])
Amazon_review_data.head(20)

Unnamed: 0,reviews.text
0,This product so far has not disappointed. My c...
1,great for beginner or experienced person. Boug...
2,Inexpensive tablet for him to use and learn on...
3,I've had my Fire HD 8 two weeks now and I love...
4,I bought this for my grand daughter when she c...
5,This amazon fire 8 inch tablet is the perfect ...
6,"Great for e-reading on the go, nice and light ..."
7,"I gave this as a Christmas gift to my inlaws, ..."
8,Great as a device to read books. I like that i...
9,I love ordering books and reading them with th...


In [17]:
# view stats of the requests
def view_stats (str_array, array_name):
    string_array = copy.deepcopy(str_array)
    
    avg_char = sum([len(text) for text in string_array]) / len(string_array)
    print("the average length of", array_name, "is:", avg_char, "characters")
    avg_word = sum([len(text.split(' ')) for text in string_array]) / len(string_array)
    print("the average word count of", array_name, "is:", avg_word)
    MAX_LENGTH = max([len(text.split(' ')) for text in string_array])
    print("the longest request has", MAX_LENGTH, "words")
    
    count = 0
    longest_texts = []
    longest_text = ''
    for index,text in enumerate(string_array):
        if len(text.split(' ')) > (avg_word):
            count += 1
        if len(text.split(' ')) == MAX_LENGTH:
            longest_text = text
            continue
        if len(text.split(' ')) > (avg_word*10):
            longest_texts.append(text)
            string_array[index] = text[:int(avg_word*10)]
    print(len(longest_texts), '(', 100*len(longest_texts)/len(string_array), "%) strings are longer than (10 * average). They are all cut to", int(avg_word*10))
    print(count, '(', int(100*count/len(string_array)), "%) strings are longer than average")
    print("the longest one is:", longest_text, )
    print('other longest strings are:')
    for text in longest_texts:
        print('length', len(text.split(' ')))
        print(text)
    
    # save the results
    with open(REQUEST_DATA_PATH, 'wb') as target:
        pickle.dump(string_array, target)
    target.close()
    print("the strings are pickled")
    
view_stats(request_full_descriptions_cleaned, 'full descriptions')

# from operator import itemgetter
# index, element = max(enumerate([len(text.split(' ')) for text in full_descriptions_ready]), key=itemgetter(1))
# full_descriptions_ready[index]

the average length of full descriptions is: 216.8296669328456 characters
the average word count of full descriptions is: 39.636317972231524
the longest request has 3454 words
69 ( 0.08886942634141315 %) strings are longer than (10 * average). They are all cut to 396
27865 ( 35 %) strings are longer than average
the longest one is: pds business report distribution list updates received from ejm hello please remove the following people from the pds business report distribution list thanks andi from microsoft outlook mailto ejm sent njm december njm to cavilla andrew subject undeliverable pds business report njm december njm delivery has failed to these recipients or groups ejm mailto ejm a problem occurred during the delivery of this message please try to resend the message later ejm mailto ejm a problem occurred during the delivery of this message please try to resend the message later the following organisation rejected your message ljk diagnostic information for administrators generat

# Text Vectorization (NOT Needed for This Project)

In [18]:
# import the dependencies
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

In [19]:
# vectorize the texts
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(full_descriptions_ready)

NameError: name 'full_descriptions_ready' is not defined

In [20]:
# # a manual approach
# wordCount = defaultdict(int)
# clean_rows = []
# for index, row in data.iterrows():
#     k = row['Subject']
#     if(isinstance(k, basestring)):
#         clean_rows.append(k)
#         for w in k.split():
#             wordCount[w] += 1
#     else:
#         clean_rows.append('')
# punctuation = set(string.punctuation)
# stemmed = []
# for k in clean_rows:
#     words = []
#     r = ''.join([c for c in k.lower() if not c in punctuation])
#     stemmed.append(r)
# data['cleaned'] = stemmed

# punctuation = set(string.punctuation)
# stemmed = []
# for k in clean_rows:
#     words = []
#     r = ''.join([c for c in k.lower() if not c in punctuation])
#     stemmed.append(r)
# data['cleaned'] = stemmed

# cleaned = []
# def remove_non_ascii(words):
#     """Remove non-ASCII characters from list of tokenized words"""
#     new_words = []
#     for word in words:
#         new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
#         new_words.append(new_word)
#     return new_words

# def replace_numbers(words):
#     """Replace all interger occurrences in list of tokenized words with textual representation"""
#     p = inflect.engine()
#     new_words = []
#     for word in words:
#         if word.isdigit():
#             new_word = p.number_to_words(word)
#             new_words.append(new_word)
#         else:
#             new_words.append(word)
#     return new_words

# def remove_stopwords(words):
#     """Remove stop words from list of tokenized words"""
#     new_words = []
#     for word in words:
#         if word not in stopwords.words('english'):
#             new_words.append(word)
#     return new_words

# def stem_words(words):
#     """Stem words in list of tokenized words"""
#     stemmer = LancasterStemmer()
#     stems = []
#     for word in words:
#         stem = stemmer.stem(word)
#         stems.append(stem)
#     return stems

# def lemmatize_verbs(words):
#     """Lemmatize verbs in list of tokenized words"""
#     lemmatizer = WordNetLemmatizer()
#     lemmas = []
#     for word in words:
#         lemma = lemmatizer.lemmatize(word, pos='v')
#         lemmas.append(lemma)
#     return lemmas

# def normalize(words):
#     words = remove_non_ascii(words)
#     words = replace_numbers(words)
#     words = remove_stopwords(words)
#     #words = stem_words(words)
#     words =lemmatize_verbs(words)
#     return words

# for item in data['cleaned']:
#     words = word_tokenize(item)
#     words = normalize(words)
#     cleaned.append(words)

# str_clean = []
# for x in cleaned:
#     str1 = ' '.join(x)
#     str_clean.append(str(str1))

In [21]:
# option 2: use a RNN encoder
# code from https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html?fbclid=IwAR0A9VArVExCiqW4N0xpDh42z1Mt8Ov0sDsjNhYnD_RuizJf6i3EioibhI8
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import time
import math

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [22]:
# create a word to number mapping
SOS_token = 0
EOS_token = 1


class Stats:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1
            
    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

eng_stats = Stats('eng')
for text in full_descriptions_ready:
    eng_stats.addSentence(text)

NameError: name 'full_descriptions_ready' is not defined

In [72]:
# prepare the data
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]

def tensorPairFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    t = torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)
    return (t, t)

In [75]:
# define the encoder and decoder RNN
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embeddi
        ng(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)
    
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_wei
                                 ghts.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [None]:
# training process
teacher_forcing_ratio = 0.5

def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

def trainIters(encoder, decoder, sentences, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [tensorPairFromSentence(random.choice(sentences))
                      for i in range(n_iters)]
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0
    showPlot(plot_losses)
    
# call the helper functions to train on the actual data
hidden_size = 256
encoder1 = EncoderRNN(eng_stats.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, eng_stats.n_words, dropout_p=0.1).to(device)

trainIters(encoder1, attn_decoder1, full_descriptions_ready, len(full_descriptions_ready)*0.8, print_every=5000)

In [352]:
MAX_LENGTH

3025

# Vector Clustering

In [45]:
# K-means approach

# define the cluster number
true_k = 7

# set up the model. try 10 random centroid seeds, with 300 iteration for each
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=300, n_init=10)
model.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=7, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [53]:
# examine the results
print("Top terms per cluster:")
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(true_k):
    print("Cluster %d:" % i),
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind]),
    print

Top terms per cluster:
Cluster 0:
 phone
 desk
 headset
 number
 new
 extension
 avaya
 need
 set
 calls
Cluster 1:
 office
 2013
 365
 microsoft
 upgrade
 2010
 version
 ms
 install
 plus
Cluster 2:
 need
 new
 computer
 request
 account
 link
 printer
 team
 add
 assign
Cluster 3:
 laptop
 new
 need
 loaner
 install
 software
 battery
 installed
 request
 docking
Cluster 4:
 sap
 gui
 pr1
 c11
 access
 need
 installation
 user
 d50
 new
Cluster 5:
 access
 need
 drive
 folder
 link
 provide
 shared
 request
 grant
 read
Cluster 6:
 email
 list
 distribution
 add
 address
 emails
 group
 remove
 change
 need


In [46]:
# make predictions
print("\n")
print("Prediction")
Y = vectorizer.transform(["email not working"])
prediction = model.predict(Y)
print(prediction)

Y = vectorizer.transform(["can't access the website"])
prediction = model.predict(Y)
print(prediction)



Prediction
[6]
[5]


In [52]:
from collections import Counter
print(Counter(model.labels_))

Counter({2: 49895, 5: 10431, 6: 4923, 3: 4078, 0: 3348, 4: 2963, 1: 2084})


Collecting collections-extended
  Downloading https://files.pythonhosted.org/packages/e6/bb/917ec8f030be3da7a4610f98dc6fc46a2d701433a1e6f82905f0cbd1be3f/collections_extended-1.0.2-py2.py3-none-any.whl
Installing collected packages: collections-extended
Successfully installed collections-extended-1.0.2
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


77805