# 0 Data

In [1]:
def topicData():
    """Load the "topic" training sentences.

    Reads ``../training_data/topic.csv`` (no header row) and returns the
    values of its first column as a NumPy array.
    """
    frame = pd.read_csv('../training_data/topic.csv', header=None)
    # Only the first column is used.
    return frame.values[:, 0]

def eventData():
    """Load the "event" training sentences.

    Reads ``../training_data/event.csv`` (no header row) and returns the
    values of its first column as a NumPy array.
    """
    frame = pd.read_csv('../training_data/event.csv', header=None)
    # Only the first column is used.
    return frame.values[:, 0]

def callData():
    """Load the "call" training sentences.

    Reads ``../training_data/call.csv`` (no header row) and returns the
    values of its first column as a NumPy array.
    """
    frame = pd.read_csv('../training_data/call.csv', header=None)
    # Only the first column is used.
    return frame.values[:, 0]

def waterData():
    """Load the "water" training sentences.

    Reads ``../training_data/water.csv`` (no header row) and returns the
    values of its first column as a NumPy array.
    """
    frame = pd.read_csv('../training_data/water.csv', header=None)
    # Only the first column is used.
    return frame.values[:, 0]

def visitorData():
    """Load the "visitor" training sentences.

    Reads ``../training_data/visitor.csv`` (no header row) and returns the
    values of its first column as a NumPy array.
    """
    frame = pd.read_csv('../training_data/visitor.csv', header=None)
    # Only the first column is used.
    return frame.values[:, 0]

# 1 English vocab list

In [2]:
import pandas as pd
import numpy as np
from stemming.porter2 import stem

def getVocabList():
    """Load the full vocabulary table from ``../vocab2.csv``.

    The CSV column named ``'0'`` holds the words.  The returned DataFrame
    has a ``number`` column (row index + 1, so numbering starts at 1)
    followed by a ``word`` column; the original ``'0'`` column is removed.
    """
    table = pd.read_csv('../vocab2.csv')
    # Add the numbering first so the final column order is number, word.
    table['number'] = table.index + 1
    # Move column '0' to the end of the frame under the name 'word'.
    table['word'] = table.pop('0')
    return table

In [3]:
print(getVocabList().shape)
getVocabList()[0:10]

(223737, 2)


Unnamed: 0,number,word
0,1,a
1,2,aa
2,3,aaa
3,4,aah
4,5,aal
5,6,aalii
6,7,aam
7,8,aani
8,9,aardvark
9,10,aardwolf


# 2 Process text to feature vector

In [4]:
import re
from stemming.porter2 import stem

def processText(email_contents):
    """Convert raw text into a list of vocabulary numbers.

    The text is lower-cased, normalised with a series of regular-expression
    substitutions, split into words, stemmed with ``stem`` and mapped to the
    ``number`` column of the vocabulary returned by ``getVocabList()``.
    Words that are not in the vocabulary are dropped.

    Parameters
    ----------
    email_contents : str
        Text to process.

    Returns
    -------
    list
        One vocabulary number per recognised word, in order of appearance
        (a repeated word yields a repeated number).
    """
    vocabList = getVocabList()

    # Build the word -> number mapping once so each word costs a dict lookup
    # instead of two full scans of the vocabulary.  ``setdefault`` keeps the
    # first occurrence of a duplicated word, matching the previous
    # ``.values[0]`` behaviour.
    word_to_number = {}
    for vocab_word, number in zip(vocabList['word'],
                                  vocabList['number'].astype(int).values):
        word_to_number.setdefault(vocab_word, number)

    # ----- Normalise the text -----
    text = email_contents.lower()

    # NOTE(review): despite the original "strip HTML" intent, this pattern is
    # a character class: it deletes every '<', '>', ',' and '*' character and
    # leaves tag names in place.  Kept as-is so tokenisation stays consistent
    # with previously generated vocabulary/feature files.
    text = re.sub(r'[<>,*]', '', text)

    # Turn every whitespace character into a plain space.
    text = re.sub(r'\s', ' ', text)

    # Replace each run of digits with the token "number".
    text = re.sub(r'\d+', 'number', text)

    # Replace http:// and https:// URLs with the token "httpaddr".
    text = re.sub(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        'httpaddr',
        text,
    )

    # Replace e-mail addresses (anything@anything) with "emailaddr".
    text = re.sub(r'[\w\.-]+@[\w\.-]+', 'emailaddr', text)

    # Replace each run of '$' with the token "dollar".
    text = re.sub(r'[$]+', 'dollar', text)

    # Collapse every run of non-alphanumeric characters into one space.
    text = re.sub(r'[^a-zA-Z0-9]+', ' ', text)

    # ----- Stem words -----
    # Only letters, digits and spaces remain, so a plain split() yields
    # exactly the non-empty words.
    words = [stem(word) for word in text.split()]

    # ----- Map stems to vocabulary numbers, skipping unknown words -----
    word_indices = [word_to_number[w] for w in words if w in word_to_number]

    return word_indices

In [5]:
processText("Send someone to repair my bathroom.")

[171879, 178439, 196596, 161447, 118766, 18448]

In [6]:
def textFeatures(word_indices):
    """Build a 0/1 feature vector over the full vocabulary.

    Entry ``i`` is 1 when the ``number`` of vocabulary row ``i`` (as returned
    by ``getVocabList()``) appears in ``word_indices`` and 0 otherwise.
    """
    numbers = getVocabList()['number'].astype(int)
    present = numbers.isin(word_indices)
    # Adding 0 turns the boolean mask into integers.
    return np.array(present + 0)

In [7]:
textFeatures(processText("Send someone to repair my bathroom."))

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

# 3 Initial features vector for each menu

In [8]:
def getInitialFeatureVectoc():
    """Build one binary vocabulary vector per menu from the training data.

    For each of the five training sets (topic, event, call, water, visitor)
    every sentence is converted with ``textFeatures(processText(...))`` and
    the results are summed; an entry of the final vector is 1 when the
    corresponding vocabulary word occurs in at least one sentence of that
    set and 0 otherwise.

    Returns
    -------
    numpy.ndarray
        Array with five rows, in the order topic, event, call, water,
        visitor; each row has one integer 0/1 entry per vocabulary word.
    """
    datasets = [topicData(), eventData(), callData(), waterData(), visitorData()]

    # Read the vocabulary size once instead of re-loading the CSV per menu.
    vocab_size = len(getVocabList())

    rows = []
    for sentences in datasets:
        counts = np.zeros(vocab_size)
        for sentence in sentences:
            counts = counts + textFeatures(processText(sentence))
        # Collapse occurrence counts to presence/absence.
        rows.append((counts >= 1).astype(int))

    return np.array(rows)

In [9]:
initial_features = getInitialFeatureVectoc()
initial_features

array([[1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

# 4 Compare similarity

In [10]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

def most_similarity(initial_features,text_feature):
    """Name the menu whose feature vector is most similar to ``text_feature``.

    ``text_feature`` is stacked below ``initial_features``, the pairwise
    cosine similarities of all rows are computed and printed, and the
    similarities between the text (last row) and every menu row are compared.
    Returns ``"other"`` when the best similarity is below 0.3; otherwise
    ``"topic"``, ``"event"``, ``"call"``, ``"water"`` for best row 0-3 and
    ``"visitor"`` for any other row.
    """
    stacked = sparse.csr_matrix(np.vstack((initial_features, text_feature)))
    similarities = cosine_similarity(stacked)
    print('pairwise dense output:\n {}\n'.format(similarities))

    # Last row belongs to the text; its last entry is the self-similarity.
    scores = similarities[-1, :-1]
    best_score = np.max(scores)
    best_index = np.argmax(scores)

    if best_score < 0.3:
        return "other"

    menu_names = {0: "topic", 1: "event", 2: "call", 3: "water"}
    return menu_names.get(best_index, "visitor")


In [11]:
most_similarity(initial_features,textFeatures(processText("I have to leave the front of the room to call it.")))

pairwise dense output:
 [[ 1.          0.07715167  0.3678836   0.10101525  0.0805823   0.16903085]
 [ 0.07715167  1.          0.06622662  0.          0.17407766  0.        ]
 [ 0.3678836   0.06622662  1.          0.086711    0.13834289  0.2901905 ]
 [ 0.10101525  0.          0.086711    1.          0.11396058  0.11952286]
 [ 0.0805823   0.17407766  0.13834289  0.11396058  1.          0.09534626]
 [ 0.16903085  0.          0.2901905   0.11952286  0.09534626  1.        ]]



'other'

# 5 Optimization

### Vocab List

In [12]:
def createVocabOpt():
    """Write a reduced vocabulary containing only words seen in training data.

    Loads ``../vocab2.csv``, converts every sentence of the five training
    sets (topic, event, call, water, visitor) with
    ``textFeatures(processText(...))``, keeps the vocabulary rows that were
    hit at least once, renames column ``'0'`` to ``word`` and writes the
    result to ``../vocab_opt.csv`` without the index.

    Returns
    -------
    str
        The literal ``"Create Done!"``.
    """
    vocabs = pd.read_csv('../vocab2.csv')

    datasets = [topicData(), eventData(), callData(), waterData(), visitorData()]

    # Total number of times each vocabulary row was hit across all sets.
    all_words = np.zeros(len(vocabs))
    for sentences in datasets:
        for sentence in sentences:
            all_words = all_words + textFeatures(processText(sentence))

    # Take an explicit copy: adding/removing columns on a filtered slice of
    # the original frame triggers pandas' SettingWithCopyWarning.
    vocabs = vocabs[all_words >= 1].copy()
    vocabs['word'] = vocabs.pop('0')

    vocabs.to_csv('../vocab_opt.csv',index=False)

    return "Create Done!"

In [13]:
createVocabOpt()

'Create Done!'

# ------------------------------ END Train -----------------------------

In [14]:
def getVocabListOpt():
    """Load the reduced vocabulary table from ``../vocab_opt.csv``.

    Adds a ``number`` column equal to the row index + 1, so the numbering
    starts at 1, and returns the DataFrame.
    """
    table = pd.read_csv('../vocab_opt.csv')
    row_numbers = table.index + 1
    table['number'] = row_numbers
    return table

In [15]:
print(getVocabListOpt().shape)
getVocabListOpt().head()

(51, 2)


Unnamed: 0,word,number
0,a,1
1,activ,2
2,and,3
3,are,4
4,ask,5


In [16]:
import re
from stemming.porter2 import stem

def processTextOpt(email_contents):
    """Convert raw text into a list of reduced-vocabulary numbers.

    The text is lower-cased, normalised with a series of regular-expression
    substitutions, split into words, stemmed with ``stem`` and mapped to the
    ``number`` column of the vocabulary returned by ``getVocabListOpt()``.
    Words that are not in that vocabulary are dropped.

    Parameters
    ----------
    email_contents : str
        Text to process.

    Returns
    -------
    list
        One vocabulary number per recognised word, in order of appearance
        (a repeated word yields a repeated number).
    """
    vocabList = getVocabListOpt()

    # Build the word -> number mapping once so each word costs a dict lookup
    # instead of two full scans of the vocabulary.  ``setdefault`` keeps the
    # first occurrence of a duplicated word, matching the previous
    # ``.values[0]`` behaviour.
    word_to_number = {}
    for vocab_word, number in zip(vocabList['word'],
                                  vocabList['number'].astype(int).values):
        word_to_number.setdefault(vocab_word, number)

    # ----- Normalise the text -----
    text = email_contents.lower()

    # NOTE(review): despite the original "strip HTML" intent, this pattern is
    # a character class: it deletes every '<', '>', ',' and '*' character and
    # leaves tag names in place.  Kept as-is so tokenisation stays consistent
    # with previously generated vocabulary/feature files.
    text = re.sub(r'[<>,*]', '', text)

    # Turn every whitespace character into a plain space.
    text = re.sub(r'\s', ' ', text)

    # Replace each run of digits with the token "number".
    text = re.sub(r'\d+', 'number', text)

    # Replace http:// and https:// URLs with the token "httpaddr".
    text = re.sub(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        'httpaddr',
        text,
    )

    # Replace e-mail addresses (anything@anything) with "emailaddr".
    text = re.sub(r'[\w\.-]+@[\w\.-]+', 'emailaddr', text)

    # Replace each run of '$' with the token "dollar".
    text = re.sub(r'[$]+', 'dollar', text)

    # Collapse every run of non-alphanumeric characters into one space.
    text = re.sub(r'[^a-zA-Z0-9]+', ' ', text)

    # ----- Stem words -----
    # Only letters, digits and spaces remain, so a plain split() yields
    # exactly the non-empty words.
    words = [stem(word) for word in text.split()]

    # ----- Map stems to vocabulary numbers, skipping unknown words -----
    word_indices = [word_to_number[w] for w in words if w in word_to_number]

    return word_indices

In [17]:
processTextOpt("Send someone to repair my bathroom.")

[36, 39, 45, 35, 27, 6]

In [18]:
def textFeaturesOpt(word_indices):
    """Build a 0/1 feature vector over the reduced vocabulary.

    Entry ``i`` is 1 when the ``number`` of vocabulary row ``i`` (as returned
    by ``getVocabListOpt()``) appears in ``word_indices`` and 0 otherwise.
    """
    numbers = getVocabListOpt()['number'].astype(int)
    present = numbers.isin(word_indices)
    # Adding 0 turns the boolean mask into integers.
    return np.array(present + 0)

In [19]:
textFeaturesOpt(processTextOpt("Send someone to repair my bathroom."))

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0])

In [20]:
def getInitialFeatureVectorOpt():
    """Build one binary reduced-vocabulary vector per menu from training data.

    For each of the five training sets (topic, event, call, water, visitor)
    every sentence is converted with ``textFeaturesOpt(processTextOpt(...))``
    and the results are summed; an entry of the final vector is 1 when the
    corresponding vocabulary word occurs in at least one sentence of that
    set and 0 otherwise.

    Returns
    -------
    numpy.ndarray
        Array with five rows, in the order topic, event, call, water,
        visitor; each row has one integer 0/1 entry per vocabulary word.
    """
    datasets = [topicData(), eventData(), callData(), waterData(), visitorData()]

    # Read the vocabulary size once instead of re-loading the CSV per menu.
    vocab_size = len(getVocabListOpt())

    rows = []
    for sentences in datasets:
        counts = np.zeros(vocab_size)
        for sentence in sentences:
            counts = counts + textFeaturesOpt(processTextOpt(sentence))
        # Collapse occurrence counts to presence/absence.
        rows.append((counts >= 1).astype(int))

    return np.array(rows)

In [21]:
initial_features_opt = getInitialFeatureVectorOpt()
initial_features_opt

array([[1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0,
        1, 0, 1, 0, 0, 0, 0],
       [0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 1, 0, 1, 0, 1, 1],
       [1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0,
        1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1,
        1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
        0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0]])

### Homecare
- ผนังห้องนอนร้าวส่งช่างมาซ่อมหน่อย --> The bedroom is cracked and repaired.

### Event / Activity
- สอบถามกิจกรรมดูหนัง --> View Forum Posts

### ค่าน้ำ (Water bill)
- ค่าน้ำเดือนที่เเล้วเท่าไหร่ --> How much water monthly?

### Call - Negative
- ไฟดับนานมาก ดับมา 2 ชั่วโมงแล้ว มาซ่อมสักที --> The power went out for 2 hours and then came back.

### สอบถามทั่วไป เกี่ยวกับโครงการ (General inquiries about the project)
- สอบถามโครงการเซนทริกรัชโยธินครับ --> Ask for the Centric Ratchayothin project.

In [27]:
most_similarity(initial_features_opt,textFeaturesOpt(processTextOpt("Ask for the Centric Ratchayothin project.")))

pairwise dense output:
 [[ 1.          0.07715167  0.3678836   0.10101525  0.0805823   0.11952286]
 [ 0.07715167  1.          0.06622662  0.          0.17407766  0.12909944]
 [ 0.3678836   0.06622662  1.          0.086711    0.13834289  0.20519567]
 [ 0.10101525  0.          0.086711    1.          0.11396058  0.16903085]
 [ 0.0805823   0.17407766  0.13834289  0.11396058  1.          0.67419986]
 [ 0.11952286  0.12909944  0.20519567  0.16903085  0.67419986  1.        ]]



'visitor'