In [376]:
import pandas as pd
import csv
import numpy as np

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
# from sklearn.metrics.pairwise import cosine_similarity

from Text_Preprocessing import preprocess_text

## Train

We start by training a classification model on the train_features.tsv file.

In [331]:
# Location of the training split (tab-separated), relative to the notebook's working directory.
training_features_path = "Data/MSDialog/Intent/train_features.tsv"

In [332]:
# Load the training split. iterrows() yields (index, row): the index holds the
# "_"-joined intent labels and the first column holds the space-separated
# feature values.
df = pd.read_csv(training_features_path, sep='\t')

y_train = []
feature_rows = []
for raw_label, row in df.iterrows():
    # A message may carry several intents, so keep them together as a tuple.
    y_train.append(tuple(raw_label.split("_")))
    # .iloc makes the positional access explicit; integer keys on a
    # label-indexed Series are deprecated as positional lookups in pandas.
    feature_rows.append(np.array(row.iloc[0].split(" "), dtype='float64'))

# Stack once at the end: this avoids the quadratic cost of growing the matrix
# with np.vstack on every row and removes the hard-coded feature width of 24
# (together with the dummy zero row that had to be sliced off afterwards).
x_train = np.vstack(feature_rows)

Here we use scikit-learn's `MultiLabelBinarizer`, which encodes our classes as multi-hot vectors. This is necessary because each message can have multiple labels.

In [334]:
# Learn the set of intent classes from the training labels and encode every
# label tuple as a multi-hot row.
# NOTE(review): y_train is overwritten in place, so re-running this cell
# without re-running the loader would fit on the already binarized matrix.
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(y_train)
# Show the learned classes; their order is the column order of y_train.
print(list(mlb.classes_))

['CQ', 'FD', 'FQ', 'GG', 'IR', 'JK', 'NF', 'O', 'OQ', 'PA', 'PF', 'RQ']


Here one should be able to replace RandomForestClassifier with another type of classifier. One can also adjust the parameter n_estimators

In [221]:
# Base estimator; random_state is fixed so repeated runs give the same forest.
forest = RandomForestClassifier(n_estimators=100, random_state=1)
# MultiOutputClassifier fits one copy of the base estimator per label column;
# n_jobs=-1 lets it use all available cores.
RandomForest = MultiOutputClassifier(forest, n_jobs=-1)
RandomForest.fit(x_train, y_train)

MultiOutputClassifier(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=1, verbose=0, warm_start=False),
           n_jobs=-1)

## Test

In this step, we evaluate our trained classifier with the intersection over union (IoU) between the predicted and the true label sets.

In [344]:
# Location of the test split (tab-separated), relative to the notebook's working directory.
test_features_path = "Data/MSDialog/Intent/test_features.tsv"

In [345]:
# Load the test split. iterrows() yields (index, row): the index holds the
# "_"-joined intent labels and the first column holds the space-separated
# feature values.
df = pd.read_csv(test_features_path, sep='\t')

y_test = []
feature_rows = []
for raw_label, row in df.iterrows():
    # A message may carry several intents, so keep them together as a tuple.
    y_test.append(tuple(raw_label.split("_")))
    # .iloc makes the positional access explicit; integer keys on a
    # label-indexed Series are deprecated as positional lookups in pandas.
    feature_rows.append(np.array(row.iloc[0].split(" "), dtype='float64'))

# Stack once at the end: this avoids the quadratic cost of growing the matrix
# with np.vstack on every row and removes the hard-coded feature width of 24
# (together with the dummy zero row that had to be sliced off afterwards).
x_test = np.vstack(feature_rows)

In [346]:
# Encode the test labels with the binarizer that was already fitted (transform,
# not fit_transform), so the columns line up with the training encoding.
# NOTE(review): y_test is overwritten in place; re-running this cell alone
# would feed the already binarized matrix back into transform.
y_test = mlb.transform(y_test)
print(list(mlb.classes_))

['CQ', 'FD', 'FQ', 'GG', 'IR', 'JK', 'NF', 'O', 'OQ', 'PA', 'PF', 'RQ']


In [347]:
# Predicted multi-hot label matrix for the test features (despite the x_ prefix
# this holds predictions, not inputs).
x_test_pred = RandomForest.predict(x_test)

In [348]:
# Per-message intersection over union (Jaccard index) between the true and the
# predicted multi-hot label vectors, computed for all rows at once.
true_mask = np.asarray(y_test, dtype=bool)
pred_mask = np.asarray(x_test_pred, dtype=bool)
N, D = true_mask.shape

intersection = np.logical_and(true_mask, pred_mask).sum(axis=1)
union = np.logical_or(true_mask, pred_mask).sum(axis=1)

# If neither side has any label the union is 0; the plain division then gave
# 0/0 = NaN, which turned the mean over all messages into NaN. Two empty label
# sets agree with each other, so such rows are scored 1.0 instead.
IoU = list(np.divide(intersection, union,
                     out=np.ones(N, dtype='float64'),
                     where=union > 0))

In [None]:
# Mean IoU over all test messages (displayed as the cell's result).
sum(IoU) / len(IoU)

**Result**  
Roughly 60% accuracy (mean IoU) for the Random Forest, which is in line with what the authors of the MSDialog papers reported.

## Experiment - Disentanglement

In [None]:
# Location of the synthetic interleaved message stream (tab-separated),
# relative to the notebook's working directory.
synthetic_dataset_path = "Data/MSDialog/Synthetic/Synthetic.tsv"

We start by printing the stream of messages as it is. The conversations are about:
* Emojis in Skype
* Transitions in PowerPoint
* Recovering lost letter in Email
* Changing neighborhood in Bing (whatever that means)

In [352]:
# Read the synthetic, interleaved message stream. iterrows() yields
# (index, row): the index is the user name, the first column holds the
# "_"-joined intent labels and the second column the utterance text.
df = pd.read_csv(synthetic_dataset_path, sep='\t')
labels = {}
utterance = {}
users = {}
for i, (user, row) in enumerate(df.iterrows()):
    # All three dicts are keyed by the message's position in the stream.
    users[i] = user
    # .iloc makes the positional column access explicit; integer keys on a
    # label-indexed Series are deprecated as positional lookups in pandas.
    labels[i] = row.iloc[0].split("_")
    # Keep only the text before the first end-of-utterance marker.
    utterance[i] = row.iloc[1].split("__eou__")[0]
    print(users[i] + ": " + utterance[i] + "\n")

Richard: how do i increase the size of an emoji in the conversation \? 

Steven: hello richard , welcome to skype community forum it is my regret to inform you that there s no way for us to increase the size of emoji on skype please do not hesitate to reply to this thread if you need further help 

Richard: thank you it would seem that the emojis are bigger as long as you send them one by one 

Tyler: i am missing the morph transition from my 2016 powerpoint 

Benjamin: i was writing a letter in e mail and did not save it when i transfered to another application is it still available and how do i find it \? 

Suzanne: hi tyler , the morph transition feature in powerpoint is only available if you have an office 365 subscription if you are an office 365 subscriber , make sure you have the latest version of office for more details use the morph transition in powerpoint regards , yogasuzanne 

Veronica: if you did n't save it while typing , it is probably gone for good windows 7 did not co

In [353]:
# Run every utterance through preprocess_text and re-join the tokens it returns
# with single spaces, giving one cleaned string per message (in stream order).
filtered = [" ".join(preprocess_text(text)) for text in utterance.values()]
    

First we take a little peek at the IDF weights, and at how the top and bottom words score.

In [371]:
# Count raw term occurrences per (preprocessed) message.
cv = CountVectorizer()
word_count_vector = cv.fit_transform(filtered)

# Fit the IDF weights on those counts.
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(word_count_vector)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); support both so the cell runs on old and new versions.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# One row per vocabulary term with its IDF weight.
df_idf = pd.DataFrame(tfidf_transformer.idf_, index=feature_names, columns=["idf_weights"])

# Displayed as the cell's result: terms sorted from lowest to highest IDF.
df_idf.sort_values(by=['idf_weights'])

Unnamed: 0,idf_weights
make,2.178655
bing,2.178655
yogasuzanne,2.466337
save,2.466337
reply,2.466337
...,...
forward,2.871802
forum,2.871802
fixed,2.871802
feedback,2.871802


In [372]:
# TF-IDF vectorizer with IDF weighting enabled; any further vectorizer options
# would be passed here.
tfidf_vectorizer=TfidfVectorizer(use_idf=True)

# Learn the vocabulary from the preprocessed messages and produce one TF-IDF
# row per message (row k corresponds to filtered[k]).
tfidf_vectorizer_vectors=tfidf_vectorizer.fit_transform(filtered)


In [373]:
def cosine_sim(vec1, vec2):
    """Return the cosine similarity between two vectors.

    Parameters
    ----------
    vec1, vec2 : array-like with a ``.T`` attribute
        Either 1-D arrays of equal length or column vectors of shape (n, 1).

    Returns
    -------
    The dot product of the two vectors divided by the product of their norms.
    For 1-D input this is a scalar; for column vectors it is the 1x1 result of
    ``vec1.T @ vec2``. If either vector has zero norm the similarity is
    undefined and 0.0 is returned, instead of the NaN (plus a runtime warning)
    that the bare division would produce.
    """
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denominator == 0:
        # An all-zero vector shares nothing with any other vector.
        return 0.0
    return np.dot(vec1.T, vec2) / denominator

In [374]:
# Greedy, single-pass disentanglement of the message stream.
#
# `conversations` maps a conversation id (the position of its first message) to
# a dict {message position: utterance text}. Every incoming message is attached
#   1. to the first conversation in which it mentions a participant by name,
#   2. otherwise to the conversation holding its most similar earlier message
#      (TF-IDF cosine similarity), provided that similarity clears the threshold,
#   3. otherwise to a new conversation of its own.

# Minimum cosine similarity for a message to join an existing conversation.
SIMILARITY_THRESHOLD = 0.05

conversations = {}
# The keys of `utterance` are the messages' positions in the stream, which are
# also their row indices in `tfidf_vectorizer_vectors`.
for i in utterance.keys():
    if not conversations:
        # The very first message always opens the first conversation.
        conversations[i] = {i: utterance[i]}
        continue

    # Loop-invariant for this message, so compute them once.
    current_text = utterance[i].lower()
    current_message = tfidf_vectorizer_vectors[i].toarray().ravel()

    mentioned_in = None
    # Best match over *all* conversations. This used to be reset inside the
    # conversation loop, so only the last conversation examined could ever win.
    max_similarity = {"id": None, "value": 0}
    for c_id, thread in conversations.items():
        # A name mention is treated as decisive: stop at the first conversation
        # with a participant whose name occurs in the new message.
        if any(users[m_id].lower() in current_text for m_id in thread):
            mentioned_in = c_id
            break

        for m_id in thread:
            old_message = tfidf_vectorizer_vectors[m_id].toarray().ravel()
            cos_sim = cosine_sim(current_message, old_message)
            # A NaN similarity (all-zero vector) compares False and is ignored.
            if cos_sim > max_similarity["value"]:
                max_similarity["value"] = cos_sim
                max_similarity["id"] = c_id

    # Conversation ids can be 0, hence the explicit comparisons against None.
    if mentioned_in is not None:
        conversations[mentioned_in][i] = utterance[i]
    elif max_similarity["id"] is not None and max_similarity["value"] > SIMILARITY_THRESHOLD:
        conversations[max_similarity["id"]][i] = utterance[i]
    else:
        conversations[i] = {i: utterance[i]}

Now we are ready to print the disentangled conversations. Note that there is one mistake: message 2 ends up in a conversation of its own. The highest TF-IDF scoring word in this message is 'one'. This word should probably be in the set of stopwords, but currently it is not.

In [361]:
# Print each conversation as "<message position> - <text>" lines, followed by a
# separator between conversations.
for thread in conversations.values():
    for m_id, text in thread.items():
        print("{} - {}".format(m_id, text))
    print("\n ---- \n")

0 - how do i increase the size of an emoji in the conversation \? 
1 - hello richard , welcome to skype community forum it is my regret to inform you that there s no way for us to increase the size of emoji on skype please do not hesitate to reply to this thread if you need further help 

 ---- 

2 - thank you it would seem that the emojis are bigger as long as you send them one by one 

 ---- 

3 - i am missing the morph transition from my 2016 powerpoint 
5 - hi tyler , the morph transition feature in powerpoint is only available if you have an office 365 subscription if you are an office 365 subscriber , make sure you have the latest version of office for more details use the morph transition in powerpoint regards , yogasuzanne 
7 - hi tyler , have my replies answered your question \? regards , yogasuzanne 

 ---- 

4 - i was writing a letter in e mail and did not save it when i transfered to another application is it still available and how do i find it \? 
6 - if you did n't save 

## Experiment - Classification

In [362]:
# Location of the feature file for the synthetic stream (tab-separated),
# relative to the notebook's working directory.
synthetic_features_path = "Data/MSDialog/Synthetic/Synthetic_Features.tsv"

In [363]:
# Load the features of the synthetic stream. iterrows() yields (index, row):
# the index holds the "_"-joined intent labels and the first column holds the
# space-separated feature values.
df = pd.read_csv(synthetic_features_path, sep='\t')

y_synthetic = []
feature_rows = []
for raw_label, row in df.iterrows():
    # A message may carry several intents, so keep them together as a tuple.
    y_synthetic.append(tuple(raw_label.split("_")))
    # .iloc makes the positional access explicit; integer keys on a
    # label-indexed Series are deprecated as positional lookups in pandas.
    feature_rows.append(np.array(row.iloc[0].split(" "), dtype='float64'))

# Stack once at the end: this avoids the quadratic cost of growing the matrix
# with np.vstack on every row and removes the hard-coded feature width of 24
# (together with the dummy zero row that had to be sliced off afterwards).
x_synthetic = np.vstack(feature_rows)

In [364]:
# Predicted multi-hot label matrix for the synthetic messages (despite the x_
# prefix this holds predictions, not inputs).
x_synthetic_pred = RandomForest.predict(x_synthetic)

In [365]:
# Print every disentangled conversation as
# "<message position> - <predicted intents> - <text>".
for c_id, thread in conversations.items():
    for m_id, text in thread.items():
        # Collect *all* classes whose indicator is set. Taking only the first
        # hit hid additional predicted labels and raised an IndexError whenever
        # the classifier predicted no class at all.
        predicted = [str(name) for name, flag in zip(mlb.classes_, x_synthetic_pred[m_id])
                     if flag > 0]
        # Multiple intents are joined with "_" as in the label files; "none"
        # marks a message for which no class was predicted.
        class_pred = "_".join(predicted) if predicted else "none"
        print(str(m_id) + " - " + class_pred + " - " + text)
    print("\n ---- \n")

0 - OQ - how do i increase the size of an emoji in the conversation \? 
1 - PA - hello richard , welcome to skype community forum it is my regret to inform you that there s no way for us to increase the size of emoji on skype please do not hesitate to reply to this thread if you need further help 

 ---- 

2 - FD - thank you it would seem that the emojis are bigger as long as you send them one by one 

 ---- 

3 - OQ - i am missing the morph transition from my 2016 powerpoint 
5 - PA - hi tyler , the morph transition feature in powerpoint is only available if you have an office 365 subscription if you are an office 365 subscriber , make sure you have the latest version of office for more details use the morph transition in powerpoint regards , yogasuzanne 
7 - IR - hi tyler , have my replies answered your question \? regards , yogasuzanne 

 ---- 

4 - OQ - i was writing a letter in e mail and did not save it when i transfered to another application is it still available and how do i f

In [366]:
# Encode the synthetic labels with the binarizer that was already fitted, so
# the columns line up with the classifier's outputs.
# NOTE(review): y_synthetic is overwritten in place; re-running this cell alone
# would feed the already binarized matrix back into transform.
y_synthetic = mlb.transform(y_synthetic)

In [367]:
# Per-message intersection over union (Jaccard index) between the true and the
# predicted multi-hot label vectors, computed for all rows at once.
true_mask = np.asarray(y_synthetic, dtype=bool)
pred_mask = np.asarray(x_synthetic_pred, dtype=bool)
N, D = pred_mask.shape

intersection = np.logical_and(true_mask, pred_mask).sum(axis=1)
union = np.logical_or(true_mask, pred_mask).sum(axis=1)

# If neither side has any label the union is 0; the plain division then gave
# 0/0 = NaN, which turned the mean over all messages into NaN. Two empty label
# sets agree with each other, so such rows are scored 1.0 instead.
IoU = list(np.divide(intersection, union,
                     out=np.ones(N, dtype='float64'),
                     where=union > 0))

In [368]:
# Mean IoU over all synthetic messages (displayed as the cell's result).
sum(IoU) / len(IoU)

0.8333333333333334

Good accuracy, but also an easy setting. Note that our predictor only predicts one class. Weird...

## Experiment - Topics
Now we mine topics. This can be done in multiple ways. Here we use the most significant word (the one with the highest TF-IDF weight) from each message as part of the overall topic of its conversation.

In [370]:
# Reverse view of the fitted vocabulary (column index -> term), built once so
# every lookup below is a plain dict access.
column_to_term = {column: term for term, column in tfidf_vectorizer.vocabulary_.items()}

# For each conversation, print its messages followed by the list of their
# highest-weighted TF-IDF terms (one term per message).
for thread in conversations.values():
    topics = []
    for m_id, text in thread.items():
        # Column of the largest TF-IDF weight in this message's row.
        max_elem = np.argmax(tfidf_vectorizer_vectors[m_id].todense())
        topics.append(column_to_term[max_elem])
        print(" - " + text)
    print(topics)
    print("--\n")

 - how do i increase the size of an emoji in the conversation \? 
 - hello richard , welcome to skype community forum it is my regret to inform you that there s no way for us to increase the size of emoji on skype please do not hesitate to reply to this thread if you need further help 
['conversation', 'skype']
--

 - thank you it would seem that the emojis are bigger as long as you send them one by one 
['one']
--

 - i am missing the morph transition from my 2016 powerpoint 
 - hi tyler , the morph transition feature in powerpoint is only available if you have an office 365 subscription if you are an office 365 subscriber , make sure you have the latest version of office for more details use the morph transition in powerpoint regards , yogasuzanne 
 - hi tyler , have my replies answered your question \? regards , yogasuzanne 
['2016', 'office', 'answered']
--

 - i was writing a letter in e mail and did not save it when i transfered to another application is it still available and ho