# Aspect-Based Sentiment Modeling for Training Data

## Reading Community data 

In [1]:
import pandas as pd

# Load the community discussion messages from the CSV file into df1.
# NOTE(review): the recorded output shows an `Unnamed: 0` column, which is
# presumably an index written by an earlier `to_csv` -- confirm, and consider
# `index_col=0` if so.
df1 = pd.read_csv('discussion_dna_messages.csv')

from bs4 import BeautifulSoup
import re

def cleanhtml(raw_html):
    """Strip tag-like spans from ``raw_html`` and replace ``&nbsp;`` with a space.

    Every non-greedy ``<...>`` match on a single line is removed; the literal
    entity text ``&nbsp;`` is then replaced by one space. No other entities
    are decoded.

    Args:
        raw_html: String that may contain markup.

    Returns:
        The input with tags removed and ``&nbsp;`` replaced.
    """
    without_tags = re.sub(r'<.*?>', '', raw_html)
    return without_tags.replace('&nbsp;', ' ')

def url_to_string(html):
    """Return the visible text of an HTML fragment as a single line.

    The value is stringified, parsed with the ``html5lib`` parser, and all
    ``script``, ``style`` and ``aside`` elements are removed. Runs of
    newlines/tabs in the remaining text are then replaced by one space each.

    Args:
        html: Markup to parse; it is passed through ``str()`` first.

    Returns:
        The extracted text with newline/tab runs replaced by single spaces.
    """
    parsed = BeautifulSoup(str(html), 'html5lib')
    # Drop elements whose content is not part of the readable message.
    for unwanted in parsed.find_all(["script", "style", 'aside']):
        unwanted.extract()
    pieces = re.split(r'[\n\t]+', parsed.get_text())
    return " ".join(pieces)

# Turn each raw message body into plain text: extract the text with
# BeautifulSoup first, then strip any tag-like remnants and `&nbsp;`.
df1['parsed_body'] = [cleanhtml(url_to_string(raw_body)) for raw_body in df1['body']]
df1.head()

Unnamed: 0,post_type,thread_id,message_id,parent_id,view_href,title,body,accepted_solution,tags,labels,last_edit_time,parsed_body
0,QUESTION,3823005,3823005,,https://community.cisco.com/t5/cisco-digital-n...,DNAC PnP: Converting IOS-XE Software from Inst...,"&lt;P&gt;Hello everybody,&lt;/P&gt;&lt;P&gt;&a...",False,,Cisco DNA Center,2019-03-29T06:31:10.071-07:00,"Hello everybody, we try to provision our switc..."
1,REPLY,3823005,3828873,3823005.0,https://community.cisco.com/t5/cisco-digital-n...,Re: DNAC PnP: Converting IOS-XE Software from ...,"&lt;P&gt;Hello everyone,&lt;/P&gt;&lt;P&gt;&am...",True,,,2019-03-29T06:31:10.071-07:00,"Hello everyone, this behavoir belongs to a new..."
2,REPLY,3823005,3824368,3823005.0,https://community.cisco.com/t5/cisco-digital-n...,Re: DNAC PnP: Converting IOS-XE Software from ...,&lt;P&gt;After some tests it seems to be a spe...,True,,,2019-03-29T06:31:10.071-07:00,After some tests it seems to be a special prob...
3,QUESTION,3811694,3811694,,https://community.cisco.com/t5/cisco-digital-n...,add ip address DNAC,&lt;P&gt;When setup was run for DNAC 10gbit po...,False,,Cisco DNA Center,2019-03-28T11:35:39.339-07:00,When setup was run for DNAC 10gbit port was no...
4,REPLY,3811694,3828414,3811694.0,https://community.cisco.com/t5/cisco-digital-n...,Re: add ip address DNAC,&lt;P&gt;Make sure no spaces when ip address i...,True,,,2019-03-28T11:35:39.339-07:00,Make sure no spaces when ip address is entered...


In [2]:
# Write df1 to the "dna_discussion" sheet of output.xlsx.
df1.to_excel("output.xlsx", sheet_name="dna_discussion")

In [3]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk

# For every parsed body: tokenise, POS-tag, then named-entity chunk with NLTK.
df1['pos_tagged_body'] = [
    ne_chunk(pos_tag(word_tokenize(body_text)))
    for body_text in df1['parsed_body']
]

df1.head()

Unnamed: 0,post_type,thread_id,message_id,parent_id,view_href,title,body,accepted_solution,tags,labels,last_edit_time,parsed_body,pos_tagged_body
0,QUESTION,3823005,3823005,,https://community.cisco.com/t5/cisco-digital-n...,DNAC PnP: Converting IOS-XE Software from Inst...,"&lt;P&gt;Hello everybody,&lt;/P&gt;&lt;P&gt;&a...",False,,Cisco DNA Center,2019-03-29T06:31:10.071-07:00,"Hello everybody, we try to provision our switc...","[[(Hello, NNP)], (everybody, NN), (,, ,), (we,..."
1,REPLY,3823005,3828873,3823005.0,https://community.cisco.com/t5/cisco-digital-n...,Re: DNAC PnP: Converting IOS-XE Software from ...,"&lt;P&gt;Hello everyone,&lt;/P&gt;&lt;P&gt;&am...",True,,,2019-03-29T06:31:10.071-07:00,"Hello everyone, this behavoir belongs to a new...","[[(Hello, NNP)], (everyone, NN), (,, ,), (this..."
2,REPLY,3823005,3824368,3823005.0,https://community.cisco.com/t5/cisco-digital-n...,Re: DNAC PnP: Converting IOS-XE Software from ...,&lt;P&gt;After some tests it seems to be a spe...,True,,,2019-03-29T06:31:10.071-07:00,After some tests it seems to be a special prob...,"[(After, IN), (some, DT), (tests, NNS), (it, P..."
3,QUESTION,3811694,3811694,,https://community.cisco.com/t5/cisco-digital-n...,add ip address DNAC,&lt;P&gt;When setup was run for DNAC 10gbit po...,False,,Cisco DNA Center,2019-03-28T11:35:39.339-07:00,When setup was run for DNAC 10gbit port was no...,"[(When, WRB), (setup, NN), (was, VBD), (run, V..."
4,REPLY,3811694,3828414,3811694.0,https://community.cisco.com/t5/cisco-digital-n...,Re: add ip address DNAC,&lt;P&gt;Make sure no spaces when ip address i...,True,,,2019-03-28T11:35:39.339-07:00,Make sure no spaces when ip address is entered...,"[[(Make, NNP)], (sure, JJ), (no, DT), (spaces,..."


In [4]:
import spacy
import math
# Show the installed spaCy version and load the `en_core_web_sm` pipeline
# that the tagger functions in this cell call through `nlp`.
# NOTE(review): the recorded output of this cell ends in
# "ValueError: Cannot create vectors table with dimension 0" -- presumably a
# spaCy/model version mismatch; confirm the environment before re-running.
print(spacy.__version__)
nlp = spacy.load('en_core_web_sm')
def prop_n_tagger(text):
    """Return the proper-noun tokens of ``text``.

    Runs the module-level ``nlp`` pipeline over ``text`` and collects, in
    document order, the text of each token whose coarse part-of-speech
    (``pos_``) is ``'PROPN'``.

    Args:
        text: The string to analyse.

    Returns:
        A list of token strings (possibly empty, duplicates preserved).
    """
    return [tok.text for tok in nlp(text) if tok.pos_ == 'PROPN']

def nn_tagger(text):
    """Return the tokens of ``text`` carrying the fine-grained tag ``NN``.

    Runs the module-level ``nlp`` pipeline over ``text`` and collects, in
    document order, the text of each token whose ``tag_`` equals ``'NN'``.

    Args:
        text: The string to analyse.

    Returns:
        A list of token strings (possibly empty, duplicates preserved).
    """
    return [tok.text for tok in nlp(text) if tok.tag_ == 'NN']
# Proper nouns feed the `targets` column; NN-tagged tokens feed `aspects`.
df1['spacy_prop_n_body'] = list(map(prop_n_tagger, df1['parsed_body']))
df1['spacy_prop_n_title'] = list(map(prop_n_tagger, df1['title']))
df1['spacy_nn_body'] = list(map(nn_tagger, df1['parsed_body']))
df1['spacy_nn_title'] = list(map(nn_tagger, df1['title']))

# Build both result columns as plain Python lists and assign them once.
# The previous version wrote cell-by-cell through chained indexing
# (`df1['targets'][index] = ...`), which triggers SettingWithCopyWarning and
# does not modify the frame when pandas copy-on-write is enabled.
targets_per_row = []
aspects_per_row = []
row_fields = zip(
    df1['spacy_prop_n_body'],
    df1['spacy_prop_n_title'],
    df1['spacy_nn_body'],
    df1['spacy_nn_title'],
    df1['labels'],
)
for prop_n_body, prop_n_title, nn_body, nn_title, label in row_fields:
    proper_nouns = prop_n_body + prop_n_title
    if isinstance(label, str):
        # A proper noun that occurs inside the label text is replaced by the
        # full label; any other proper noun is kept as-is. Duplicates collapse.
        row_targets = {label if noun in label else noun for noun in proper_nouns}
    else:
        # Missing label (NaN/None): the de-duplicated proper nouns are the targets.
        row_targets = set(proper_nouns)
    targets_per_row.append(list(row_targets))
    aspects_per_row.append(nn_body + nn_title)

# Wrapping in a Series keeps each cell a list and aligns on df1's own index.
df1['targets'] = pd.Series(targets_per_row, index=df1.index)
df1['aspects'] = pd.Series(aspects_per_row, index=df1.index)
df1.head()

            

2.0.9


ValueError: Cannot create vectors table with dimension 0.
If you're using pre-trained vectors, are the vectors loaded?

In [4]:
# Blank the titles of replies so that, after concatenation, each thread keeps
# only the title(s) of its non-reply posts.
df1.loc[df1['post_type'] == 'REPLY', 'title'] = ""

# Collapse to one row per thread by concatenating the text columns.
# Selecting the columns on the groupby avoids having to drop `thread_id`
# afterwards; the old `drop("thread_id", 1)` passed `axis` positionally, which
# pandas 2.0 no longer accepts.
# NOTE(review): `sum` joins the posts of a thread with no separator, so the
# last word of one post may run into the first word of the next -- confirm
# whether a space should be inserted.
df1 = (
    df1.groupby("thread_id")[["title", "parsed_body"]]
    .agg(lambda posts: posts.sum())
)
df1.head(5)

Unnamed: 0_level_0,title,parsed_body
thread_id,Unnamed: 1_level_1,Unnamed: 2_level_1
3186932,What’s different in SD-Access from campus fabric?,This link is the best content as far Cisco gav...
3186935,Questions about DNA,"Hello Mohammad, So is it safe to say that in..."
3193060,2960-Plus 24LC-L issue with lanlite image,"Hi there, The feature set lanlite/ lanbase is ..."
3195145,Ask the Expert: Network of the future - Softwa...,Hi Enrique What’s the difference between SDN a...
3225592,Using APIC-EM for bandwidth management,"Hi all, I have a customer interested in impl..."


In [5]:
# Load the "dna_data" sheet of the export workbook, using `conversation_uid`
# as the index.
df2=pd.read_excel("export2_dna_data_from_201708_to_20190327.xlsx",sheet_name="dna_data", index_col="conversation_uid")
df2.head()

Unnamed: 0_level_0,month,post_date,board_uid,board_title,post_type,conversation_title,user_uid,user_login,user_sso_id,category2,url_a,tags,labels,replies,kudos_received,solution_accepted,pg_vws,ev_clks
conversation_uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
3679256,2018-08-01 12:00:00,2018-08-01,1025,Digital Network Architecture (DNA),forum,Orchestrate DNA Lab Build Up,310855,umahar,umahar,4461-network-infrastructure,https://community.cisco.com/t5/-/-/m-p/3679256,[],"[""Automation,Cisco Digital Network Architectur...",10,0,0,68,207
3682246,2018-08-01 12:00:00,2018-08-06,1025,Digital Network Architecture (DNA),forum,Cisco DNA application system requirements,665911,graeme.antrobus,graeme.antrobus,4461-network-infrastructure,https://community.cisco.com/t5/-/-/m-p/3682246,"[""system requirements,Cisco DNA""]","[""Cisco DNA Center"",""Cisco Digital Network Arc...",16,16,16,253,773
3683177,2018-08-01 12:00:00,2018-08-07,1025,Digital Network Architecture (DNA),forum,c9300 DNA subscriptions,310636,abdulkarim041,abdulkarim041,4461-network-infrastructure,https://community.cisco.com/t5/-/-/m-p/3683177,"[""c9300 DNA""]","[""Cisco DNA Center"",""Cisco Digital Network Arc...",64,16,0,477,1852
3687913,2018-08-01 12:00:00,2018-08-14,1025,Digital Network Architecture (DNA),forum,Cisco DNA Multiple Fabric Domains,312017,069521139,069521139,4461-network-infrastructure,https://community.cisco.com/t5/-/-/m-p/3687913,[],"[""Cisco DNA Center"",""Cisco Digital Network Arc...",12,0,12,142,426
3689146,2018-08-01 12:00:00,2018-08-15,1025,Digital Network Architecture (DNA),forum,CSCvi01378 workaround not working for DNAC,325330,ammahend,ammahend,4461-network-infrastructure,https://community.cisco.com/t5/-/-/m-p/3689146,"[""CSCvi01378""]","[""Cisco DNA Center"",""Cisco Digital Network Arc...",32,0,0,77,243


In [6]:
# Keep only the engagement metrics and labels, then inner-join them onto the
# per-thread text frame by matching the two indexes.
engagement_columns = ["kudos_received", "solution_accepted", "pg_vws", "ev_clks", "labels"]
df2 = df2.loc[:, engagement_columns]
df = df1.merge(df2, left_index=True, right_index=True)
df.head()

Unnamed: 0,title,parsed_body,kudos_received,solution_accepted,pg_vws,ev_clks,labels
3679256,Orchestrate DNA Lab Build Up,How we do this is take a backup prior to any D...,0,0,68,207,"[""Automation,Cisco Digital Network Architectur..."
3682246,Cisco DNA application system requirements,Hello You are talking about Cisco DNA Center ?...,16,16,253,773,"[""Cisco DNA Center"",""Cisco Digital Network Arc..."
3683177,c9300 DNA subscriptions,"Hi, My question is regarding Cisco Catalyst ...",16,0,477,1852,"[""Cisco DNA Center"",""Cisco Digital Network Arc..."
3687913,Cisco DNA Multiple Fabric Domains,Hi I have question about Multiple Fabric Doma...,0,12,142,426,"[""Cisco DNA Center"",""Cisco Digital Network Arc..."
3689146,CSCvi01378 workaround not working for DNAC,"hi there, the description says: During ini...",0,0,77,243,"[""Cisco DNA Center"",""Cisco Digital Network Arc..."


In [8]:
from bert_serving.client import BertClient
# Create the bert-as-service client used for the encoding calls.
# NOTE(review): with no arguments this presumably connects to an already
# running server at the library's default address -- confirm.
bc = BertClient()

In [11]:

# Encode titles and bodies with the BERT server and store each embedding as a
# plain Python list per row.
# NOTE(review): the recorded output contains a server warning mentioning
# "max_seq_len"; presumably some texts exceed the server's sequence limit --
# confirm.
encoded_titles = bc.encode(df['title'].tolist())
encoded_bodies = bc.encode(df['parsed_body'].tolist())
df['bert_encoded_title'] = encoded_titles.tolist()
df['bert_encoded_body'] = encoded_bodies.tolist()
df.head()


here is what you can do:
- or, start a new server with a larger "max_seq_len"
  '- or, start a new server with a larger "max_seq_len"' % self.length_limit)


Unnamed: 0,title,parsed_body,kudos_received,solution_accepted,pg_vws,ev_clks,labels,bert_encoded_title,bert_encoded_body
3679256,Orchestrate DNA Lab Build Up,How we do this is take a backup prior to any D...,0,0,68,207,"[""Automation,Cisco Digital Network Architectur...","[0.46215784549713135, -0.22273358702659607, -0...","[0.051789797842502594, -0.18707872927188873, 0..."
3682246,Cisco DNA application system requirements,Hello You are talking about Cisco DNA Center ?...,16,16,253,773,"[""Cisco DNA Center"",""Cisco Digital Network Arc...","[-0.0262641329318285, 0.08413064479827881, 0.3...","[0.178992360830307, -0.2956380844116211, 0.270..."
3683177,c9300 DNA subscriptions,"Hi, My question is regarding Cisco Catalyst ...",16,0,477,1852,"[""Cisco DNA Center"",""Cisco Digital Network Arc...","[-0.0857071578502655, -0.14488472044467926, 0....","[0.07088686525821686, -0.34869205951690674, 0...."
3687913,Cisco DNA Multiple Fabric Domains,Hi I have question about Multiple Fabric Doma...,0,12,142,426,"[""Cisco DNA Center"",""Cisco Digital Network Arc...","[0.3810253143310547, -0.39611005783081055, -0....","[0.05355852097272873, -0.5974192023277283, 0.5..."
3689146,CSCvi01378 workaround not working for DNAC,"hi there, the description says: During ini...",0,0,77,243,"[""Cisco DNA Center"",""Cisco Digital Network Arc...","[-0.19243969023227692, -0.41035279631614685, 0...","[-0.16016054153442383, -0.15357112884521484, 0..."


In [20]:
import numpy as np
# Engagement score per thread: kudos received plus accepted solutions.
kudos_acceptances = df['kudos_received'] + df['solution_accepted']
average_kudos_acceptances = np.mean(kudos_acceptances)
std_kudos_acceptances = np.std(kudos_acceptances)

# Threads scoring at or above the mean are labelled 1, all others 0.
# Labels are stored as integers (previously the strings '1'/'0') because they
# are later used as targets for a binary cross-entropy loss.
# A row whose score is NaN now gets 0; before, its label was left unset.
df['output_label'] = (kudos_acceptances >= average_kudos_acceptances).astype(int)

df.head()

Unnamed: 0,title,parsed_body,kudos_received,solution_accepted,pg_vws,ev_clks,labels,bert_encoded_title,bert_encoded_body,output_label
3679256,Orchestrate DNA Lab Build Up,How we do this is take a backup prior to any D...,0,0,68,207,"[""Automation,Cisco Digital Network Architectur...","[0.46215784549713135, -0.22273358702659607, -0...","[0.051789797842502594, -0.18707872927188873, 0...",0
3682246,Cisco DNA application system requirements,Hello You are talking about Cisco DNA Center ?...,16,16,253,773,"[""Cisco DNA Center"",""Cisco Digital Network Arc...","[-0.0262641329318285, 0.08413064479827881, 0.3...","[0.178992360830307, -0.2956380844116211, 0.270...",1
3683177,c9300 DNA subscriptions,"Hi, My question is regarding Cisco Catalyst ...",16,0,477,1852,"[""Cisco DNA Center"",""Cisco Digital Network Arc...","[-0.0857071578502655, -0.14488472044467926, 0....","[0.07088686525821686, -0.34869205951690674, 0....",1
3687913,Cisco DNA Multiple Fabric Domains,Hi I have question about Multiple Fabric Doma...,0,12,142,426,"[""Cisco DNA Center"",""Cisco Digital Network Arc...","[0.3810253143310547, -0.39611005783081055, -0....","[0.05355852097272873, -0.5974192023277283, 0.5...",1
3689146,CSCvi01378 workaround not working for DNAC,"hi there, the description says: During ini...",0,0,77,243,"[""Cisco DNA Center"",""Cisco Digital Network Arc...","[-0.19243969023227692, -0.41035279631614685, 0...","[-0.16016054153442383, -0.15357112884521484, 0...",0


In [21]:
# Count how many threads fall into each output label.
df["output_label"].value_counts()

0    78
1    44
Name: output_label, dtype: int64

In [30]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
import tensorflow as tf

def create_model(input_size, output_size):
    """Build and compile a small feed-forward binary classifier.

    The network is one hidden ``Dense`` layer of 128 ReLU units followed by a
    sigmoid output layer, compiled with binary cross-entropy loss, the Adam
    optimizer and the accuracy metric.

    Args:
        input_size: Number of input features per sample. This was previously
            ignored in favour of a hard-coded 768; it is now honoured.
        output_size: Number of sigmoid output units.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    model = Sequential()
    model.add(Dense(128, input_dim=input_size, activation='relu'))
    model.add(Dense(output_size, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model



In [31]:
# `sklearn.cross_validation` was removed in scikit-learn 0.20;
# `train_test_split` lives in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split

# Hold out 15% of the threads for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    df[['bert_encoded_title', 'bert_encoded_body']],
    df['output_label'],
    test_size=0.15,
    random_state=20,
)

In [33]:
# Average the title and body embeddings element-wise to get one feature
# vector per training thread.
train_title_vectors = np.array(X_train['bert_encoded_title'].tolist())
train_body_vectors = np.array(X_train['bert_encoded_body'].tolist())
combined_bert_encoding = np.mean(
    np.array([train_title_vectors, train_body_vectors]), axis=0
)

output_labels = Y_train.values

print(combined_bert_encoding.shape)
print(output_labels.shape)
from keras.callbacks import ModelCheckpoint
model = create_model(input_size=768, output_size=1)
# Save weights whenever validation accuracy reaches a new maximum.
filepath = "tabsa_weights-improvement-{epoch:02d}-{val_acc:.2f}.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_acc',
    verbose=1,
    save_best_only=True,
    mode='max',
)
callbacks_list = [checkpoint]
model.fit(
    combined_bert_encoding,
    output_labels,
    validation_split=0.15,
    epochs=100,
    shuffle=True,
    batch_size=64,
    callbacks=callbacks_list,
)

(103, 768)
(103,)
Train on 87 samples, validate on 16 samples
Epoch 1/100

Epoch 00001: val_acc improved from -inf to 0.62500, saving model to tabsa_weights-improvement-01-0.62.hdf5
Epoch 2/100

Epoch 00002: val_acc did not improve from 0.62500
Epoch 3/100

Epoch 00003: val_acc did not improve from 0.62500
Epoch 4/100

Epoch 00004: val_acc did not improve from 0.62500
Epoch 5/100

Epoch 00005: val_acc did not improve from 0.62500
Epoch 6/100

Epoch 00006: val_acc did not improve from 0.62500
Epoch 7/100

Epoch 00007: val_acc did not improve from 0.62500
Epoch 8/100

Epoch 00008: val_acc did not improve from 0.62500
Epoch 9/100

Epoch 00009: val_acc did not improve from 0.62500
Epoch 10/100

Epoch 00010: val_acc did not improve from 0.62500
Epoch 11/100

Epoch 00011: val_acc did not improve from 0.62500
Epoch 12/100

Epoch 00012: val_acc did not improve from 0.62500
Epoch 13/100

Epoch 00013: val_acc did not improve from 0.62500
Epoch 14/100

Epoch 00014: val_acc did not improve from 0.


Epoch 00045: val_acc improved from 0.62500 to 0.68750, saving model to tabsa_weights-improvement-45-0.69.hdf5
Epoch 46/100

Epoch 00046: val_acc did not improve from 0.68750
Epoch 47/100

Epoch 00047: val_acc did not improve from 0.68750
Epoch 48/100

Epoch 00048: val_acc did not improve from 0.68750
Epoch 49/100

Epoch 00049: val_acc did not improve from 0.68750
Epoch 50/100

Epoch 00050: val_acc did not improve from 0.68750
Epoch 51/100

Epoch 00051: val_acc did not improve from 0.68750
Epoch 52/100

Epoch 00052: val_acc did not improve from 0.68750
Epoch 53/100

Epoch 00053: val_acc did not improve from 0.68750
Epoch 54/100

Epoch 00054: val_acc did not improve from 0.68750
Epoch 55/100

Epoch 00055: val_acc did not improve from 0.68750
Epoch 56/100

Epoch 00056: val_acc did not improve from 0.68750
Epoch 57/100

Epoch 00057: val_acc did not improve from 0.68750
Epoch 58/100

Epoch 00058: val_acc did not improve from 0.68750
Epoch 59/100

Epoch 00059: val_acc did not improve from 0

<keras.callbacks.History at 0x7fead450e2e8>

In [34]:
# Build the held-out features the same way as the training features: the
# element-wise mean of the title and body embeddings.
test_title_vectors = np.array(X_test['bert_encoded_title'].tolist())
test_body_vectors = np.array(X_test['bert_encoded_body'].tolist())
combined_bert_encoding_test = np.mean(
    np.array([test_title_vectors, test_body_vectors]), axis=0
)
output_labels_test = Y_test.values
# Evaluate the model on the held-out threads.
results = model.evaluate(
    x=combined_bert_encoding_test,
    y=output_labels_test,
)



In [35]:
# Pair each metric name reported by the model with its evaluation value.
metrics = model.metrics_names
results_dict = {metric_name: value for metric_name, value in zip(metrics, results)}
print("Results:", results_dict)

Results: {'loss': 1.2244435548782349, 'acc': 0.5263158082962036}
