In [65]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

# Question Answering Analysis

## Content

### Load datasets

In [90]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import numpy as np
from pylab import rcParams


%matplotlib inline

In [91]:
sns.set(style="ticks")
sns.set_style("whitegrid")
rcParams['figure.dpi'] = 350
rcParams['lines.linewidth'] = 2
rcParams['axes.facecolor'] = 'white'
rcParams['patch.edgecolor'] = 'white'
rcParams['font.family'] = 'StixGeneral'
rcParams['figure.figsize'] = 15,10
rcParams['font.size'] = 20
rcParams['axes.labelsize'] = 'large'
rcParams['xtick.labelsize'] = 20
rcParams['ytick.labelsize'] = 20

In [92]:
import os
import sys
import gzip
import json
import nltk
from nltk import clean_html

In [93]:
from textblob import TextBlob, Word

In [94]:
import pandas as pd
import gzip

def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield eval(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')


In [95]:
application = getDF('/data/QA/Appliances.json.gz')

In [96]:
application.head()

Unnamed: 0,questionType,asin,answerTime,unixTime,question,answerType,answer
0,yes/no,B00004U9JP,"Jun 27, 2014",1403852000.0,I have a 9 year old Badger 1 that needs replac...,?,I replaced my old one with this without a hitch.
1,open-ended,B00004U9JP,"Apr 28, 2014",1398668000.0,model number,,This may help InSinkErator Model BADGER-1: Bad...
2,yes/no,B00004U9JP,"Aug 25, 2014",1408950000.0,can I replace Badger 1 1/3 with a Badger 5 1/2...,?,Plumbing connections will vary with different ...
3,yes/no,B00004U9JP,"Nov 3, 2014",1415002000.0,Does this come with power cord and dishwasher ...,?,It does not come with a power cord. It does co...
4,open-ended,B00004U9JP,"Jun 21, 2014",1403334000.0,loud noise inside when turned on. sounds like ...,,Check if you dropped something inside.Usually ...


# 2 TextBlob

In [97]:
import nltk
from textblob import TextBlob, Word

In [10]:
blob = TextBlob(application['question'][0])

blob

TextBlob("I have a 9 year old Badger 1 that needs replacing, will this Badger 1 install just like the original one?")

In [11]:
blob.noun_phrases
#??????

WordList(['badger', 'badger'])

In [12]:
blob.correct()
# 改错了

TextBlob("I have a 9 year old Danger 1 that needs replacing, will this Danger 1 install just like the original one?")

## Sentiment Analysis with TextBlob

#### #

In [13]:
from textblob.sentiments import NaiveBayesAnalyzer, PatternAnalyzer
from textblob import Blobber
tba = Blobber(analyzer=NaiveBayesAnalyzer())

In [14]:
print tba(application['question'][0]).sentiment

Sentiment(classification='neg', p_pos=0.2430232741273017, p_neg=0.7569767258726988)


In [15]:
blob.sentences 

[Sentence("I have a 9 year old Badger 1 that needs replacing, will this Badger 1 install just like the original one?")]

In [16]:
blob = TextBlob(application['question'][0], analyzer=PatternAnalyzer())
blob.sentences 

[Sentence("I have a 9 year old Badger 1 that needs replacing, will this Badger 1 install just like the original one?")]

In [17]:
#Polarity ranges from -1 to 1 (1 = positive sentiment). 
#Subjectivity how much opinion is expressed within a given sentence: 0 = objective 1 = subjective
blob.sentiment

Sentiment(polarity=0.2375, subjectivity=0.475)

#### #.

## 2.1 Analyze the questions

### 2.1.1 questionType

In [71]:
q_application = application[['questionType','asin','question']]

In [72]:
q_application.head()

Unnamed: 0,questionType,asin,question
0,yes/no,B00004U9JP,I have a 9 year old Badger 1 that needs replac...
1,open-ended,B00004U9JP,model number
2,yes/no,B00004U9JP,can I replace Badger 1 1/3 with a Badger 5 1/2...
3,yes/no,B00004U9JP,Does this come with power cord and dishwasher ...
4,open-ended,B00004U9JP,loud noise inside when turned on. sounds like ...


In [73]:
mapping = {"yes/no": 1, "open-ended": 0}
q_application['questionType'] = q_application['questionType'].replace(mapping).astype(np.int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [21]:
q_application.head()

Unnamed: 0,questionType,asin,question
0,1,B00004U9JP,I have a 9 year old Badger 1 that needs replac...
1,0,B00004U9JP,model number
2,1,B00004U9JP,can I replace Badger 1 1/3 with a Badger 5 1/2...
3,1,B00004U9JP,Does this come with power cord and dishwasher ...
4,0,B00004U9JP,loud noise inside when turned on. sounds like ...


#### train queationType classify

In [22]:
from textblob import TextBlob
from textblob.classifiers import NaiveBayesClassifier

In [23]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from textblob import TextBlob, Word
from nltk.stem.snowball import SnowballStemmer
from sklearn.externals import joblib



In [24]:
X_train = q_application.question.values
y_train = q_application.questionType.values
print X_train.shape
print y_train.shape

(9011,)
(9011,)


In [25]:
vect = CountVectorizer()
X_train, X_test, y_train, y_test = train_test_split(q_application.question.values, q_application.questionType.values, random_state=1, train_size=0.75)
train_dtm = vect.fit_transform(X_train)
print 'Features: ', train_dtm.shape[1]
test_dtm = vect.transform(X_test)
nb = MultinomialNB()
nb.fit(train_dtm, y_train)
y_pred_class = nb.predict(test_dtm)
print 'Accuracy: ', metrics.accuracy_score(y_test, y_pred_class)

Features:  7582
Accuracy:  0.7585441633377719


In [26]:
vect = CountVectorizer()
train_dtm = vect.fit_transform(X_train)
print train_dtm.shape

(6758, 7582)


In [27]:
transformer = TfidfTransformer()
train_tfidf = transformer.fit_transform(train_dtm)

In [28]:
nb = MultinomialNB().fit(train_tfidf, y_train)

In [29]:
# save the training model & vector
joblib.dump(nb, '/data/QA/model.pkl')
joblib.dump(vect, '/data/QA/count_vect')

['/data/QA/count_vect']

In [30]:
from nltk.stem import PorterStemmer
porter_stemmer = PorterStemmer()

In [31]:
from nltk.stem.snowball import SnowballStemmer
snowball_stemmer = SnowballStemmer('english')

In [32]:
lemmatizer = nltk.WordNetLemmatizer()

In [33]:
def spit_into_lemmas(text):
    text = unicode(text, 'utf-8').lower()
    words = TextBlob(text).words
    return [word.lemmatize() for word in words]

In [34]:
vect = CountVectorizer(analyzer=spit_into_lemmas)
train_dtm = vect.fit_transform(X_train)
print 'Features: ', train_dtm.shape[1]
test_dtm = vect.transform(X_test)
nb = MultinomialNB()
nb.fit(train_dtm, y_train)
y_pred_class = nb.predict(test_dtm)
print 'Accuracy: ', metrics.accuracy_score(y_test, y_pred_class)

Features:  7509
Accuracy:  0.7612072791833111


#### do a new prediction

In [35]:
clf = joblib.load('/data/QA/model.pkl')

In [36]:
count_vect = joblib.load('/data/QA/count_vect')

In [37]:
# X_train, X_test, y_train, y_test = train_test_split(q_application.question.values, q_application.questionType.values, random_state=1, train_size=0.75)

In [38]:
#testing_data = [q_application['question'][0]]
#testing_data = ['loud noise inside when turned on. sounds like']
#testing_data = ['can I replace Badger 1 1/3 with a Badger?']
testing_data = ['can I ask a question?']



In [39]:
tfidf_transformer = TfidfTransformer()

In [40]:
X_new_counts = count_vect.transform(testing_data)

In [41]:
X_new_tfidf = tfidf_transformer.fit_transform(X_new_counts)

In [42]:
prediction = clf.predict(X_new_tfidf)
print prediction

[0]


## train question-answer model

In [66]:
from textblob import TextBlob
from textblob.classifiers import NaiveBayesClassifier

In [67]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from textblob import TextBlob, Word
from nltk.stem.snowball import SnowballStemmer
from sklearn.externals import joblib

In [70]:
q_a = application[['questionType','asin','question','answer']]


Unnamed: 0,questionType,asin,question,answer
0,yes/no,B00004U9JP,I have a 9 year old Badger 1 that needs replac...,I replaced my old one with this without a hitch.
1,open-ended,B00004U9JP,model number,This may help InSinkErator Model BADGER-1: Bad...
2,yes/no,B00004U9JP,can I replace Badger 1 1/3 with a Badger 5 1/2...,Plumbing connections will vary with different ...
3,yes/no,B00004U9JP,Does this come with power cord and dishwasher ...,It does not come with a power cord. It does co...
4,open-ended,B00004U9JP,loud noise inside when turned on. sounds like ...,Check if you dropped something inside.Usually ...


In [74]:
mapping = {"yes/no": 1, "open-ended": 0}
q_a['questionType'] = q_a['questionType'].replace(mapping).astype(np.int)
q_a.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,questionType,asin,question,answer
0,1,B00004U9JP,I have a 9 year old Badger 1 that needs replac...,I replaced my old one with this without a hitch.
1,0,B00004U9JP,model number,This may help InSinkErator Model BADGER-1: Bad...
2,1,B00004U9JP,can I replace Badger 1 1/3 with a Badger 5 1/2...,Plumbing connections will vary with different ...
3,1,B00004U9JP,Does this come with power cord and dishwasher ...,It does not come with a power cord. It does co...
4,0,B00004U9JP,loud noise inside when turned on. sounds like ...,Check if you dropped something inside.Usually ...


In [76]:
X_train = q_a.question.values
y_train = q_a.answer.values
print X_train.shape
print y_train.shape

(9011,)
(9011,)


In [80]:
vect = CountVectorizer(ngram_range=(2,2),stop_words='english', max_features=10000)
X_train, X_test, y_train, y_test = train_test_split(q_a.question.values, q_a.answer.values, random_state=1, train_size=0.75)
train_dtm = vect.fit_transform(X_train)
print 'Features: ', train_dtm.shape[1]
test_dtm = vect.transform(X_test)
nb = MultinomialNB()
nb.fit(train_dtm, y_train)
y_pred_class = nb.predict(test_dtm)
print 'Accuracy: ', metrics.accuracy_score(y_test, y_pred_class)

Features:  10000
Accuracy:  0.029738126941855306


### 2.1.2 question similarity

In [54]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [56]:
test = raw_input("Hi, how can I help you?\n")

In [57]:
test

'how to use a washer?'

In [58]:
data = q_application.question.values
data=np.append(data,test)

In [59]:
vec = TfidfVectorizer()
X = vec.fit_transform(data)

In [60]:
S = cosine_similarity(X)

In [61]:
def select_k_closest(target, k, distance_matrix, data):
   
    res = list(np.argsort(distance_matrix[target])[-k-1:-1])
    res.reverse()   
    result = [ (S[target,i], data[i]) for i in res]
    
    return result

In [62]:
k = 4
target = len(data)-1
question = select_k_closest(target, k, S, data)
question

[(0.4531438669924055, 'how loud is this washer?'),
 (0.4530932875169346, 'how to remove front of washer to get to pump'),
 (0.44047534165057606, 'how much detergent do I use in the Danby 1.7 washer?'),
 (0.3896558331924539, 'How do you use the disposal')]

In [63]:
def getAnswer(question):    ##question is the result of select_k_closest
    answer_set = []
    for i in range(len(question)):
        answer_set.append(application[application['question']==question[i][1]])
    return answer_set[0]['answer']   ##return the first answer
#    return answer_set

In [64]:
getAnswer(question)

7935    compared to maytag I can barely tell the dishe...
Name: answer, dtype: object

In [83]:
import nltk

# Word2Vec

In [84]:
!pip install gensim

Collecting gensim
[?25l  Downloading https://files.pythonhosted.org/packages/1b/93/0978a08e622dda7620570450529b8e27ff5fbac41e55747e61f3e420e143/gensim-3.6.0-cp27-cp27m-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl (24.0MB)
[K    100% |████████████████████████████████| 24.0MB 505kB/s 
Collecting smart-open>=1.2.1 (from gensim)
  Downloading https://files.pythonhosted.org/packages/4b/1f/6f27e3682124de63ac97a0a5876da6186de6c19410feab66c1543afab055/smart_open-1.7.1.tar.gz
Collecting bz2file (from smart-open>=1.2.1->gensim)
  Downloading https://files.pythonhosted.org/packages/61/39/122222b5e85cd41c391b68a99ee296584b2a2d1d233e7ee32b4532384f2d/bz2file-0.98.tar.gz
Collecting boto3 (from smart-open>=1.2.1->gensim)
[?25l  Downloading https://files.pythonhosted.org/packages/87/06/b9e2f15abbffa4103c6b2bdaab89d6753240c4c1b25e20b6866a29d7fd26/boto3-1.9.20-py2.py3-none-any.whl (128kB)
[K    100% |████████████████████████████████| 133kB 229kB/s 

In [107]:
from gensim.models import Word2Vec

In [108]:
from nltk.corpus import stopwords
stopwords = stopwords.words('english')

In [130]:
stop_words = nltk.corpus.stopwords.words('english') + [
    '.',
    ',',
    '--',
    '\'s',
    '?',
    ')',
    '(',
    ':',
    '\'',
    '\'re',
    '"',
    '-',
    '}',
    '{',
    u'—',
    ]

In [100]:
from nltk.tokenize import RegexpTokenizer
#w means tokens are made of only alphanumeric characters where + indicates that they comprise of one or more of such characters
tokenizer = RegexpTokenizer('\w+')

## vectorize with answers

In [124]:
# delete number and symbol
import string
def CleanLines(text):
    cleanLine = []
    identify = string.maketrans('', '')
    delEStr = string.punctuation +string.digits
    
    for i in text:       
        lines = i.translate(identify,delEStr)
        cleanLine.append(lines) 
    return cleanLine
    

In [125]:
t_answer = CleanLines(application['answer'])

In [127]:
#add token in answer question
def getToken(text):
    token_as = []
    for i in text:
        tokens = tokenizer.tokenize(i)
        token_as.append(tokens)
    #return token_as
    return token_as

In [128]:
t_answer = getToken(t_answer)

In [134]:
#delete stop words and make lowercase ?? not lower
def cleanword(text):
    stop_w = []
    for e in text:
        content = [w.lower() for w in e if w.lower() not in stopwords]
        stop_w.append(content)
    return stop_w

In [135]:
t_answer = addstopword(t_answer)

https://blog.csdn.net/zl_best/article/details/53433072

In [137]:
model = Word2Vec(t_answer, sg=1, size=100,  window=5,  min_count=5,  negative=3, sample=0.001, hs=1, workers=5)

In [138]:
model.save('/data/QA/application_a.bin')

In [147]:
model = Word2Vec.load('/data/QA/application_a.bin')

In [153]:
model['computer'] 

  """Entry point for launching an IPython kernel.


array([-1.32331654e-01,  1.16727382e-01, -1.76607087e-01, -8.81521255e-02,
        1.65099557e-02, -7.12251365e-02, -3.92185338e-02, -1.38170570e-01,
       -6.64265640e-03,  3.13875824e-02,  1.07295485e-03,  5.05944528e-02,
        4.71987426e-02,  1.41505282e-02,  5.51960170e-02, -3.76999169e-03,
       -1.19049743e-01, -3.12420819e-02,  1.31650373e-01, -1.52442437e-02,
       -2.02968284e-01,  4.22006920e-02,  1.31987617e-03, -1.25849703e-02,
       -8.70161802e-02, -1.15419127e-01, -1.31180450e-01,  9.47831273e-02,
       -7.50080869e-02, -1.42581016e-01,  1.20463192e-01, -3.33953649e-02,
       -6.41307561e-04,  6.08950220e-02, -2.17302646e-02,  1.04295295e-02,
       -1.68902770e-01,  2.47116983e-02,  2.18189582e-02,  4.27780375e-02,
       -2.49504503e-02,  1.05322368e-01, -9.45535675e-03,  2.23006885e-02,
        1.50907412e-01, -4.63203676e-02,  9.93640721e-02, -6.51400685e-02,
       -8.89824852e-02, -4.44200635e-03, -8.46738890e-02,  5.49813434e-02,
        7.01826066e-03,  

In [154]:
model.similarity('computer', 'washer')

  """Entry point for launching an IPython kernel.


0.54940695

## vectorize questions

In [141]:
t_question = CleanLines(application['question'])

In [142]:
t_question = getToken(t_question)

In [143]:
t_question = addstopword(t_question)

In [144]:
model = Word2Vec(t_question, sg=1, size=100,  window=5,  min_count=5,  negative=3, sample=0.001, hs=1, workers=5)

In [145]:
model.save('/data/QA/application_q.bin')