In [65]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

# Question Answering Analysis

## Content

### Load datasets

In [229]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import numpy as np
from pylab import rcParams


%matplotlib inline

In [230]:
sns.set(style="ticks")
sns.set_style("whitegrid")
rcParams['figure.dpi'] = 350
rcParams['lines.linewidth'] = 2
rcParams['axes.facecolor'] = 'white'
rcParams['patch.edgecolor'] = 'white'
rcParams['font.family'] = 'StixGeneral'
rcParams['figure.figsize'] = 15,10
rcParams['font.size'] = 20
rcParams['axes.labelsize'] = 'large'
rcParams['xtick.labelsize'] = 20
rcParams['ytick.labelsize'] = 20

In [231]:
import os
import sys
import gzip
import json
import nltk
from nltk import clean_html

In [232]:
from textblob import TextBlob, Word

In [233]:
import pandas as pd
import gzip

def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield eval(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')


In [234]:
application = getDF('/data/QA/Appliances.json.gz')

In [15]:
application.head()

Unnamed: 0,questionType,asin,answerTime,unixTime,question,answerType,answer
0,yes/no,B00004U9JP,"Jun 27, 2014",1403852000.0,I have a 9 year old Badger 1 that needs replac...,?,I replaced my old one with this without a hitch.
1,open-ended,B00004U9JP,"Apr 28, 2014",1398668000.0,model number,,This may help InSinkErator Model BADGER-1: Bad...
2,yes/no,B00004U9JP,"Aug 25, 2014",1408950000.0,can I replace Badger 1 1/3 with a Badger 5 1/2...,?,Plumbing connections will vary with different ...
3,yes/no,B00004U9JP,"Nov 3, 2014",1415002000.0,Does this come with power cord and dishwasher ...,?,It does not come with a power cord. It does co...
4,open-ended,B00004U9JP,"Jun 21, 2014",1403334000.0,loud noise inside when turned on. sounds like ...,,Check if you dropped something inside.Usually ...


# 2 TextBlob

In [97]:
import nltk
from textblob import TextBlob, Word

In [10]:
blob = TextBlob(application['question'][0])

blob

TextBlob("I have a 9 year old Badger 1 that needs replacing, will this Badger 1 install just like the original one?")

In [11]:
blob.noun_phrases
#??????

WordList(['badger', 'badger'])

In [12]:
blob.correct()
# 改错了

TextBlob("I have a 9 year old Danger 1 that needs replacing, will this Danger 1 install just like the original one?")

## Sentiment Analysis with TextBlob

#### #

In [235]:
from textblob.sentiments import NaiveBayesAnalyzer, PatternAnalyzer
from textblob import Blobber
tba = Blobber(analyzer=NaiveBayesAnalyzer())

In [236]:
print tba(application['question'][0]).sentiment

Sentiment(classification='neg', p_pos=0.2430232741273017, p_neg=0.7569767258726988)


In [15]:
blob.sentences 

[Sentence("I have a 9 year old Badger 1 that needs replacing, will this Badger 1 install just like the original one?")]

In [16]:
blob = TextBlob(application['question'][0], analyzer=PatternAnalyzer())
blob.sentences 

[Sentence("I have a 9 year old Badger 1 that needs replacing, will this Badger 1 install just like the original one?")]

In [17]:
#Polarity ranges from -1 to 1 (1 = positive sentiment). 
#Subjectivity how much opinion is expressed within a given sentence: 0 = objective 1 = subjective
blob.sentiment

Sentiment(polarity=0.2375, subjectivity=0.475)

#### #.

## 2.1 Analyze the questions

### 2.1.1 questionType

In [226]:
q_application = application[['questionType','asin','question']]

NameError: name 'application' is not defined

In [72]:
q_application.head()

Unnamed: 0,questionType,asin,question
0,yes/no,B00004U9JP,I have a 9 year old Badger 1 that needs replac...
1,open-ended,B00004U9JP,model number
2,yes/no,B00004U9JP,can I replace Badger 1 1/3 with a Badger 5 1/2...
3,yes/no,B00004U9JP,Does this come with power cord and dishwasher ...
4,open-ended,B00004U9JP,loud noise inside when turned on. sounds like ...


In [73]:
mapping = {"yes/no": 1, "open-ended": 0}
q_application['questionType'] = q_application['questionType'].replace(mapping).astype(np.int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [21]:
q_application.head()

Unnamed: 0,questionType,asin,question
0,1,B00004U9JP,I have a 9 year old Badger 1 that needs replac...
1,0,B00004U9JP,model number
2,1,B00004U9JP,can I replace Badger 1 1/3 with a Badger 5 1/2...
3,1,B00004U9JP,Does this come with power cord and dishwasher ...
4,0,B00004U9JP,loud noise inside when turned on. sounds like ...


#### train queationType classify

In [244]:
from textblob import TextBlob
from textblob.classifiers import NaiveBayesClassifier

In [243]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from textblob import TextBlob, Word
from nltk.stem.snowball import SnowballStemmer
from sklearn.externals import joblib

In [24]:
X_train = q_application.question.values
y_train = q_application.questionType.values
print X_train.shape
print y_train.shape

(9011,)
(9011,)


In [25]:
vect = CountVectorizer()
X_train, X_test, y_train, y_test = train_test_split(q_application.question.values, q_application.questionType.values, random_state=1, train_size=0.75)
train_dtm = vect.fit_transform(X_train)
print 'Features: ', train_dtm.shape[1]
test_dtm = vect.transform(X_test)
nb = MultinomialNB()
nb.fit(train_dtm, y_train)
y_pred_class = nb.predict(test_dtm)
print 'Accuracy: ', metrics.accuracy_score(y_test, y_pred_class)

Features:  7582
Accuracy:  0.7585441633377719


In [26]:
vect = CountVectorizer()
train_dtm = vect.fit_transform(X_train)
print train_dtm.shape

(6758, 7582)


In [27]:
transformer = TfidfTransformer()
train_tfidf = transformer.fit_transform(train_dtm)

In [28]:
nb = MultinomialNB().fit(train_tfidf, y_train)

In [29]:
# save the training model & vector
joblib.dump(nb, '/data/QA/model.pkl')
joblib.dump(vect, '/data/QA/count_vect')

['/data/QA/count_vect']

In [30]:
from nltk.stem import PorterStemmer
porter_stemmer = PorterStemmer()

In [31]:
from nltk.stem.snowball import SnowballStemmer
snowball_stemmer = SnowballStemmer('english')

In [32]:
lemmatizer = nltk.WordNetLemmatizer()

In [33]:
def spit_into_lemmas(text):
    text = unicode(text, 'utf-8').lower()
    words = TextBlob(text).words
    return [word.lemmatize() for word in words]

In [34]:
vect = CountVectorizer(analyzer=spit_into_lemmas)
train_dtm = vect.fit_transform(X_train)
print 'Features: ', train_dtm.shape[1]
test_dtm = vect.transform(X_test)
nb = MultinomialNB()
nb.fit(train_dtm, y_train)
y_pred_class = nb.predict(test_dtm)
print 'Accuracy: ', metrics.accuracy_score(y_test, y_pred_class)

Features:  7509
Accuracy:  0.7612072791833111


#### do a new prediction

In [35]:
clf = joblib.load('/data/QA/model.pkl')

In [36]:
count_vect = joblib.load('/data/QA/count_vect')

In [37]:
# X_train, X_test, y_train, y_test = train_test_split(q_application.question.values, q_application.questionType.values, random_state=1, train_size=0.75)

In [38]:
#testing_data = [q_application['question'][0]]
#testing_data = ['loud noise inside when turned on. sounds like']
#testing_data = ['can I replace Badger 1 1/3 with a Badger?']
testing_data = ['can I ask a question?']



In [39]:
tfidf_transformer = TfidfTransformer()

In [40]:
X_new_counts = count_vect.transform(testing_data)

In [41]:
X_new_tfidf = tfidf_transformer.fit_transform(X_new_counts)

In [42]:
prediction = clf.predict(X_new_tfidf)
print prediction

[0]


## train question-answer model

In [241]:
from textblob import TextBlob
from textblob.classifiers import NaiveBayesClassifier

In [242]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from textblob import TextBlob, Word
from nltk.stem.snowball import SnowballStemmer
from sklearn.externals import joblib



In [70]:
q_a = application[['questionType','asin','question','answer']]


Unnamed: 0,questionType,asin,question,answer
0,yes/no,B00004U9JP,I have a 9 year old Badger 1 that needs replac...,I replaced my old one with this without a hitch.
1,open-ended,B00004U9JP,model number,This may help InSinkErator Model BADGER-1: Bad...
2,yes/no,B00004U9JP,can I replace Badger 1 1/3 with a Badger 5 1/2...,Plumbing connections will vary with different ...
3,yes/no,B00004U9JP,Does this come with power cord and dishwasher ...,It does not come with a power cord. It does co...
4,open-ended,B00004U9JP,loud noise inside when turned on. sounds like ...,Check if you dropped something inside.Usually ...


In [74]:
mapping = {"yes/no": 1, "open-ended": 0}
q_a['questionType'] = q_a['questionType'].replace(mapping).astype(np.int)
q_a.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,questionType,asin,question,answer
0,1,B00004U9JP,I have a 9 year old Badger 1 that needs replac...,I replaced my old one with this without a hitch.
1,0,B00004U9JP,model number,This may help InSinkErator Model BADGER-1: Bad...
2,1,B00004U9JP,can I replace Badger 1 1/3 with a Badger 5 1/2...,Plumbing connections will vary with different ...
3,1,B00004U9JP,Does this come with power cord and dishwasher ...,It does not come with a power cord. It does co...
4,0,B00004U9JP,loud noise inside when turned on. sounds like ...,Check if you dropped something inside.Usually ...


In [76]:
X_train = q_a.question.values
y_train = q_a.answer.values
print X_train.shape
print y_train.shape

(9011,)
(9011,)


In [80]:
vect = CountVectorizer(ngram_range=(2,2),stop_words='english', max_features=10000)
X_train, X_test, y_train, y_test = train_test_split(q_a.question.values, q_a.answer.values, random_state=1, train_size=0.75)
train_dtm = vect.fit_transform(X_train)
print 'Features: ', train_dtm.shape[1]
test_dtm = vect.transform(X_test)
nb = MultinomialNB()
nb.fit(train_dtm, y_train)
y_pred_class = nb.predict(test_dtm)
print 'Accuracy: ', metrics.accuracy_score(y_test, y_pred_class)

Features:  10000
Accuracy:  0.029738126941855306


### 2.1.2 question similarity

In [54]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [1]:
test = raw_input("Hi, how can I help you?\n")

In [2]:
test

'how to use the washer?'

In [58]:
data = q_application.question.values
data=np.append(data,test)

In [59]:
vec = TfidfVectorizer()
X = vec.fit_transform(data)

In [60]:
S = cosine_similarity(X)

In [61]:
def select_k_closest(target, k, distance_matrix, data):
   
    res = list(np.argsort(distance_matrix[target])[-k-1:-1])
    res.reverse()   
    result = [ (S[target,i], data[i]) for i in res]
    
    return result

In [62]:
k = 4
target = len(data)-1
question = select_k_closest(target, k, S, data)
question

[(0.4531438669924055, 'how loud is this washer?'),
 (0.4530932875169346, 'how to remove front of washer to get to pump'),
 (0.44047534165057606, 'how much detergent do I use in the Danby 1.7 washer?'),
 (0.3896558331924539, 'How do you use the disposal')]

In [63]:
def getAnswer(question):    ##question is the result of select_k_closest
    answer_set = []
    for i in range(len(question)):
        answer_set.append(application[application['question']==question[i][1]])
    return answer_set[0]['answer']   ##return the first answer
#    return answer_set

In [64]:
getAnswer(question)

7935    compared to maytag I can barely tell the dishe...
Name: answer, dtype: object

In [83]:
import nltk

# Word2Vec

In [84]:
!pip install gensim

Collecting gensim
[?25l  Downloading https://files.pythonhosted.org/packages/1b/93/0978a08e622dda7620570450529b8e27ff5fbac41e55747e61f3e420e143/gensim-3.6.0-cp27-cp27m-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl (24.0MB)
[K    100% |████████████████████████████████| 24.0MB 505kB/s 
Collecting smart-open>=1.2.1 (from gensim)
  Downloading https://files.pythonhosted.org/packages/4b/1f/6f27e3682124de63ac97a0a5876da6186de6c19410feab66c1543afab055/smart_open-1.7.1.tar.gz
Collecting bz2file (from smart-open>=1.2.1->gensim)
  Downloading https://files.pythonhosted.org/packages/61/39/122222b5e85cd41c391b68a99ee296584b2a2d1d233e7ee32b4532384f2d/bz2file-0.98.tar.gz
Collecting boto3 (from smart-open>=1.2.1->gensim)
[?25l  Downloading https://files.pythonhosted.org/packages/87/06/b9e2f15abbffa4103c6b2bdaab89d6753240c4c1b25e20b6866a29d7fd26/boto3-1.9.20-py2.py3-none-any.whl (128kB)
[K    100% |████████████████████████████████| 133kB 229kB/s 

In [127]:
from gensim.models import Word2Vec

In [17]:
from nltk.corpus import stopwords
stopwords = stopwords.words('english')

In [18]:
stop_words = nltk.corpus.stopwords.words('english') + [
    '.',
    ',',
    '--',
    '\'s',
    '?',
    ')',
    '(',
    ':',
    '\'',
    '\'re',
    '"',
    '-',
    '}',
    '{',
    u'—',
    ]

英文预处理

https://blog.csdn.net/caicai1617/article/details/21690911

## vectorize with answers

In [246]:
from nltk.tokenize import RegexpTokenizer
#w means tokens are made of only alphanumeric characters where + indicates that they comprise of one or more of such characters
tokenizer = RegexpTokenizer('\w+')

In [237]:
# delete number and symbol
import string
def CleanLines(text):
    cleanLine = []
    identify = string.maketrans('', '')
    delEStr = string.punctuation +string.digits
    
    for i in text:       
        lines = i.translate(identify,delEStr)
        cleanLine.append(lines) 
    return cleanLine
    

In [238]:
t_answer = CleanLines(application['answer'])

In [239]:
#add token in answer question
def getToken(text):
    token_as = []
    for i in text:
        tokens = tokenizer.tokenize(i)
        token_as.append(tokens)
    #return token_as
    return token_as

In [247]:
t_answer = getToken(t_answer)

In [28]:
#delete stop words and make lowercase ?? not lower
def cleanword(text):
    stop_w = []
    for e in text:
        content = [w.lower() for w in e if w.lower() not in stopwords]
        stop_w.append(content)
    return stop_w

In [29]:
t_answer = cleanword(t_answer)

https://blog.csdn.net/zl_best/article/details/53433072

Word2Vec参数说明：https://blog.csdn.net/szlcw1/article/details/52751314

In [40]:
model = Word2Vec(t_answer, sg=1, size=100,  window=5,  min_count=5,  negative=3, sample=0.001, hs=1, workers=5)
model.save('/data/QA/application_a.bin')

In [42]:
model = Word2Vec.load('/data/QA/application_a.bin')

In [43]:
model['computer'] 

  """Entry point for launching an IPython kernel.


array([-0.02150281, -0.00266735, -0.13015597,  0.07824621, -0.06933703,
       -0.08956843,  0.00779961, -0.12313762,  0.14651246, -0.00365247,
       -0.12158015, -0.06919064, -0.03216884,  0.01797071, -0.00170094,
        0.0574705 , -0.12896118, -0.01069618,  0.04712673, -0.25119805,
       -0.15948378, -0.00061552,  0.10650717,  0.09076841, -0.09648447,
       -0.08760342, -0.0484775 , -0.07376955, -0.05841612, -0.0598413 ,
       -0.08822629,  0.07127231, -0.07263125, -0.05387486, -0.06742014,
        0.13302256, -0.15907906,  0.00161945,  0.05186029,  0.01987283,
       -0.03993224,  0.00598737, -0.10475038,  0.12639731,  0.09150562,
       -0.19552936,  0.08593322,  0.06026267,  0.04370295,  0.13887832,
       -0.01136994, -0.05982323, -0.152624  ,  0.00375134, -0.08138216,
        0.01066376, -0.05831333, -0.1850282 , -0.0952899 ,  0.03068771,
       -0.09719677,  0.02842921,  0.07285914,  0.0468903 ,  0.03119747,
       -0.09669027,  0.09115966,  0.06533805,  0.1598633 ,  0.01

In [156]:
type(model)

gensim.models.word2vec.Word2Vec

In [44]:
model.most_similar('computer')

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('patio', 0.8586839437484741),
 ('rv', 0.8438653349876404),
 ('switched', 0.8431739807128906),
 ('sat', 0.8376901149749756),
 ('stud', 0.8371859788894653),
 ('Love', 0.8371580839157104),
 ('expecting', 0.834181547164917),
 ('eyes', 0.8295118808746338),
 ('initially', 0.8266298770904541),
 ('meat', 0.8266057372093201)]

## vectorize questions

In [141]:
t_question = CleanLines(application['question'])

In [142]:
t_question = getToken(t_question)

In [143]:
t_question = addstopword(t_question)

In [144]:
model = Word2Vec(t_question, sg=1, size=100,  window=5,  min_count=5,  negative=3, sample=0.001, hs=1, workers=5)

In [145]:
model.save('/data/QA/application_q.bin')

相似度应用
https://blog.csdn.net/u014595019/article/details/52218249

# TFlearn

In [162]:
!pip install tflearn
!pip install --upgrade tensorflow
!pip install protobuf

[33mYou are using pip version 18.0, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
Requirement already up-to-date: tensorflow in /anaconda2/lib/python2.7/site-packages (1.11.0)
[33mYou are using pip version 18.0, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[33mYou are using pip version 18.0, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [45]:
import numpy as np
from matplotlib import pyplot as plt
import tensorflow as tf
import tflearn

In [32]:
input_data = tflearn.input_data(shape=[None, model.max_seq_len*2, self.word_vec_dim], dtype=tf.float32, name = "XY")

AttributeError: 'Word2Vec' object has no attribute 'max_seq_len'

In [50]:
nIn = 100
nHidden = 25
nOut = 200
alpha = 0.2

batchSize = 64


In [53]:
input_layer = tflearn.input_data(shape=[None,nIn])
layer2 = tflearn.fully_connected(input_layer, nHidden, activation = 'sigmoid')
out = tflearn.fully_connected(layer2, nOut, activation = 'softmax')

network = tflearn.regression(out, optimizer='adam', loss='categorical_crossentropy', learning_rate = alpha, batch_size=batchSize)

model = tflearn.RNN(network)

In [None]:
model.fit(trainX, trainY, n_epoch=nEpochs, show_metric=True)

In [None]:
print("Final Accuracy:", model.evaluate(testX, testY))

# Question and Answering

In [29]:
# things we need for NLP
import nltk
from nltk.stem.lancaster import LancasterStemmer
stemmer = LancasterStemmer()

# things we need for Tensorflow
import numpy as np
import tflearn
import tensorflow as tf
import random
from gensim.models import Word2Vec

In [30]:
# import our intents file
import json

def getdata(path):
    g = open(path,'r')
    for l in g:
        yield eval(l)


intents = list(getdata('/data/QA/QA_Video_Games.json'))


In [31]:
stop_words = nltk.corpus.stopwords.words('english') + [
    '.',
    ',',
    '--',
    '\'s',
    '?',
    ')',
    '(',
    ':',
    '\'',
    '\'re',
    '"',
    '-',
    '}',
    '{',
    u'—',
    '',
    ]

In [32]:
import string
import unicodedata
def CleanLines(text):
    cleanLine = []
    identify = string.maketrans('', '')
    delEStr = string.punctuation +string.digits
    for i in text:         
        i = i.encode("utf-8")
        lines = i.translate(identify,delEStr)
        cleanLine.append(lines) 
    return cleanLine

In [18]:
def uni(text):
    uni_text = []
    for i in text:
        i = unicode(i, "utf-8")
        uni_text.append(i)
    return uni_text

In [19]:
def deuni(text):
    deuni_text = []
    for i in text:
        i = i.encode("utf-8")
        deuni_text.append(i)
    return deuni_text


In [20]:
# delete long words
def long_word_filter(words):
    word_list = []
    for i in words:
        if len(i)<15:
            word_list.append(i)
        
    return word_list

In [21]:
words = []
classes = []
documents = []
answers = []

# loop through each question in our intents
for intent in intents:
    for question in intent['questions']:
        # tokenize each word in the sentence
        #text = CleanLines(question['questionText'])
        w = nltk.word_tokenize(question['questionText'])
        w = CleanLines(w)
        w = uni(w)
        # add to our words list
        words.extend(w)
        documents.append((w, intent['asin']))
        # add to our classes list
        if intent['asin'] not in classes:
            classes.append(intent['asin'])
        
        for answer in question['answers']:
            w2 = nltk.word_tokenize(answer['answerText'])
            w2 = CleanLines(w2)
            w2 = uni(w2)
            words.extend(w2)
            answers.append((w,w2))

# stem and lower each word and remove duplicates
words = [stemmer.stem(w.lower()) for w in words if w not in stop_words]
words = sorted(list(set(words)))
words = deuni(words)
words = long_word_filter(words)

# remove duplicates
classes = sorted(list(set(classes)))

print (len(documents), "documents")
print (len(classes), "classes")
print (len(words), "unique stemmed words")
print (len(answers), "answers")
        

(7744, 'documents')
(1183, 'classes')
(13772, 'unique stemmed words')
(28893, 'answers')


In [269]:
# create our training data
training = []
output = []

# create an empty array for our output
output_empty = [0] * len(classes)

# training set, bag of words for each sentence
for doc in documents:
    # initialize our bag of words
    bag = []
    # list of tokenized words for the pattern
    question_words = doc[0]
    # stem each word
    question_words = [stemmer.stem(word.lower()) for word in question_words]
    
    # create our bag of words array
    for w in words:
        bag.append(1) if w in question_words else bag.append(0)
    
    # output is a '0' for each tag and '1' for current tag
    output_row = list(output_empty)
    output_row[classes.index(doc[1])] = 1
    
    training.append([bag, output_row])

# shuffle our features and turn into np.array
random.shuffle(training)
training = np.array(training)

# shuffle our features and turn into np.array
random.shuffle(training)
training = np.array(training)

# create train and test lists
train_x = list(training[:,0])
train_y = list(training[:,1])



In [266]:
len(training)

7744

In [33]:
model = Word2Vec(words, sg=1, size=100,  window=5,  min_count=5,  negative=3, sample=0.001, hs=1, workers=5)

KeyboardInterrupt: 

In [271]:
model.save('/data/QA/Game.bin')

In [34]:
model_w2v = Word2Vec.load('/data/QA/Game.bin')

In [35]:
type(model_w2v)

gensim.models.word2vec.Word2Vec

In [36]:
import sys
import math
import tflearn
import tensorflow as tf
from tensorflow.python.ops import rnn_cell
from tensorflow.python.ops import rnn
import chardet
import numpy as np
import struct

In [37]:
question_seqs = []
answer_seqs = []

In [38]:
max_w = 50
float_size = 4
word_vector_dict = {}
word_vec_dim = 100
max_seq_len = 8
word_set = {}

In [39]:
input_file = open('/data/QA/Game.bin', "rb")

In [40]:
words_and_size = input_file.readline()
words_and_size[0]

'\x80'

In [41]:
words_and_size.strip()

'\x80\x02cgensim.models.word2vec'

In [42]:
words = long(words_and_size.split(' ')[0])
words

ValueError: invalid literal for long() with base 10: '\x80\x02cgensim.models.word2vec\n'

In [300]:
model_w2v['a']

  """Entry point for launching an IPython kernel.


array([ 0.0277258 , -0.15726964,  0.23922774, -0.01904267, -0.10565194,
        0.07390157,  0.04503018,  0.01647286,  0.28594834,  0.2821932 ,
       -0.1446266 , -0.02573078, -0.13981101, -0.00779268,  0.00611291,
        0.24701291,  0.04923122,  0.11865059,  0.01996559,  0.05199368,
        0.10256855,  0.12916183,  0.08025157,  0.06846515, -0.13114893,
        0.015908  ,  0.1035057 ,  0.04631276, -0.03442198,  0.15926026,
       -0.13928509, -0.13234141,  0.07017811,  0.09472259, -0.20627746,
       -0.00197215, -0.06064624, -0.03610892,  0.20154928,  0.08365507,
       -0.04813718, -0.06157881, -0.09222899,  0.1056164 , -0.0089402 ,
       -0.08241719, -0.10129223,  0.10501397, -0.13239072, -0.00579549,
       -0.11169685, -0.16966498, -0.04024898, -0.07752605,  0.03528985,
       -0.07535794,  0.18826069, -0.04408138, -0.05332544, -0.07716881,
       -0.00373244, -0.03787516, -0.07968268, -0.16284847, -0.03662417,
       -0.08013629, -0.03653821,  0.08343095,  0.07536754, -0.02

### write question answer pair & Word2Vector

In [107]:
import sys
#reload(sys)
#sys.setdefaultencoding( "utf-8" )

In [43]:
fo = open("/data/QA/QA_pair.txt", "w")

In [44]:
questions = []
for i in range(len(answers)):
    question = deuni(answers[i][0])
    questions.append(question) 



In [45]:
answers1 =[]
for i in range(len(answers)):
    answer = deuni(answers[i][1])
    answers1.append(answer)


In [46]:
# this is for vectorize
words = []
for i in range(len(answers)):
    question = deuni(answers[i][0])
    answer = deuni(answers[i][1])
    words.append(question)
    words.append(answer)



In [47]:
# Word2Vector
model = Word2Vec(words, sg=1, size=100,  window=5,  min_count=5,  negative=3, sample=0.001, hs=1, workers=5)
model.save('/data/QA/Game_words.bin')

In [48]:
for i in range(len(questions)):
    for w_q in questions[i]:
        w_q = w_q.lower()
        fo.write( w_q+' ')
        
    fo.write('|')
    
    for w_a in answers1[i]:
        w_a = w_a.lower()
        fo.write(w_a+' ')
    
    fo.write('\n')
            
fo.close()
    

In [49]:
fo = open("/data/QA/QA_pair.txt", "r")
print  len(fo.readlines())

28893


In [27]:
type(model_w2v)

NameError: name 'model_w2v' is not defined

In [131]:
dir(model_w2v)

['__class__',
 '__contains__',
 '__delattr__',
 '__dict__',
 '__doc__',
 '__format__',
 '__getattribute__',
 '__getitem__',
 '__hash__',
 '__ignoreds',
 '__init__',
 '__module__',
 '__new__',
 '__numpys',
 '__recursive_saveloads',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__scipys',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_adapt_by_suffix',
 '_check_input_data_sanity',
 '_check_training_sanity',
 '_clear_post_train',
 '_do_train_epoch',
 '_do_train_job',
 '_get_job_params',
 '_get_thread_working_mem',
 '_job_producer',
 '_load_specials',
 '_log_epoch_end',
 '_log_epoch_progress',
 '_log_progress',
 '_log_train_end',
 '_minimize_model',
 '_raw_word_count',
 '_save_specials',
 '_set_train_params',
 '_smart_save',
 '_train_epoch',
 '_train_epoch_corpusfile',
 '_update_job_params',
 '_worker_loop',
 '_worker_loop_corpusfile',
 'accuracy',
 'alpha',
 'batch_words',
 'build_vocab',
 'build_vocab_from_freq',
 'callbacks',
 'cbow_mean',
 'clear_s

In [50]:
#load vector
model_w2v = Word2Vec.load('/data/QA/Game_words.bin')
word_vector=model_w2v.wv
#word_vocab = model_w2v.vocabulary
word_vector_dict = word_vector.vocab

In [51]:
word_vector_dict.has_key('hello')

True

In [282]:
word_vector['hello']

array([-0.01847144,  0.2502406 ,  0.15627031,  0.09494841,  0.23786339,
       -0.05098917,  0.03672797,  0.10033745, -0.12306379,  0.40702227,
        0.06122517,  0.1543013 , -0.08173149, -0.5054856 , -0.30113924,
       -0.33783647,  0.47734186, -0.17596556,  0.00078394, -0.14734691,
        0.19012609,  0.14645573, -0.04547663, -0.20754956, -0.08232286,
        0.33851036,  0.2276305 , -0.15445256, -0.6604043 , -0.10005897,
       -0.35435748, -0.31129393,  0.33122015,  0.04844597, -0.31885263,
        0.46004468, -0.3187674 ,  0.14773747, -0.30041856, -0.10771601,
        0.02037544, -0.05955373, -0.01610406, -0.42252117, -0.45313025,
       -0.34191176, -0.33028093, -0.08195064, -0.27875954, -0.31928095,
       -0.07102174,  0.22368523,  0.06321266,  0.07697222,  0.29968724,
        0.21402255,  0.2636317 ,  0.14932515, -0.25810167, -0.06156397,
        0.02456847,  0.42668632,  0.28095636,  0.08311479,  0.00606459,
        0.52734405,  0.4133043 , -0.021176  ,  0.52952296, -0.37

In [52]:
def init_seq(input_file):
    """读取切好词的文本文件，加载全部词序列
    """
    file_object = open(input_file, 'r')
    vocab_dict = {}
    while True:
        question_seq = []
        answer_seq = []
        line = file_object.readline()
        if line:
            line_pair = line.split('|')
            line_question = line_pair[0]
            line_answer = line_pair[1]
            for word in line_question.split(' '):
                if word_vector_dict.has_key(word):
                    question_seq.append(word_vector[word])
            for word in line_answer.decode('utf-8').split(' '):
                if word_vector_dict.has_key(word):
                    answer_seq.append(word_vector[word])
        else:
            break
        question_seqs.append(question_seq)
        answer_seqs.append(answer_seq)
    file_object.close()

In [53]:
import sys
import math
import tflearn
import tensorflow as tf
from tensorflow.python.ops import rnn_cell
from tensorflow.python.ops import rnn
import chardet
import numpy as np
import struct

In [54]:
# get the vectorized question & answer 
question_seqs=[]
answer_seqs=[]
init_seq('/data/QA/QA_pair.txt')

In [55]:
def get_max_seq_len(seqs):
    seq_lens = []
    for seq in seqs:
        seq_lens.append(len(seq))
        seq_lens.sort(reverse=True)
    return seq_lens[0]


In [56]:
max_q_seq = get_max_seq_len(question_seqs)
max_a_seq = get_max_seq_len(answer_seqs)
print max_q_seq,max_a_seq

556 1928


In [278]:
max_seq_len = max_q_seq + max_a_seq
max_seq_len

2484

In [57]:
class MySeq2Seq(object):
    def __init__(self, max_seq_len = 164, word_vec_dim = 100, input_file='/data/QA/QA_pair.txt'):
        self.max_seq_len = max_seq_len
        self.word_vec_dim = word_vec_dim
        self.input_file = input_file
    
    def generate_trainig_data(self):
 #       load_word_set()
 #       load_vectors("/data/QA/Game_words.bin")
        init_seq(self.input_file)
        xy_data = []
        y_data = []
        for i in range(len(question_seqs)):
        #for i in range(100):
            question_seq = question_seqs[i]
            answer_seq = answer_seqs[i]
            if len(question_seq) < self.max_seq_len and len(answer_seq) < self.max_seq_len:
                #多余的位设为0，与question的reverse合并，为什么要reverse？ * - repeat ；+ - 合并
                sequence_ry = [np.zeros(self.word_vec_dim)] * (self.max_seq_len-len(question_seq)) + list(reversed(question_seq))
                #多余的位设为0， 与answer合并
                sequence_y = answer_seq + [np.zeros(self.word_vec_dim)] * (self.max_seq_len-len(answer_seq))
                #合并
                sequence_xy = sequence_ry + sequence_y
                sequence_y = [np.ones(self.word_vec_dim)] + sequence_y
                xy_data.append(sequence_xy)
                y_data.append(sequence_y)

                #print "right answer"
                #for w in answer_seq:
                #    (match_word, max_cos) = vector2word(w)
                #    if len(match_word)>0:
                #        print match_word, vector_sqrtlen(w)

        return np.array(xy_data), np.array(y_data)
    
    
    def model(self, feed_previous=False):
        # 通过输入的XY生成encoder_inputs和带GO头的decoder_inputs
        input_data = tflearn.input_data(shape=[None, self.max_seq_len*2, self.word_vec_dim], dtype=tf.float32, name = "XY")
        encoder_inputs = tf.slice(input_data, [0, 0, 0], [-1, self.max_seq_len, self.word_vec_dim], name="enc_in")
        decoder_inputs_tmp = tf.slice(input_data, [0, self.max_seq_len, 0], [-1, self.max_seq_len-1, self.word_vec_dim], name="dec_in_tmp")
        go_inputs = tf.ones_like(decoder_inputs_tmp)
        go_inputs = tf.slice(go_inputs, [0, 0, 0], [-1, 1, self.word_vec_dim])
        decoder_inputs = tf.concat(1, [go_inputs, decoder_inputs_tmp], name="dec_in")

        # 编码器
        # 把encoder_inputs交给编码器，返回一个输出(预测序列的第一个值)和一个状态(传给解码器)
        (encoder_output_tensor, states) = tflearn.lstm(encoder_inputs, self.word_vec_dim, return_state=True, scope='encoder_lstm')
        encoder_output_sequence = tf.pack([encoder_output_tensor], axis=1)

        # 解码器
        # 预测过程用前一个时间序的输出作为下一个时间序的输入
        # 先用编码器的最后一个输出作为第一个输入
        if feed_previous:
            first_dec_input = go_inputs
        else:
            first_dec_input = tf.slice(decoder_inputs, [0, 0, 0], [-1, 1, self.word_vec_dim])
        decoder_output_tensor = tflearn.lstm(first_dec_input, self.word_vec_dim, initial_state=states, return_seq=False, reuse=False, scope='decoder_lstm')
        decoder_output_sequence_single = tf.pack([decoder_output_tensor], axis=1)
        decoder_output_sequence_list = [decoder_output_tensor]
        # 再用解码器的输出作为下一个时序的输入
        for i in range(self.max_seq_len-1):
            if feed_previous:
                next_dec_input = decoder_output_sequence_single
            else:
                next_dec_input = tf.slice(decoder_inputs, [0, i+1, 0], [-1, 1, self.word_vec_dim])
            decoder_output_tensor = tflearn.lstm(next_dec_input, self.word_vec_dim, return_seq=False, reuse=True, scope='decoder_lstm')
            decoder_output_sequence_single = tf.pack([decoder_output_tensor], axis=1)
            decoder_output_sequence_list.append(decoder_output_tensor)

        decoder_output_sequence = tf.pack(decoder_output_sequence_list, axis=1)
        real_output_sequence = tf.concat(1, [encoder_output_sequence, decoder_output_sequence])

        net = tflearn.regression(real_output_sequence, optimizer='sgd', learning_rate=0.1, loss='mean_square')
        model = tflearn.DNN(net)
        return model
    
    
    def train(self):
        trainXY, trainY = self.generate_trainig_data()
        model = self.model(feed_previous=False)
        model.fit(trainXY, trainY, n_epoch=1000, snapshot_epoch=False, batch_size=1)
        model.save('/data/QA/model_tensorflow')
        return model
    
    def load(self):
        model = self.model(feed_previous=True)
        model.load('/data/QA/model_tensorflow')
        return model

In [58]:
test_model = MySeq2Seq( max_seq_len = 164, word_vec_dim = 100, input_file='/data/QA/QA_pair.txt')

In [None]:
test_model.train()

In [None]:
test_model

## another one

In [None]:
class MyLSTM(object):
    def __init__(self):
        self.max_abs_weight = 32  # 最大权重绝对值，用来对词向量做正规化
        self.max_seq_len = 1928  # 最大句子长度(词)
        self.word_vec_dim = 100  # 词向量维度，读vectors.bin二进制时动态确定
        self.epoch = 1000
        self.word_vector_dict = {}  # 词向量词典，加载vectors.bin读入
        self.one_hot_word_vector_dict = {}  # 根据样本词汇生成的softmax用的词向量
        self.word_id_word_dict = {}
        self.one_hot_word_vectors_dim = 1  # softmax用的词向量维度，从1开始，保留0作为EOS的word_id
        self.eos_word_id = 0
        self.eos_word = 'EOS'
        self.vectors_bin_file = './vectors.bin'  # 词向量二进制
        self.model_dir = './model/model'  # 模型文件路径
        self.n_hidden = 1000  # lstm隐藏状态单元数目
        self.learning_rate = 0.01  # 学习率
    