In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

# Question Answering Analysis

## Content

### Load datasets

In [184]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import numpy as np
from pylab import rcParams

%matplotlib inline

In [185]:
sns.set(style="ticks")
sns.set_style("whitegrid")
rcParams['figure.dpi'] = 350
rcParams['lines.linewidth'] = 2
rcParams['axes.facecolor'] = 'white'
rcParams['patch.edgecolor'] = 'white'
rcParams['font.family'] = 'StixGeneral'
rcParams['figure.figsize'] = 15,10
rcParams['font.size'] = 20
rcParams['axes.labelsize'] = 'large'
rcParams['xtick.labelsize'] = 20
rcParams['ytick.labelsize'] = 20

In [186]:
import os
import sys
import gzip
import json
import nltk
from nltk import clean_html
from urllib.request import urlopen

In [187]:
from textblob import TextBlob, Word

In [188]:
import pandas as pd
import gzip

def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield eval(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

In [189]:
application = getDF('../Question_Answering/datasets/Appliances.json.gz')

In [190]:
application.head()

Unnamed: 0,questionType,asin,answerTime,unixTime,question,answerType,answer
0,yes/no,B00004U9JP,"Jun 27, 2014",1403852000.0,I have a 9 year old Badger 1 that needs replac...,?,I replaced my old one with this without a hitch.
1,open-ended,B00004U9JP,"Apr 28, 2014",1398668000.0,model number,,This may help InSinkErator Model BADGER-1: Bad...
2,yes/no,B00004U9JP,"Aug 25, 2014",1408950000.0,can I replace Badger 1 1/3 with a Badger 5 1/2...,?,Plumbing connections will vary with different ...
3,yes/no,B00004U9JP,"Nov 3, 2014",1415002000.0,Does this come with power cord and dishwasher ...,?,It does not come with a power cord. It does co...
4,open-ended,B00004U9JP,"Jun 21, 2014",1403334000.0,loud noise inside when turned on. sounds like ...,,Check if you dropped something inside.Usually ...


In [191]:
application.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9011 entries, 0 to 9010
Data columns (total 7 columns):
questionType    9011 non-null object
asin            9011 non-null object
answerTime      9011 non-null object
unixTime        8590 non-null float64
question        9011 non-null object
answerType      4693 non-null object
answer          9011 non-null object
dtypes: float64(1), object(6)
memory usage: 563.2+ KB


In [192]:
application.isnull().sum()

questionType       0
asin               0
answerTime         0
unixTime         421
question           0
answerType      4318
answer             0
dtype: int64

In [193]:
application = application.fillna(0)

In [194]:
#Choose one product ID analysis
ap9LP = application.loc[application['asin']=='B00004U9JP',['asin','question','answer','answerType','questionType']]

In [253]:
ap9LP

Unnamed: 0,asin,question,answer,answerType,questionType
0,B00004U9JP,I have a 9 year old Badger 1 that needs replac...,I replaced my old one with this without a hitch.,?,yes/no
1,B00004U9JP,model number,This may help InSinkErator Model BADGER-1: Bad...,0,open-ended
2,B00004U9JP,can I replace Badger 1 1/3 with a Badger 5 1/2...,Plumbing connections will vary with different ...,?,yes/no
3,B00004U9JP,Does this come with power cord and dishwasher ...,It does not come with a power cord. It does co...,?,yes/no
4,B00004U9JP,loud noise inside when turned on. sounds like ...,Check if you dropped something inside.Usually ...,0,open-ended
5,B00004U9JP,where is the reset button located,on the bottom,0,open-ended


In [195]:
from textblob.sentiments import NaiveBayesAnalyzer, PatternAnalyzer
from textblob import Blobber
tba = Blobber(analyzer=NaiveBayesAnalyzer())

In [196]:
from nltk.corpus import webtext
webtext.fileids()
from nltk.corpus import nps_chat
nps_chat.fileids()

['10-19-20s_706posts.xml',
 '10-19-30s_705posts.xml',
 '10-19-40s_686posts.xml',
 '10-19-adults_706posts.xml',
 '10-24-40s_706posts.xml',
 '10-26-teens_706posts.xml',
 '11-06-adults_706posts.xml',
 '11-08-20s_705posts.xml',
 '11-08-40s_706posts.xml',
 '11-08-adults_705posts.xml',
 '11-08-teens_706posts.xml',
 '11-09-20s_706posts.xml',
 '11-09-40s_706posts.xml',
 '11-09-adults_706posts.xml',
 '11-09-teens_706posts.xml']

In [249]:
from nltk.corpus import stopwords
stopwords = stopwords.words('english')

In [250]:
stop_words = nltk.corpus.stopwords.words('english') + [
    '.',
    ',',
    '--',
    '\'s',
    '?',
    ')',
    '(',
    ':',
    '\'',
    '\'re',
    '"',
    '-',
    '}',
    '{',
    u'—',
    ]

In [251]:
from nltk.tokenize import RegexpTokenizer
#w means tokens are made of only alphanumeric characters where + indicates that they comprise of one or more of such characters
tokenizer = RegexpTokenizer('\w+')

In [252]:
#add token in answer question
def getToken(text):
    token_as = []
    for i in text:
        tokens = tokenizer.tokenize(i)
        token_as.append(tokens)
    #return token_as
    return token_as

In [205]:
tf = getToken(ap9LP['answer'])

In [207]:
#add stop words
def addstopword(text):
    stop_w = []
    for e in text:
        stopwords = nltk.corpus.stopwords.words('english')
        content = [w for w in e if w.lower() not in stopwords]
        stop_w.append(content)
    return stop_w

In [208]:
tfs = addstopword(tf)

In [211]:
#Text Normalization
from nltk.stem import PorterStemmer
porter_stemmer = PorterStemmer()
from nltk.stem.snowball import SnowballStemmer
snowball_stemmer = SnowballStemmer('english')

In [212]:
def checkWord(text):
    rightW = []
    for w in text:
        stemmed_tokens = [snowball_stemmer.stem(t) for t in w]
        rightW.append(w)
    return rightW

In [214]:
allCheck = checkWord(tfs)

In [243]:
#Text Semantic Analysis
def textSemantic(text):
    txS = []
    for s in text:
        tagged_tokens = [nltk.pos_tag(s) for token in tokens] 
        pos_tagged_chunks = [nltk.ne_chunk(chunk) for chunk in tagged_tokens] 
        txS.append(pos_tagged_chunks)
    return txS

In [245]:
textSemAna = textSemantic(allCheck)

In [259]:
#Collocations and n-grams
from nltk import bigrams
bgrs = allCheck[0]
for i in range(4):
    print (bgrs.__next__())

AttributeError: 'list' object has no attribute '__next__'