In [2]:
import numpy as np 
import pandas as pd

# Open or Closed Question

In [2]:
df_open = pd.read_csv('open_dataset.csv')
df_open.head()

Unnamed: 0,Id,Title,Tags,Body,Class
0,14106913,Replacement for DoesUserHavePermissions for Li...,<sharepoint>,<p>I'm working on an app that reuses some code...,1
1,14106917,Generating all possible rgb colors,<python><colors>,<p>It seems like it'd be much simpler than it ...,1
2,14106924,iOS capture view with background thread,<ios><screen-capture><ipad-3>,<p>I'm not sure how to efficiently capture the...,1
3,14106970,"Region of interest and Data vertices (3D) , ma...",<matlab><figure>,<p>i want to have a simple function similar to...,1
4,14106981,How to inspect git remote respository,<git>,<p>Is there a way to look at just the log on a...,1


In [3]:
df_close = pd.read_csv('closed_dataset.csv')
df_close.head()

Unnamed: 0,Id,Title,Tags,Body,Class
0,14107010,How to download images from a list of scraped ...,<python><image><screen-scraping><beautifulsoup>,<blockquote>\n <p><strong>Possible Duplicate:...,0
1,14107176,"PHP if statement: use ""OR"" or ||?",<php><if-statement><conditional><pipe>,<blockquote>\n <p><strong>Possible Duplicate:...,0
2,14107308,Use the value of a variable from a procedure i...,<delphi-2009>,<p>can I get the value of a variable declared ...,0
3,14107774,"Java Generics - When to use ""T"" and ""?""",<java><generics>,<p>What is the difference?</p>\n\n<pre><code>p...,0
4,14108699,"How to allow all characters (Chinese, Spanish,...",<mysql>,<blockquote>\n <p><strong>Possible Duplicate:...,0


In [4]:
# combined dataset
df_open_close = pd.concat([df_open, df_close])

## Add space tags

In [5]:
def add_space(text):
  return text.replace(">", " ")

df_open_close['Tags'] = df_open_close['Tags'].apply(add_space)

## Text feature

In [6]:
df_open_close['Text'] = df_open_close['Title']+' '+ df_open_close['Tags']+' '+ df_open_close['Body']

In [7]:
df_open_close['Text']

0        Replacement for DoesUserHavePermissions for Li...
1        Generating all possible rgb colors <python <co...
2        iOS capture view with background thread <ios <...
3        Region of interest and Data vertices (3D) , ma...
4        How to inspect git remote respository <git  <p...
                               ...                        
49995    How can we reuse a Sqlite file in Android whic...
49996    somebody please explain this? <objective-c <me...
49997    Java application to execute commands in comman...
49998    How do you know what system call is invoked wh...
49999    Handling echo messages with multiple forms in ...
Name: Text, Length: 100000, dtype: object

## Cleaning text

In [8]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

df_open_close = df_open_close.reset_index(drop=True)
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.
    text = BAD_SYMBOLS_RE.sub('', text) # remove symbols which are in BAD_SYMBOLS_RE from text. substitute the matched string in BAD_SYMBOLS_RE with nothing. 
    text = text.replace('x', '')
#    text = re.sub(r'\W+', '', text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove stopwors from text
    return text
df_open_close['Text'] = df_open_close['Text'].apply(clean_text)
df_open_close['Text'] = df_open_close['Text'].str.replace('\d+', '')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Yusuf\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
df_open_close['Text']

0        replacement doesuserhavepermissions list objec...
1        generating possible rgb colors python colors p...
2        ios capture view background thread ios screenc...
3        region interest data vertices d matlab matlab ...
4        inspect git remote respository git pis way loo...
                               ...                        
99995    reuse sqlite file android already created ios ...
99996    somebody please eplain objectivec methods user...
99997    java application eecute commands command promp...
99998    know system call invoked eecutable file run li...
99999    handling echo messages multiple forms php php ...
Name: Text, Length: 100000, dtype: object

## Stemming

In [10]:
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
porter = PorterStemmer()

def stemSentence(sentence):
    token_words=word_tokenize(sentence)
    token_words
    stem_sentence=[]
    for word in token_words:
        stem_sentence.append(porter.stem(word))
        stem_sentence.append(" ")
    return "".join(stem_sentence)

df_open_close['Text'] = df_open_close['Text'].apply(stemSentence)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Yusuf\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 10000
# Max number of words in each complaint.
MAX_SEQUENCE_LENGTH = 250
# This is fixed.
EMBEDDING_DIM = 100 

## Tokenizer

In [12]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(df_open_close['Text'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Using TensorFlow backend.


Found 439649 unique tokens.


In [13]:
from keras.preprocessing.sequence import pad_sequences

X_open_close = tokenizer.texts_to_sequences(df_open_close['Text'].values)
X_open_close = pad_sequences(X_open_close, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X_open_close.shape)

Shape of data tensor: (100000, 250)


In [14]:
Y_open_close = pd.get_dummies(df_open_close['Class']).values
print('Shape of label tensor:', Y_open_close.shape)

Shape of label tensor: (100000, 2)


## Evalution Function

In [15]:
from keras import backend as K

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

## Model


In [18]:
from keras.models import Sequential 
from keras.layers import Embedding
from keras.layers import Dense
from keras.layers import Dense, GRU
from keras.callbacks import ModelCheckpoint, EarlyStopping

def create_model_1():
    model = Sequential()
    model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X_open_close.shape[1]))
    model.add(GRU(50))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc', f1_m, precision_m, recall_m])
    return model

## Train and Evaluate

In [None]:
from sklearn.model_selection import KFold
import numpy as np

n_split=30

for train_index, test_index in KFold(n_split).split(X_open_close):
  x_train,x_test= X_open_close[train_index], X_open_close[test_index]
  y_train,y_test= Y_open_close[train_index], Y_open_close[test_index]
  
  model=create_model_1()
  model.fit(x_train, y_train, epochs=20, batch_size=64, validation_split=0.1, callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])
  accr = model.evaluate(x_test,y_test)
  print('Loss', accr[0])
  print('Acc', accr[1])
  print('F1', accr[2])
  print('Precision', accr[3])
  print('Recall', accr[4])  

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 86999 samples, validate on 9667 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Loss 1.0093197642362397
Acc 0.5764846801757812
F1 0.5771825313568115
Precision 0.5771825313568115
Recall 0.5771825313568115


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 86999 samples, validate on 9667 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Loss 1.1373500037350623
Acc 0.5341931581497192
F1 0.5326388478279114
Precision 0.5326388478279114
Recall 0.5326388478279114


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 86999 samples, validate on 9667 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Loss 0.8529351149480645
Acc 0.6475704908370972
F1 0.6490079164505005
Precision 0.6490079760551453
Recall 0.6490079760551453


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 86999 samples, validate on 9667 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Loss 0.8180234616719539
Acc 0.6310737729072571
F1 0.6300594806671143
Precision 0.630059540271759
Recall 0.630059540271759


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 86999 samples, validate on 9667 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Loss 1.1082545032598476
Acc 0.6346730589866638
F1 0.6349205374717712
Precision 0.634920597076416
Recall 0.634920597076416


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 86999 samples, validate on 9667 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Loss 0.7605174557134358
Acc 0.6793641448020935
F1 0.6792657971382141
Precision 0.6792658567428589
Recall 0.6792658567428589


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 86999 samples, validate on 9667 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Loss 0.6563790688345943
Acc 0.722255527973175
F1 0.7231150269508362
Precision 0.723115086555481
Recall 0.723115086555481


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 86999 samples, validate on 9667 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Loss 0.7810211450523006
Acc 0.6973605155944824
F1 0.6945436000823975
Precision 0.6945436596870422
Recall 0.6945436596870422


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 86999 samples, validate on 9667 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Loss 0.498595490649423
Acc 0.7558488249778748
F1 0.7551586627960205
Precision 0.7551587224006653
Recall 0.7551587224006653


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 86999 samples, validate on 9667 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Loss 0.5902820699359388
Acc 0.7057588696479797
F1 0.7054562568664551
Precision 0.7054563164710999
Recall 0.7054563164710999


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 87000 samples, validate on 9667 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Loss 0.7135956153081339
Acc 0.7347734570503235
F1 0.7352975606918335
Precision 0.7352976202964783
Recall 0.7352976202964783


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 87000 samples, validate on 9667 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Loss 0.5950729338088409
Acc 0.7080708146095276
F1 0.7104166150093079
Precision 0.7104166746139526
Recall 0.7104166746139526


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 87000 samples, validate on 9667 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Loss 0.7627969770648978
Acc 0.696069598197937
F1 0.6952975392341614
Precision 0.6952975988388062
Recall 0.6952975988388062


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 87000 samples, validate on 9667 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Loss 0.8257125138547947
Acc 0.6726672649383545
F1 0.6736904382705688
Precision 0.6736904978752136
Recall 0.6736904978752136


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 87000 samples, validate on 9667 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Loss 0.9729812183860826
Acc 0.6468647122383118
F1 0.6432737112045288
Precision 0.6432737708091736
Recall 0.6432737708091736


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 87000 samples, validate on 9667 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Loss 1.6229536149224968
Acc 0.4626462757587433
F1 0.4637499451637268
Precision 0.4637499749660492
Recall 0.4637499749660492


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 87000 samples, validate on 9667 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Loss 0.6632307194533235
Acc 0.7275727391242981
F1 0.7233332395553589
Precision 0.7233332991600037
Recall 0.7233332991600037


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 87000 samples, validate on 9667 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Loss 0.6640354142849273
Acc 0.6864686608314514
F1 0.6873809099197388
Precision 0.6873809695243835
Recall 0.6873809695243835


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 87000 samples, validate on 9667 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Loss 0.7120021357394681
Acc 0.7056705951690674
F1 0.7032142281532288
Precision 0.7032142877578735
Recall 0.7032142877578735


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 87000 samples, validate on 9667 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Loss 0.8982575458697718
Acc 0.6924692392349243
F1 0.6933332681655884
Precision 0.6933333873748779
Recall 0.6933333873748779


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 87000 samples, validate on 9667 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Loss 0.6975961494402881
Acc 0.724572479724884
F1 0.7219642400741577
Precision 0.7219642996788025
Recall 0.7219642996788025


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 87000 samples, validate on 9667 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Loss 0.6708146343351377
Acc 0.7026702761650085
F1 0.7018451690673828
Precision 0.7018452286720276
Recall 0.7018452286720276


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 87000 samples, validate on 9667 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Loss 0.9545411070724382
Acc 0.6933693289756775
F1 0.6942261457443237
Precision 0.6942262053489685
Recall 0.6942262053489685


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 87000 samples, validate on 9667 samples
Epoch 1/20
Epoch 2/20

# Closed Question Reason 

In [1]:
df_reason = pd.read_csv('closed_reason_dataset.csv')
df_reason.head()

NameError: name 'pd' is not defined

## Add space tags

In [None]:
df_open_close['Tags'] = df_open_close['Tags'].apply(add_space)

## Text Feature

In [None]:
df_reason['Text'] = df_reason['Title']+' '+ df_reason['Tags']+' '+ df_reason['Body']
df_reason

# Cleaning Text


In [None]:
df_reason['Text'] = df_reason['Text'].apply(clean_text)
df_reason['Text'] = df_reason['Text'].str.replace('\d+', '')

# Stemming

In [None]:
df_reason['Text'] = df_reason['Text'].apply(stemSentence)

In [None]:
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 10000
# Max number of words in each complaint.
MAX_SEQUENCE_LENGTH = 250
# This is fixed.
EMBEDDING_DIM = 100 

# Tokenizer

In [None]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(df_reason['Text'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

In [None]:
X_reason = tokenizer.texts_to_sequences(df_reason['Text'].values)
X_reason = pad_sequences(X_reason, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X_reason.shape)

In [None]:
Y_reason = pd.get_dummies(df_reason['reason']).values
print('Shape of label tensor:', Y_reason.shape)

In [None]:
def create_model_2():
    model = Sequential()
    model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X_open_close.shape[1]))
    model.add(GRU(50))
    model.add(Dense(5, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc', f1_m, precision_m, recall_m])
    return model

## Train and Evaluate

In [None]:
for train_index, test_index in KFold(n_split).split(X_reason):
  x_train,x_test= X_reason[train_index], X_reason[test_index]
  y_train,y_test= Y_reason[train_index], Y_reason[test_index]
  
  model=create_model_2()
  model.fit(x_train, y_train, epochs=20, batch_size=64, validation_split=0.1, callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])
  accr = model.evaluate(x_test,y_test)
  print('Loss', accr[0])
  print('Acc', accr[1])
  print('F1', accr[2])
  print('Precision', accr[3])
  print('Recall', accr[4])  