In [1]:
import numpy as np 
import pandas as pd

# Open or Closed Question

In [2]:
df_open = pd.read_csv('open_dataset.csv')
df_open.head()

Unnamed: 0,Id,Title,Tags,Body,Class
0,14106913,Replacement for DoesUserHavePermissions for Li...,<sharepoint>,<p>I'm working on an app that reuses some code...,1
1,14106917,Generating all possible rgb colors,<python><colors>,<p>It seems like it'd be much simpler than it ...,1
2,14106924,iOS capture view with background thread,<ios><screen-capture><ipad-3>,<p>I'm not sure how to efficiently capture the...,1
3,14106970,"Region of interest and Data vertices (3D) , ma...",<matlab><figure>,<p>i want to have a simple function similar to...,1
4,14106981,How to inspect git remote respository,<git>,<p>Is there a way to look at just the log on a...,1


In [3]:
df_close = pd.read_csv('closed_dataset.csv')
df_close.head()

Unnamed: 0,Id,Title,Tags,Body,Class
0,14107010,How to download images from a list of scraped ...,<python><image><screen-scraping><beautifulsoup>,<blockquote>\n <p><strong>Possible Duplicate:...,0
1,14107176,"PHP if statement: use ""OR"" or ||?",<php><if-statement><conditional><pipe>,<blockquote>\n <p><strong>Possible Duplicate:...,0
2,14107308,Use the value of a variable from a procedure i...,<delphi-2009>,<p>can I get the value of a variable declared ...,0
3,14107774,"Java Generics - When to use ""T"" and ""?""",<java><generics>,<p>What is the difference?</p>\n\n<pre><code>p...,0
4,14108699,"How to allow all characters (Chinese, Spanish,...",<mysql>,<blockquote>\n <p><strong>Possible Duplicate:...,0


In [4]:
# combined dataset
df_open_close = pd.concat([df_open, df_close])

## Add space tags

In [5]:
def add_space(text):
  return text.replace(">", " ")

df_open_close['Tags'] = df_open_close['Tags'].apply(add_space)

## Text feature

In [6]:
df_open_close['Text'] = df_open_close['Title']+' '+ df_open_close['Tags']+' '+ df_open_close['Body']

In [7]:
df_open_close['Text']

0        Replacement for DoesUserHavePermissions for Li...
1        Generating all possible rgb colors <python <co...
2        iOS capture view with background thread <ios <...
3        Region of interest and Data vertices (3D) , ma...
4        How to inspect git remote respository <git  <p...
                               ...                        
49995    How can we reuse a Sqlite file in Android whic...
49996    somebody please explain this? <objective-c <me...
49997    Java application to execute commands in comman...
49998    How do you know what system call is invoked wh...
49999    Handling echo messages with multiple forms in ...
Name: Text, Length: 100000, dtype: object

## Cleaning text

In [8]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

df_open_close = df_open_close.reset_index(drop=True)
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.
    text = BAD_SYMBOLS_RE.sub('', text) # remove symbols which are in BAD_SYMBOLS_RE from text. substitute the matched string in BAD_SYMBOLS_RE with nothing. 
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove stopwors from text
    return text

df_open_close['Text'] = df_open_close['Text'].apply(clean_text)
df_open_close['Text'] = df_open_close['Text'].str.replace('\d+', '')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Yusuf\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
df_open_close['Text']

0        replacement doesuserhavepermissions list objec...
1        generating possible rgb colors python colors p...
2        ios capture view background thread ios screenc...
3        region interest data vertices d matlab matlab ...
4        inspect git remote respository git pis way loo...
                               ...                        
99995    reuse sqlite file android already created ios ...
99996    somebody please explain objectivec methods use...
99997    java application execute commands command prom...
99998    know system call invoked executable file run l...
99999    handling echo messages multiple forms php php ...
Name: Text, Length: 100000, dtype: object

## Stemming

In [10]:
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
porter = PorterStemmer()

def stemSentence(sentence):
    token_words=word_tokenize(sentence)
    token_words
    stem_sentence=[]
    for word in token_words:
        stem_sentence.append(porter.stem(word))
        stem_sentence.append(" ")
    return "".join(stem_sentence)

df_open_close['Text'] = df_open_close['Text'].apply(stemSentence)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Yusuf\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 10000
# Max number of words in each complaint.
MAX_SEQUENCE_LENGTH = 250
# This is fixed.
EMBEDDING_DIM = 100 

## Tokenizer

In [12]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(df_open_close['Text'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Using TensorFlow backend.


Found 444262 unique tokens.


In [13]:
from keras.preprocessing.sequence import pad_sequences

X_open_close = tokenizer.texts_to_sequences(df_open_close['Text'].values)
X_open_close = pad_sequences(X_open_close, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X_open_close.shape)

Shape of data tensor: (100000, 250)


In [14]:
Y_open_close = pd.get_dummies(df_open_close['Class']).values
print('Shape of label tensor:', Y_open_close.shape)

Shape of label tensor: (100000, 2)


## Evalution Function

In [15]:
from keras import backend as K

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

## Model


In [16]:
from keras.models import Sequential 
from keras.layers import Embedding
from keras.layers import Dense
from keras.layers import Dense, GRU
from keras.callbacks import ModelCheckpoint, EarlyStopping

def create_model_1():
    model = Sequential()
    model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X_open_close.shape[1]))
    model.add(GRU(50))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc', f1_m, precision_m, recall_m])
    return model

## Train and Evaluate

In [20]:
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
import numpy as np

n_split=30

for train_index, test_index in KFold(n_split).split(X_open_close):
  x_train,x_test= X_open_close[train_index], X_open_close[test_index]
  y_train,y_test= Y_open_close[train_index], Y_open_close[test_index]
  print(y_test)
  
  model=create_model_1()
  model.fit(x_train, y_train, epochs=1, batch_size=16, validation_split=0.1, callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])
  accr = model.evaluate(x_test,y_test)
  y_pred = model.predict(x_test)
  print(y_pred)
  matrix = confusion_matrix(y_test.argmax(axis=1), y_pred.argmax(axis=1))
  print(matrix)
  print('Loss', accr[0])
  print('Acc', accr[1])
  print('F1', accr[2])
  print('Precision', accr[3])
  print('Recall', accr[4])  

[[0 1]
 [0 1]
 [0 1]
 ...
 [0 1]
 [0 1]
 [0 1]]


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 86999 samples, validate on 9667 samples
Epoch 1/1
[[0.12602009 0.8739799 ]
 [0.78048724 0.21951273]
 [0.49508923 0.50491077]
 ...
 [0.70219237 0.29780763]
 [0.7367379  0.26326218]
 [0.59101003 0.40898997]]
[[   0    0]
 [1188 2146]]
Loss 0.5935359773838956
Acc 0.6436712741851807
F1 0.6425594687461853
Precision 0.6425595283508301
Recall 0.6425595283508301
[[0 1]
 [0 1]
 [0 1]
 ...
 [0 1]
 [0 1]
 [0 1]]


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 86999 samples, validate on 9667 samples
Epoch 1/1
 1056/86999 [..............................] - ETA: 14:07 - loss: 0.6892 - acc: 0.5199 - f1_m: 0.5199 - precision_m: 0.5199 - recall_m: 0.5199

KeyboardInterrupt: 

# Closed Question Reason 

In [21]:
df_reason = pd.read_csv('closed_reason_dataset.csv')
df_reason.head()

Unnamed: 0,Id,Title,Tags,Body,reason
0,14107648,One-liner for calling a function for each line...,<python><file-io>,<p>I need to keep calling <code>sync</code> fo...,4
1,14107765,Get a string from a file,<python><file-io><python-2.7>,<p>I have a file of titles and passwords forma...,4
2,14108360,where can I get logcat,<android><logcat>,<p>I downloaded the combined Eclipse/ Android ...,2
3,14107010,How to download images from a list of scraped ...,<python><image><screen-scraping><beautifulsoup>,<blockquote>\n <p><strong>Possible Duplicate:...,1
4,14108856,How to color individual items of tr:selectOneC...,<java><html><jsf><jsf-1.2>,<blockquote>\n <p><strong>Possible Duplicate:...,1


## Add space tags

In [22]:
df_open_close['Tags'] = df_open_close['Tags'].apply(add_space)

## Text Feature

In [23]:
df_reason['Text'] = df_reason['Title']+' '+ df_reason['Tags']+' '+ df_reason['Body']
df_reason

Unnamed: 0,Id,Title,Tags,Body,reason,Text
0,14107648,One-liner for calling a function for each line...,<python><file-io>,<p>I need to keep calling <code>sync</code> fo...,4,One-liner for calling a function for each line...
1,14107765,Get a string from a file,<python><file-io><python-2.7>,<p>I have a file of titles and passwords forma...,4,Get a string from a file <python><file-io><pyt...
2,14108360,where can I get logcat,<android><logcat>,<p>I downloaded the combined Eclipse/ Android ...,2,where can I get logcat <android><logcat> <p>I ...
3,14107010,How to download images from a list of scraped ...,<python><image><screen-scraping><beautifulsoup>,<blockquote>\n <p><strong>Possible Duplicate:...,1,How to download images from a list of scraped ...
4,14108856,How to color individual items of tr:selectOneC...,<java><html><jsf><jsf-1.2>,<blockquote>\n <p><strong>Possible Duplicate:...,1,How to color individual items of tr:selectOneC...
...,...,...,...,...,...,...
49963,17300825,Why should developer care whether ios device w...,<iphone><ios><ipad><jailbreak>,<p>According to statistics I've found here <a ...,2,Why should developer care whether ios device w...
49964,17300941,"Evented, Threaded, and Go Routines, why not us...",<java><multithreading><concurrency><jvm><go>,<p>The Evented and Threaded models are quite p...,3,"Evented, Threaded, and Go Routines, why not us..."
49965,17301722,How to copy a string using for loop,<java><android><performance><textview>,<p>I have a string question in which a string ...,4,How to copy a string using for loop <java><and...
49966,17302096,Linq performance: Any vs. Contains,<c#><performance><linq>,"<p>This question is related to <a href=""https:...",1,Linq performance: Any vs. Contains <c#><perfor...


# Cleaning Text


In [24]:
df_reason['Text'] = df_reason['Text'].apply(clean_text)
df_reason['Text'] = df_reason['Text'].str.replace('\d+', '')

# Stemming

In [25]:
df_reason['Text'] = df_reason['Text'].apply(stemSentence)

In [26]:
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 10000
# Max number of words in each complaint.
MAX_SEQUENCE_LENGTH = 250
# This is fixed.
EMBEDDING_DIM = 100 

# Tokenizer

In [27]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(df_reason['Text'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 298994 unique tokens.


In [28]:
X_reason = tokenizer.texts_to_sequences(df_reason['Text'].values)
X_reason = pad_sequences(X_reason, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X_reason.shape)

Shape of data tensor: (49968, 250)


In [29]:
Y_reason = pd.get_dummies(df_reason['reason']).values
print('Shape of label tensor:', Y_reason.shape)

Shape of label tensor: (49968, 5)


In [30]:
def create_model_2():
    model = Sequential()
    model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X_open_close.shape[1]))
    model.add(GRU(50))
    model.add(Dense(5, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc', f1_m, precision_m, recall_m])
    return model

## Train and Evaluate

In [None]:
from sklearn.model_selection import KFold
import numpy as np

n_split=30

for train_index, test_index in KFold(n_split).split(X_reason):
  x_train,x_test= X_reason[train_index], X_reason[test_index]
  y_train,y_test= Y_reason[train_index], Y_reason[test_index]
  
  model=create_model_2()
  model.fit(x_train, y_train, epochs=20, batch_size=64, validation_split=0.1, callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])
  accr = model.evaluate(x_test,y_test)
  y_pred = model.predict(x_test)
  print(y_pred)
  matrix = confusion_matrix(y_test.argmax(axis=1), y_pred.argmax(axis=1))
  print(matrix)
  print('Loss', accr[0])
  print('Acc', accr[1])
  print('F1', accr[2])
  print('Precision', accr[3])
  print('Recall', accr[4])  