In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Flatten,Embedding,Activation, Dropout, Input, LSTM
from tensorflow.keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D 
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model

import numpy as np
from numpy import array
import pandas as pd

from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")

We will do the import through google drive here due to big csv file

In [2]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [3]:
path = "/content/drive/MyDrive/Questions.csv"
df_questions = pd.read_csv(path,encoding='ISO-8859-1')

In [4]:
path2 = "/content/drive/MyDrive/Tags.csv"
df_tags = pd.read_csv(path2,encoding='ISO-8859-1')

In [5]:
df_questions.shape, df_tags.shape

((1264216, 7), (3750994, 2))

In [6]:
df_questions.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,ClosedDate,Score,Title,Body
0,80,26.0,2008-08-01T13:57:07Z,,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...
1,90,58.0,2008-08-01T14:41:24Z,2012-12-26T03:45:49Z,144,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...
2,120,83.0,2008-08-01T15:50:08Z,,21,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...
3,180,2089740.0,2008-08-01T18:42:19Z,,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...
4,260,91.0,2008-08-01T23:22:08Z,,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...


In [7]:
df_tags.head()

Unnamed: 0,Id,Tag
0,80,flex
1,80,actionscript-3
2,80,air
3,90,svn
4,90,tortoisesvn


In [8]:
top_10_tags=df_tags['Tag'].value_counts().index[:10]
top_10_tags

Index(['javascript', 'java', 'c#', 'php', 'android', 'jquery', 'python',
       'html', 'c++', 'ios'],
      dtype='object')

In [9]:
# get top 10 most frequent names,now we will take below top 10 frequest tags in our dataset for our model
# What exactly does adding .index does? Why can't I leave it till [:n]?
# The returned data structure will have the name values stored in the index, 
# with their respective counts stored as the value. So if you didn't use index,
# you'd get a list of the most frequent counts, not the associated name.
# n = 10
# df_tags['Tag'].value_counts()[:n].index.tolist()

In [10]:
tags=df_tags[df_tags['Tag'].isin(top_10_tags)]
tags.head()
# this way is better than writing 10 or conditions

Unnamed: 0,Id,Tag
14,260,c#
18,330,c++
28,650,c#
35,930,c#
39,1010,c#


In [11]:
# https://www.ritchieng.com/pandas-multi-criteria-filtering/
# filtering data to get 10 most occuring tags.
# df_filter_tag = df_tags[(df_tags.Tag == 'javascript') | (df_tags.Tag == 'Java') | (df_tags.Tag == 'c#') | 
# (df_tags.Tag == 'php') | (df_tags.Tag == 'android') | (df_tags.Tag == 'jquery') | (df_tags.Tag == 'python') |
# (df_tags.Tag == 'html') | (df_tags.Tag == 'c++') | (df_tags.Tag == 'ios')].reset_index(drop=True)


In [12]:
tags.shape

(826739, 2)

Before we merge both dataframes. We will have to group tags by the id since a ID can have multiple tags. We will use the groupeby function and then merge the dataframes on the id.

In [13]:
# df_tags_new=df_filter_tag.groupby('Id')['Tag'].apply(list).reset_index()--# this creates list which create problem of
# TypeError: unhashable type: 'list'.
tags = tags.groupby("Id")['Tag'].apply(lambda tags: ' '.join(tags))
tags.head()

Id
260      c#
330     c++
650      c#
930      c#
1010     c#
Name: Tag, dtype: object

In [14]:
tag = pd.DataFrame({'Id':tags.index, 'Tag':tags.values})
tag.head()

Unnamed: 0,Id,Tag
0,260,c#
1,330,c++
2,650,c#
3,930,c#
4,1010,c#


In [15]:
tag.shape

# around 1 lakh rows reduced

(706336, 2)

In [16]:
df_combine = pd.merge(tag, df_questions, how ='left', on ='Id')
df_combine.head()

Unnamed: 0,Id,Tag,OwnerUserId,CreationDate,ClosedDate,Score,Title,Body
0,260,c#,91.0,2008-08-01T23:22:08Z,,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...
1,330,c++,63.0,2008-08-02T02:51:36Z,,29,Should I use nested classes in this case?,<p>I am working on a collection of classes use...
2,650,c#,143.0,2008-08-03T11:12:52Z,,79,Automatically update version number,<p>I would like the version property of my app...
3,930,c#,245.0,2008-08-04T00:47:25Z,,28,How do I connect to a database and loop over a...,<p>What's the simplest way to connect and quer...
4,1010,c#,67.0,2008-08-04T03:59:42Z,,14,"How to get the value of built, encoded ViewState?",<p>I need to grab the base64-encoded represent...


In [17]:
df_combine.shape

(706336, 8)

Now, we will take only quesions witha score greater than 5. We doing that for 2 reasons:

1- We don't have to deal with "Your session crashed after using all available RAM in Google Collab"

2- The posts will probably be with a better quality and will be better tagged since they have lots of upvotes.

In [18]:
new_df = df_combine[df_combine.Score>5]

In [19]:
new_df.shape

(39857, 8)

In [20]:
drop_cols=['Id','OwnerUserId','CreationDate','ClosedDate','Score']

In [21]:
new_df.drop(drop_cols,1,inplace=True)  # 1 is axis as we are dropping columns
new_df.head()

Unnamed: 0,Tag,Title,Body
0,c#,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...
1,c++,Should I use nested classes in this case?,<p>I am working on a collection of classes use...
2,c#,Automatically update version number,<p>I would like the version property of my app...
3,c#,How do I connect to a database and loop over a...,<p>What's the simplest way to connect and quer...
4,c#,"How to get the value of built, encoded ViewState?",<p>I need to grab the base64-encoded represent...


In [22]:
X = new_df['Body']
Y = new_df['Tag']

Preprocessing and Cleaning



In [23]:
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import seaborn as sns
import warnings

import re
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import ToktokTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from string import punctuation
from sklearn.preprocessing import LabelEncoder

In the next two columns: Body and Title, we will use lots of text processing:

Removing html format
Lowering text
Transforming abbreviations
Removing punctuation (but keeping words like c# since it's the most popular tag)
Lemmatizing words
Removing stop words

In [24]:
 # Converting html to text in the body

X = X.apply(lambda x: BeautifulSoup(x).get_text()) 

In [25]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "can not ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub(r"\'\n", " ", text)
    text = re.sub(r"\'\xa0", " ", text)
    text = re.sub('\s+', ' ', text)
    text = text.strip(' ')
    return text

In [26]:
X = X.apply(lambda x: clean_text(x))


In [27]:
X.head()

0    i have a little game written in c#. it uses a ...
1    i am working on a collection of classes used f...
2    i would like the version property of my applic...
3    what is the simplest way to connect and query ...
4    i need to grab the base64-encoded representati...
Name: Body, dtype: object

In [28]:
X[1]

'i am working on a collection of classes used for video playback and recording. i have one main class which acts like the public interface, with methods like play(), stop(), pause(), record() etc... then i have workhorse classes which do the video decoding and video encoding. i just learned about the existence of nested classes in c++, and i am curious to know what programmers think about using them. i am a little wary and not really sure what the benefits/drawbacks are, but they seem (according to the book i am reading) to be used in cases such as mine. the book suggests that in a scenario like mine, a good solution would be to nest the workhorse classes inside the interface class, so there are no separate files for classes the client is not meant to use, and to avoid any possible naming conflicts? i do not know about these justifications. nested classes are a new concept to me. just want to see what programmers think about the issue.'

In [29]:
# Using multilabelbinarizer to convert the list to target columns as its multilable problem
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()

In [30]:
mlb.fit(Y)

MultiLabelBinarizer(classes=None, sparse_output=False)

In [31]:
Y =  mlb.transform(Y)

In [32]:
Y

array([[0, 1, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 1, ..., 0, 0, 0]])

In [33]:
Y.shape        # 22 here means there is 22 unique values, final layer in architecture will be 22 nodes

(39857, 22)

In [34]:
(Y.sum(axis=1)>1).sum() # you can see there around 10% cases where
# there are multiple tags
# but there is no changes happening

39857

In [35]:
# splitting the data into train and text
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.20, random_state = 42)

In [36]:
#tokenizer to read all the words present in our corpus, that is all the unique words are getting assigned to numbers

# token = Tokenizer()
token = Tokenizer(char_level=False,split=' ')  #if I wanted to split alphabet wise then char_level=True and Split=''
token.fit_on_texts(X_train)

In [37]:
token.word_index

{'the': 1,
 'i': 2,
 'to': 3,
 'a': 4,
 'is': 5,
 'in': 6,
 'and': 7,
 'this': 8,
 'of': 9,
 'it': 10,
 'not': 11,
 'that': 12,
 'for': 13,
 'have': 14,
 '0': 15,
 'if': 16,
 'am': 17,
 '1': 18,
 'but': 19,
 'my': 20,
 'with': 21,
 'on': 22,
 'android': 23,
 'be': 24,
 'can': 25,
 'do': 26,
 'class': 27,
 'as': 28,
 'new': 29,
 'java': 30,
 'at': 31,
 'an': 32,
 'from': 33,
 'code': 34,
 'string': 35,
 '2': 36,
 'public': 37,
 'using': 38,
 'so': 39,
 'or': 40,
 'would': 41,
 'like': 42,
 'how': 43,
 'are': 44,
 'get': 45,
 'there': 46,
 'when': 47,
 'what': 48,
 "'": 49,
 'function': 50,
 'id': 51,
 'int': 52,
 'return': 53,
 'name': 54,
 'file': 55,
 'use': 56,
 'c': 57,
 'data': 58,
 'any': 59,
 'which': 60,
 'does': 61,
 'error': 62,
 'some': 63,
 'all': 64,
 'want': 65,
 '3': 66,
 'you': 67,
 'com': 68,
 'type': 69,
 'app': 70,
 'way': 71,
 'value': 72,
 'here': 73,
 'one': 74,
 'will': 75,
 'void': 76,
 'by': 77,
 'method': 78,
 'var': 79,
 '4': 80,
 'e': 81,
 'object': 82,
 '5':

In [38]:
#declaring the vocab_size, the number of unique words

vocab_size  = len(token.word_index) + 1
vocab_size

147322

In [39]:
#conversion to numerical formats, here all the assigned numbers sequenced as per the sentence.
encoded_text = token.texts_to_sequences(X_train)

In [40]:
# Sequences are nothing but list of list numerical representation of X
print(encoded_text[0:1])

[[2, 94, 3, 14, 74, 456, 6, 20, 295, 3602, 77, 295, 28, 921, 9, 788, 921, 9, 79833, 2, 17, 38, 34, 151, 291, 3, 145, 20, 295, 73, 5, 48, 2, 679, 37, 27, 15336, 197, 37, 52, 1872, 45, 84, 37, 347, 53969, 45, 84, 37, 27, 6221, 197, 37, 52, 1872, 45, 84, 37, 347, 53970, 45, 84, 37, 27, 79834, 197, 37, 52, 1872, 45, 84, 37, 347, 921, 45, 84, 8, 691, 3, 24, 3602, 77, 432, 28, 288, 921, 53969, 33, 15336, 136, 1872, 8, 1872, 288, 921, 53970, 33, 6221, 136, 1872, 8, 1872, 43, 25, 2, 829, 8, 6, 2448, 41933]]


In [41]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [42]:
import nltk
from nltk.tokenize import word_tokenize

In [43]:
# here we are trying to find sentence with maximum words
sent_lens=[]
for sent in X_train:
    sent_lens.append(len(word_tokenize(sent)))

In [44]:
max(sent_lens)

10972

In [45]:
# here 95% of the training example(data) has word count less than 622
np.quantile(sent_lens,0.95)

622.0

In [46]:
# 'max_length' = 622 means we are considering max 622 words or token only which is nothing but time steps.
# padding='post' means that we padding post the sentence(keeping values 0 if the tokens are not there)
# sequence are nothing but list of list numerical representation of my X
# to the sequence obtained above we pad them and convert every sequence to max_length
# default is padding = 'pre', for post padding we can mention padding = 'post', we should consider default padding

max_length = 622
sequences_matrix_train = pad_sequences(encoded_text, maxlen=max_length)

In [47]:
print(sequences_matrix_train)

[[    0     0     0 ...     6  2448 41933]
 [    0     0     0 ...   355   162   396]
 [    0     0     0 ...   297     7  6041]
 ...
 [    0     0     0 ...    15    18    15]
 [    0     0     0 ...  2359     8   162]
 [    0     0     0 ...    71   453     8]]


In [48]:
sequences_matrix_train.shape

(31885, 622)

In [49]:
pd.DataFrame(sequences_matrix_train).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,582,583,584,585,586,587,588,589,590,591,592,593,594,595,596,597,598,599,600,601,602,603,604,605,606,607,608,609,610,611,612,613,614,615,616,617,618,619,620,621
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,84,37,347,921,45,84,8,691,3,24,3602,77,432,28,288,921,53969,33,15336,136,1872,8,1872,288,921,53970,33,6221,136,1872,8,1872,43,25,2,829,8,6,2448,41933
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,101,101,7,977,101,158,67,519,14,3,2438,1,150,4,576,598,3,79836,1,325,2,17,11,324,19,10,180,12,1,301,643,230,94,3,24,303,28,355,162,396
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,9,5687,314,190,3,1996,32,1019,1418,131,225,5,8,180,2,17,438,38,4412,88,333,4412,209,175,10,5,763,3,499,22,1019,131,19,41,1482,5687,1067,297,7,6041
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1075,7,65,3,84,1,6820,9,225,3,1,151,456,9,53973,7,1,149,9,946,3,1,367,28,2,17,29,3,200,372,1467,1828,1,1418,71,25,63,10136,1384,162
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,8,5,1,221,645,19,3,86,8,53976,9,79837,572,48,425,26,2,14,3,829,48,2,65,41,1,56,9,3458,191,73,156,61,9348,1,27,28,1147,1138,1,325


Model building

In [50]:
def RNN():
    inputs = Input(name='inputs',shape=[max_length])
    layer = Embedding(input_dim= vocab_size+1,output_dim= 300,input_length=max_length,mask_zero=True)(inputs)
    layer = LSTM(64)(layer)  ## (num_of_obs, number of timestep,feature_dimensions)
    layer = Dense(256,name='FC1')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.5)(layer)
    layer = Dense(22,name='out_layer')(layer)
    layer = Activation('sigmoid')(layer)
    model = Model(inputs=inputs,outputs=layer)
    return model

In [51]:
model = RNN()
model.summary()
model.compile(loss='categorical_crossentropy',optimizer='adam',
              metrics=['accuracy'])

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          [(None, 622)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 622, 300)          44196900  
_________________________________________________________________
lstm (LSTM)                  (None, 64)                93440     
_________________________________________________________________
FC1 (Dense)                  (None, 256)               16640     
_________________________________________________________________
activation (Activation)      (None, 256)               0         
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
out_layer (Dense)            (None, 22)                5654  

In [52]:
encoded_text_test = token.texts_to_sequences(X_test)
sequences_matrix_test = pad_sequences(encoded_text_test, maxlen=max_length)

In [None]:
model.fit(sequences_matrix_train,Y_train,epochs=10,
          validation_data=(sequences_matrix_test,Y_test))

Epoch 1/10