# 1. Required imports

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [10]:
# Text specific libraries import
import nltk

# 2. Data Loading

In [2]:
# Load the Stack Overflow posts dataset from a CSV in the working directory.
# NOTE(review): the preview of this frame shows an 'Unnamed: 0' column, so the
# file appears to carry a saved index -- consider index_col=0 if it is unused.
csv_path = 'stack-overflow-data.csv'
data = pd.read_csv(csv_path)

In [4]:
# Preview the first five rows to see which columns were loaded.
data.head(n=5)

Unnamed: 0,post,tags
0,what is causing this behavior in our c# datet...,c#
1,have dynamic html load as if it was in an ifra...,asp.net
2,how to convert a float value in to min:sec i ...,objective-c
3,.net framework 4 redistributable just wonderi...,.net
4,trying to calculate and print the mean and its...,python


In [5]:
# Pull the 'post' column out of the frame and materialise it as a Python list.
post_column = data['post']
text = post_column.tolist()

In [7]:
# Show the first raw post (kept as a one-element list).
text[0:1]

['what is causing this behavior  in our c# datetime type  <pre><code>[test] public void sadness() {    var datetime = datetime.utcnow;    assert.that(datetime  is.equalto(datetime.parse(datetime.tostring()))); } </code></pre>   failed :   <pre><code> expected: 2011-10-31 06:12:44.000  but was:  2011-10-31 06:12:44.350 </code></pre>   i wish to know what is happening behind the scenes in tostring() etc to cause this behavior.    edit after seeing jon s answer :   <pre><code>[test] public void newsadness() {     var datetime = datetime.utcnow;     assert.that(datetime  is.equalto(datetime.parse(datetime.tostring( o )))); } </code></pre>   result :   <pre><code>expected: 2011-10-31 12:03:04.161 but was:  2011-10-31 06:33:04.161 </code></pre>   same result with capital and small  o  . i m reading up the docs  but still unclear.']

In [8]:
# Classification target: the 'tags' column of the loaded frame.
target = data['tags']

In [9]:
# Preview the first five tag labels.
target.head(5)

0             c#
1        asp.net
2    objective-c
3           .net
4         python
Name: tags, dtype: object

# 3. Data Preprocessing

## 3.1 Text Data Cleaning

In [13]:
# Use gensim to preprocess the data.
# With its default filters, this single gensim function will:
# lowercase the text,
# strip HTML-style tags,
# strip punctuation,
# collapse repeated whitespace,
# strip digits,
# remove stopwords,
# drop very short tokens,
# stem the remaining tokens
from gensim.parsing.preprocessing import preprocess_string

In [16]:
# Run gensim's preprocess_string over every raw post, yielding one token list
# per document (same order as `text`).
text_preprocessed = list(map(preprocess_string, text))

In [75]:
# Print the token lists produced for the first two posts.
first_two_docs = text_preprocessed[:2]
print(first_two_docs)

[['caus', 'behavior', 'datetim', 'type', 'test', 'public', 'void', 'sad', 'var', 'datetim', 'datetim', 'utcnow', 'assert', 'datetim', 'equalto', 'datetim', 'pars', 'datetim', 'tostr', 'fail', 'expect', 'wish', 'know', 'happen', 'scene', 'tostr', 'caus', 'behavior', 'edit', 'see', 'jon', 'answer', 'test', 'public', 'void', 'newsad', 'var', 'datetim', 'datetim', 'utcnow', 'assert', 'datetim', 'equalto', 'datetim', 'pars', 'datetim', 'tostr', 'result', 'expect', 'result', 'capit', 'small', 'read', 'doc', 'unclear'], ['dynam', 'html', 'load', 'ifram', 'asp', 'net', 'site', 'user', 'save', 'entir', 'html', 'page', 'backend', 'databas', 'want', 'load', 'dynam', 'content', 'div', 'exist', 'page', 'content', 'area', 'coupl', 'thing', 'happen', 'want', 'css', 'affect', 'outsid', 'div', 'try', 'load', 'badli', 'form', 'html', 'imag', 'div', 'outsid', 'content', 'area', 'lot', 'html', 'page', 'us', 'base', 'tag', 'imag', 'link', 'want', 'base', 'tag', 'respect', 'insid', 'div', 'solut', 'go', 'tr

In [34]:
# Number of preprocessed documents (one token list per post).
corpus_size = len(text_preprocessed)
print(corpus_size)

40000


# 4. Data Preparation

## 4.1 Target Encoding

In [30]:
from sklearn.preprocessing import LabelEncoder

In [31]:
# Encoder used to turn the tag labels into integer class ids.
lbl_enc = LabelEncoder()

In [35]:
# Fit the encoder on the tag column and encode it in one step.
enc_target = lbl_enc.fit_transform(target)

In [39]:
# Wrap the encoded labels in a single-column DataFrame named 'target'
# and print its first rows as a sanity check.
target_df = pd.DataFrame({'target': enc_target})
target_preview = target_df.head()
print(target_preview)

   target
0       5
1       3
2      15
3       0
4      17


## 4.2 Text Encoding

In [60]:
from gensim import corpora

In [65]:
# Build a gensim Dictionary from the tokenised corpus; it is indexed by
# integer token id below and used to produce bag-of-words vectors.
idx_to_word = corpora.Dictionary(text_preprocessed)

In [77]:
# Look up the token stored under id 0.
idx_to_word[0]

'answer'

In [78]:
# Convert every tokenised document into its bag-of-words representation:
# a list of (token_id, count) pairs, in the same order as the corpus.
embed_text = list(map(idx_to_word.doc2bow, text_preprocessed))

In [80]:
# Print the bag-of-words vector of the first document.
first_doc_bow = embed_text[:1]
print(first_doc_bow)

[[(0, 1), (1, 2), (2, 2), (3, 1), (4, 2), (5, 11), (6, 1), (7, 1), (8, 2), (9, 2), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 2), (16, 2), (17, 1), (18, 2), (19, 1), (20, 1), (21, 1), (22, 1), (23, 2), (24, 3), (25, 1), (26, 1), (27, 2), (28, 2), (29, 2), (30, 1)]]


In [87]:
# Decode the first document's bag-of-words back into (token, count) pairs
# so the integer ids can be read as words.
[
    [(idx_to_word[token_id], count) for token_id, count in doc_bow]
    for doc_bow in embed_text[:1]
]

[[('answer', 1),
  ('assert', 2),
  ('behavior', 2),
  ('capit', 1),
  ('caus', 2),
  ('datetim', 11),
  ('doc', 1),
  ('edit', 1),
  ('equalto', 2),
  ('expect', 2),
  ('fail', 1),
  ('happen', 1),
  ('jon', 1),
  ('know', 1),
  ('newsad', 1),
  ('pars', 2),
  ('public', 2),
  ('read', 1),
  ('result', 2),
  ('sad', 1),
  ('scene', 1),
  ('see', 1),
  ('small', 1),
  ('test', 2),
  ('tostr', 3),
  ('type', 1),
  ('unclear', 1),
  ('utcnow', 2),
  ('var', 2),
  ('void', 2),
  ('wish', 1)]]

In [81]:
# Report vocabulary size and corpus size.
n_unique_tokens = len(idx_to_word)
n_documents = len(embed_text)
print('Number of unique tokens: %d' % n_unique_tokens)
print('Number of documents: %d' % n_documents)

Number of unique tokens: 107897
Number of documents: 40000


In [88]:
from sklearn.model_selection import train_test_split

In [90]:
# Hold out 20% of the documents for testing; the fixed random_state keeps
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    embed_text,
    target_df,
    test_size=0.20,
    random_state=42,
)

In [91]:
from sklearn.feature_extraction.text import TfidfTransformer