In [2]:
import numpy as np
import pandas as pd
import pickle
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import nltk
from nltk.corpus import stopwords
try:
    # NLTK corpus readers load lazily: a missing corpus does not fail the
    # import above, it raises LookupError on first access. Probe it here.
    stopwords.words('english')
except LookupError:
    # Fetch the corpus once; `stopwords` stays bound for the cells below.
    nltk.download('stopwords')
    
from tqdm import tqdm_notebook

import tensorflow as tf

In [3]:
# Compiled once instead of on every call. Raw strings keep the patterns
# identical while avoiding the invalid-escape warnings that a backslash before
# '[' or '|' triggers in ordinary string literals.
_REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
_BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')

# Filled on the first call to text_prepare() and reused afterwards.
_TEXT_PREPARE_RESOURCES = {}


def _text_prepare_resources():
    """Return the shared stopword set, tokenizer and stemmer, building them once."""
    if not _TEXT_PREPARE_RESOURCES:
        # The keyword arguments are evaluated before update() runs, so a
        # failure (e.g. missing corpus) leaves the cache empty and the next
        # call simply retries.
        _TEXT_PREPARE_RESOURCES.update(
            stopwords_set=frozenset(stopwords.words('english')),
            tokenizer=nltk.tokenize.TreebankWordTokenizer(),
            stemmer=nltk.stem.PorterStemmer(),
        )
    return _TEXT_PREPARE_RESOURCES


def text_prepare(text):
    """Performs tokenization and simple preprocessing.

    Steps, in order: lower-case the text, turn the punctuation matched by
    `_REPLACE_BY_SPACE_RE` into spaces, delete every character outside
    `0-9a-z #+_`, drop English stopwords, tokenize with NLTK's Treebank
    tokenizer and stem each token with the Porter stemmer.

    Args:
        text: the string to clean (must support ``.lower()``).

    Returns:
        The stemmed tokens joined by single spaces; an empty string when
        nothing survives the filtering.
    """
    resources = _text_prepare_resources()
    stopwords_set = resources['stopwords_set']

    text = text.lower()
    text = _REPLACE_BY_SPACE_RE.sub(' ', text)
    text = _BAD_SYMBOLS_RE.sub('', text)
    # Stopwords are removed before tokenization, on whitespace-split words.
    text = ' '.join(word for word in text.split() if word not in stopwords_set)

    tokens = resources['tokenizer'].tokenize(text.strip())
    stem = resources['stemmer'].stem
    return ' '.join(stem(token) for token in tokens)

In [4]:
sample_size = 200000

# Load the tagged posts, keep a reproducible random subset of `sample_size`
# rows, and replace the raw titles with their preprocessed form.
stackoverflow_df = (
    pd.read_csv('./data/tagged_posts.tsv', sep='\t')
    .sample(n=sample_size, random_state=0)
    .assign(title=lambda posts: posts['title'].apply(text_prepare))
)
stackoverflow_df.head()

Unnamed: 0,post_id,title,tag
2168983,43837842,effici algorithm compos valid express specif t...,python
1084095,15747223,basic thread program fail clang pass g++,c_cpp
1049020,15189594,link scroll top work,javascript
200466,3273927,possibl implement ping window phone 7,c#
1200249,17684551,glsl normal map issu,c_cpp


In [5]:
def prepare_file(df, out_):
    """Write titles and tags to `out_`, one example per line.

    Each line is the title with its spaces replaced by tabs, followed by a tab
    and ``__label__<tag>``, terminated by a carriage return plus newline.

    Args:
        df: table exposing 'title' and 'tag' columns of equal length
            (titles must be strings).
        out_: path of the file to create or overwrite.
    """
    # The context manager closes the file even if a row raises part-way through.
    with open(out_, 'w') as out:
        # zip() has no length, so pass the row count to get a real progress bar.
        rows = tqdm_notebook(zip(df['title'], df['tag']), total=len(df))
        for line, tag in rows:
            line = line.strip().replace(' ', '\t')
            out.write(line + '\t__label__{}\r\n'.format(tag))

In [6]:
# Export the preprocessed titles and their tags to the file that the
# `starspace train` cell below uses as its -trainFile.
prepare_file(stackoverflow_df, './data/tagged_posts_stemmed.tsv')




In [12]:
# Train StarSpace on the exported file (shell command; needs the `starspace`
# executable to be reachable from the notebook's shell).
# NOTE(review): `-model modelSave` is presumably what produces the
# `modelSave.tsv` read in the next cell, and `-dim 100` must stay in sync with
# the vector length assumed when the embeddings are parsed — confirm both.
!starspace train -trainFile ./data/tagged_posts_stemmed.tsv -model modelSave -trainMode 0 -minCount 2 -verbose 1 -dim 100 -fileFormat fastText -negSearchLimit 10 -lr 0.05

Arguments: 
lr: 0.05
dim: 100
epoch: 5
maxTrainTime: 8640000
saveEveryEpoch: 0
loss: hinge
margin: 0.05
similarity: cosine
maxNegSamples: 10
negSearchLimit: 10
thread: 10
minCount: 2
minCountLabel: 1
label: __label__
ngrams: 1
bucket: 2000000
adagrad: 1
trainMode: 0
fileFormat: fastText
normalizeText: 0
dropoutLHS: 0
dropoutRHS: 0
Start to initialize starspace model.
Build dict from input file : ./data/tagged_posts_stemmed.tsv
Read 1M words
Number of words in dictionary:  14925
Number of labels in dictionary: 10
Loading data from file : ./data/tagged_posts_stemmed.tsv
Total number of examples loaded : 199884
Initialized model weights. Model size :
matrix : 14935 100
Training epoch 0: 0.05 0.01
Epoch: 100.0%  lr: 0.040000  loss: 0.022949  eta: 0h1m  tot: 0h0m23s  (20.0%)1%  lr: 0.050000  loss: 0.113522  eta: <1min   tot: 0h0m0s  (0.6%)3.6%  lr: 0.049950  loss: 0.107749  eta: <1min   tot: 0h0m0s  (0.7%)4.0%  lr: 0.049899  loss: 0.102146  eta: <1min   tot: 0h0m0s  (0.8%)5.8%  lr: 0.049749

Epoch: 100.0%  lr: 0.020000  loss: 0.008753  eta: <1min   tot: 0h1m14s  (60.0%)8%  lr: 0.029899  loss: 0.009271  eta: <1min   tot: 0h0m49s  (40.4%)2.2%  lr: 0.029849  loss: 0.009315  eta: <1min   tot: 0h0m49s  (40.4%)4.9%  lr: 0.029548  loss: 0.008446  eta: <1min   tot: 0h0m50s  (41.0%)6.3%  lr: 0.029397  loss: 0.008311  eta: 0h1m  tot: 0h0m50s  (41.3%)8.6%  lr: 0.029246  loss: 0.008195  eta: 0h1m  tot: 0h0m51s  (41.7%)11.3%  lr: 0.028945  loss: 0.008255  eta: 0h1m  tot: 0h0m51s  (42.3%)11.7%  lr: 0.028844  loss: 0.008432  eta: 0h1m  tot: 0h0m51s  (42.3%)14.0%  lr: 0.028693  loss: 0.008481  eta: 0h1m  tot: 0h0m52s  (42.8%)15.8%  lr: 0.028593  loss: 0.008554  eta: 0h1m  tot: 0h0m52s  (43.2%)16.7%  lr: 0.028492  loss: 0.008574  eta: 0h1m  tot: 0h0m53s  (43.3%)17.6%  lr: 0.028442  loss: 0.008532  eta: 0h1m  tot: 0h0m53s  (43.5%)20.3%  lr: 0.028291  loss: 0.008545  eta: 0h1m  tot: 0h0m54s  (44.1%)20.7%  lr: 0.028291  loss: 0.008559  eta: 0h1m  tot: 0h0m54s  (44.1%)23.0%  lr: 0.027940  loss

Epoch: 100.0%  lr: -0.000000  loss: 0.006409  eta: <1min   tot: 0h2m1s  (100.0%)%  lr: 0.009950  loss: 0.007157  eta: <1min   tot: 0h1m38s  (80.4%)5.8%  lr: 0.009698  loss: 0.007062  eta: <1min   tot: 0h1m39s  (81.2%)7.7%  lr: 0.009598  loss: 0.007146  eta: <1min   tot: 0h1m40s  (81.5%)8.1%  lr: 0.009548  loss: 0.007114  eta: <1min   tot: 0h1m40s  (81.6%)12.6%  lr: 0.009146  loss: 0.007026  eta: <1min   tot: 0h1m41s  (82.5%)13.1%  lr: 0.009095  loss: 0.007048  eta: <1min   tot: 0h1m41s  (82.6%)15.8%  lr: 0.008894  loss: 0.006961  eta: <1min   tot: 0h1m42s  (83.2%)16.7%  lr: 0.008794  loss: 0.006955  eta: <1min   tot: 0h1m42s  (83.3%)17.6%  lr: 0.008643  loss: 0.007037  eta: <1min   tot: 0h1m42s  (83.5%)19.4%  lr: 0.008543  loss: 0.006901  eta: <1min   tot: 0h1m42s  (83.9%)20.3%  lr: 0.008442  loss: 0.006849  eta: <1min   tot: 0h1m43s  (84.1%)28.8%  lr: 0.007236  loss: 0.006975  eta: <1min   tot: 0h1m45s  (85.8%)34.7%  lr: 0.006784  loss: 0.006893  eta: <1min   tot: 0h1m46s  (86.9%)35.6

In [13]:
# Read the saved model as text; the context manager closes the handle as soon
# as the contents are in memory.
with open('modelSave.tsv', 'r') as fh:
    content = fh.read()
# Flatten the table into one list of fields: tabs become newlines, so every
# word/label and every vector component ends up as its own entry.
content = content.strip().replace('\t', '\n').split('\n')

In [46]:
# Number of components per vector; must match the `-dim` value used for training.
embedding_dim = 100

# `content` is a flat list of fields in which each record is one key (a word
# or a `__label__...` entry) followed by `embedding_dim` vector components.
# Walk it record by record instead of guessing keys by "does not parse as a
# float": that test silently dropped vocabulary words that look numeric
# (e.g. '7', '2010', 'nan', 'inf').
embeddings = dict()
position = 0
while position < len(content):
    key = content[position]
    if key == '':
        # Tolerate a stray empty field between records.
        position += 1
        continue
    if not key.startswith('__label__'):
        # Components are kept as the strings read from the file.
        embeddings[key] = content[position + 1:position + 1 + embedding_dim]
    # Jump over this record's vector (labels are skipped but still consumed).
    position += 1 + embedding_dim