In [1]:
import os
import pickle
import string

import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit

In [2]:
# Raw training dump, read without a header row so pandas names the columns 0, 1, 2, ...
# Later cells use column 1 as the topic label and column 2 as the text.
df_r = pd.read_csv("./raw_data/hollow_train.csv", header=None)

# Punctuation characters consulted by the text clean-up step.
punct = string.punctuation

In [3]:
# Peek at the first rows to check the column layout of the raw file.
df_r.head()

Unnamed: 0,0,1,2,3
0,18,1,SUPPORT COMPREHENSIVE SUPPORT Investment Accou...,2018-09-28 04:37:13
1,18,1,"""""""Property investment management software bui...",2018-09-28 04:37:26
2,18,1,Of late we have seen some tentative steps take...,2018-09-28 04:36:33
3,18,1,Create a Rental Property Analysis at Your Fing...,2018-09-28 04:34:46
4,18,1,Guide to Personal Investment Software The sele...,2018-09-28 04:38:59


### Encode the labels as integers from 0 to n_class-1

In [5]:
# Working frame built from the raw columns: column 2 is used as the text and
# column 1 as the original topic id.
df = pd.DataFrame({"extracted_content": df_r[2], "topic_id": df_r[1]})

# Sorted list of the distinct raw topic ids; a label's position in this list
# is its encoded class index (0 .. n_class-1).
label_encoder = list(np.unique(np.array(df_r[1])))

# Persist the mapping (presumably so downstream scripts can translate class
# indices back to topic ids). The context manager makes sure the file is
# flushed and closed instead of leaving the handle to the garbage collector.
with open("./raw_data/hollow_label_encoder", 'wb') as encoder_file:
    pickle.dump(label_encoder, encoder_file)

l = len(label_encoder)  # number of distinct classes
print(l)

5842


In [6]:
def encode_label(df, labels=None, punctuation=None):
    """Lower-case the text column and replace topic ids with class indices.

    The frame is modified in place and also returned, so both
    ``encode_label(df)`` and ``df = encode_label(df)`` work.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``extracted_content`` (strings) and
        ``topic_id`` (raw labels).
    labels : sequence, optional
        Ordered raw labels; the first position of a label in this sequence
        becomes its encoded value. Defaults to the notebook-level
        ``label_encoder``.
    punctuation : str, optional
        Whitespace-separated tokens that occur as a substring of this string
        are dropped. Defaults to the notebook-level ``punct``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with both columns rewritten.

    Raises
    ------
    ValueError
        If a ``topic_id`` value is not present in ``labels``.
    """
    if labels is None:
        labels = label_encoder
    if punctuation is None:
        punctuation = punct

    # One dict look-up per row instead of a linear ``list.index`` scan per row.
    # ``setdefault`` keeps the first position of a repeated label, exactly as
    # ``list.index`` would.
    label_to_index = {}
    for position, label in enumerate(labels):
        label_to_index.setdefault(label, position)

    def clean_text(text):
        # NOTE: ``token not in punctuation`` is a substring test on a string,
        # so a token is dropped only when it appears as a contiguous run inside
        # ``punctuation`` (e.g. "," or "()"), while e.g. "..." is kept. This is
        # left unchanged so the output matches data that was already processed.
        kept = (token.lower() for token in text.split() if token not in punctuation)
        return " ".join(kept)

    def to_index(label):
        try:
            return label_to_index[label]
        except KeyError:
            # Same exception type and message style as ``list.index``.
            raise ValueError("%r is not in list" % (label,))

    df['extracted_content'] = df['extracted_content'].apply(clean_text)
    df['topic_id'] = df['topic_id'].apply(to_index)
    return df

In [7]:
# encode_label rewrites both columns of df in place, so this cell is not
# idempotent: rebuild df from df_r before running it a second time.
df = encode_label(df)

In [8]:
# Sanity check: text should now be lower-cased and topic_id an integer index.
df.head()

Unnamed: 0,extracted_content,topic_id
0,support comprehensive support investment accou...,0
1,"""""""property investment management software bui...",0
2,of late we have seen some tentative steps take...,0
3,create a rental property analysis at your fing...,0
4,guide to personal investment software the sele...,0


### Split the dataset into train and validation sets

In [9]:
# Stratified shuffle split with 25% of the rows held out for validation,
# stratified on topic_id.
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.25, random_state=0)

# Only the LAST of the five generated partitions is used. n_splits is kept at 5
# on purpose: the splitter draws its partitions one after another from the same
# seeded generator, so lowering it would most likely select a different
# partition than the one this cell has always produced.
# Iterate over the index arrays only; the frames are sliced once afterwards
# instead of being built and thrown away on every iteration.
for train_index, test_index in sss.split(df['extracted_content'], df['topic_id']):
    pass

df_train = df.iloc[train_index]
df_val = df.iloc[test_index]

# Every row must land in exactly one of the two subsets.
assert len(df_train) + len(df_val) == len(df)

In [11]:
# Preview the training subset together with its (rows, columns) shape.
df_train.head(),df_train.shape

(                                        extracted_content  topic_id
 291418  according to a new market intelligence report ...      5423
 249896  alstom is a player in substation automations s...      4636
 191857  fedex supports young entrepreneurship a team o...      3564
 3355    material handling from logistics centers to wa...        61
 164672  """it is known as china’s silicon valley and i...      3061,
 (235150, 2))

In [12]:
# Preview the validation subset together with its (rows, columns) shape.
df_val.head(),df_val.shape

(                                        extracted_content  topic_id
 289270  gilat’s skyedge ii-c uses hts resources to del...      5382
 66747   this new initiative takes advantage of the gro...      1231
 297123  the 7 simple rules of using a clarisonic clean...      5531
 234384  as any business owner or senior hr executive w...      4339
 37563   business consulting services the business cons...       693,
 (78384, 2))

### Save the datasets

In [13]:
# to_csv does not create missing parent directories, so make sure the output
# folder exists (a fresh checkout may not contain it).
os.makedirs("./processed_data", exist_ok=True)

# index=False: the shuffled row labels are not written to the files.
df_train.to_csv("./processed_data/df_train.csv",index=False)
df_val.to_csv("./processed_data/df_val.csv",index=False)

### Call define_process_hollow.py for further processing: mapping tokens to indices

In [None]:
# Runs the project-local preprocessing script in a subprocess (script not shown
# in this notebook; presumably it reads the CSVs written to ./processed_data).
!python define_process_hollow.py

# How to train the attention model

In [4]:
# Print the command-line options accepted by the training script.
!python train_attention.py --help

usage: train_attention.py [-h] [--max_len MAX_LEN] [--bs BS]
                          [--n_vocab N_VOCAB] [--lr LR] [--n_epoch N_EPOCH]
                          [--do_train] [--do_eval] [--source_path SOURCE_PATH]
                          [--train_data TRAIN_DATA] [--val_data VAL_DATA]
                          [--token_id TOKEN_ID] [--emb_dim EMB_DIM]
                          [--save_path SAVE_PATH] [--model_load MODEL_LOAD]
                          --model_save MODEL_SAVE

optional arguments:
  -h, --help            show this help message and exit
  --max_len MAX_LEN     Maximum length of the sentence. default=500
  --bs BS               Batch size. default=32
  --n_vocab N_VOCAB     Vocabulary size. default=50000
  --lr LR               Learning rate. default=5e-4
  --n_epoch N_EPOCH     Number of epochs. default=30
  --do_train            Whether to run training. default=False
  --do_eval             Whether to run eval on the dev set. default=False
  --source

## Model evaluation

In [7]:
# Evaluation only (--do_eval): loads the previously saved model "model_att5"
# and reports top-1/top-5/top-10 accuracy.
!python train_attention.py --do_eval --model_load "model_att5"

begin
number of classes 5842
model loaded
training begun
Attention(
  (embedding): Embedding(50002, 200, padding_idx=0)
  (gru): GRU(200, 50, batch_first=True, bidirectional=True)
  (sm): Softmax()
  (lin_bias): Linear(in_features=100, out_features=100, bias=True)
  (dp): Dropout(p=0.6)
  (linear1): Linear(in_features=100, out_features=500, bias=True)
  (linear2): Linear(in_features=500, out_features=1000, bias=True)
  (linear_out): Linear(in_features=1000, out_features=5842, bias=True)
)
time taken= 106.20869135856628
top1_acc 69.29092671973872
top5_acc 83.40732802612779
top10_acc 86.71284956113493


## Model training

In [10]:
# Short demonstration run: train (--do_train) for 2 epochs instead of the
# default 30 and save the model under the name "sample".
!python train_attention.py --do_train --model_save "sample" --n_epoch 2

begin
number of classes 5842
model defined
training begun
Attention(
  (embedding): Embedding(50002, 200, padding_idx=0)
  (gru): GRU(200, 50, batch_first=True, bidirectional=True)
  (sm): Softmax()
  (lin_bias): Linear(in_features=100, out_features=100, bias=True)
  (dp): Dropout(p=0.6)
  (linear1): Linear(in_features=100, out_features=500, bias=True)
  (linear2): Linear(in_features=500, out_features=1000, bias=True)
  (linear_out): Linear(in_features=1000, out_features=5842, bias=True)
)
epoch 0 train loss = 8.213029274247997, accurancy = 0.004167552625983415 time = 468.70602774620056
epoch 0 validate loss = 6.638071433552238, accurancy = 0.05270208205756277 time = 104.84463810920715
epoch 1 train loss = 5.933590058394247, accurancy = 0.08071018498830533 time = 503.0820233821869
epoch 1 validate loss = 4.1879187500411135, accurancy = 0.3160849152888345 time = 108.19427800178528
Training completed. Best accuracy is 0.3160849152888345
