# Introduction to Data
We first load the training data and drop the two unique-ID columns (`uuid` and `uuid_1`).

In [2]:
%load_ext autoreload
%autoreload 2
import pandas as pd
# Read the training CSV and discard the two unique-ID columns in a single chained step,
# then preview the first rows.
df = pd.read_csv('data/train.csv').drop(columns=["uuid", "uuid_1"])
df.head()

Unnamed: 0,name,short_description,category_list,category_groups_list,description
0,Toolex Machine,"Manufacturing & supplying Lathe, Boring,","Information Technology,Manufacturing","Information Technology,Manufacturing",Toolex Machine Tools Pvt Ltd Commenced in the ...
1,Perish the Thought Golf,Perish The Thought Golf is a company that deve...,Sports,Sports,Perish The Thought Golf is a company that deve...
2,Sell One Thing,A One Page Product Checkout Page,E-Commerce,Commerce and Shopping,Sell One Thing helps people sell stuff online....
3,Citeulike,Citeulike is a free online service to organize...,"Education,Internet","Education,Internet Services",Citeulike is a free online service to organize...
4,Juick,Juick is a microblogging website dedicated to ...,"Blogging Platforms,Information Technology,Mess...","Content and Publishing,Information Technology,...",IM-based social network and microblogging serv...


In [31]:
# Scratch check: print the integers 0 through 8, one per line.
for i in range(9):
    print(i)

0
1
2
3
4
5
6
7
8


In [28]:
# Print the first two rows one at a time, each as its own single-row DataFrame.
for i in range(2):
    print(df.iloc[i:i + 1])

             name                         short_description  \
0  Toolex Machine  Manufacturing & supplying Lathe, Boring,   

                          category_list                  category_groups_list  \
0  Information Technology,Manufacturing  Information Technology,Manufacturing   

                                         description  
0  Toolex Machine Tools Pvt Ltd Commenced in the ...  
                      name                                  short_description  \
1  Perish the Thought Golf  Perish The Thought Golf is a company that deve...   

  category_list category_groups_list  \
1        Sports               Sports   

                                         description  
1  Perish The Thought Golf is a company that deve...  


In [30]:
# Length of the description stored at row position 655217.
len(df['description'].iloc[655217])


140

In [33]:
# Company name stored at row position 655217.
df['name'].iloc[655217]


'Avenir Telecom'

In [2]:
# Coerce every short description to a Python string (the column is later fed to the
# tokenizer), then display the resulting values.
df['short_description'] = df['short_description'].astype(str)
df['short_description'].values

array(['Manufacturing & supplying Lathe, Boring,',
       'Perish The Thought Golf is a company that develops sports oriented applications and softwares for iPhon',
       'A One Page Product Checkout Page', ...,
       'Randstad is a global solutions provider in staffing and recruitment consultancy and placing talent in permanent and temporary job positions.',
       'Freemarket. The smaller company operates an online content marketplace.',
       'Avenir Telecom is a leading force within the mobile phone industry in Europe and offers 25 years of expertise & experience to its customers.'],
      dtype=object)

In [53]:
# NOTE(review): wildcard import — every public name of Data.config_data lands in the
# notebook namespace, while only `config_data` is used in the visible cells.
# Consider an explicit import once the module's exports are confirmed.
from Data.config_data import *

# Build `data` through the project helper. The cells below index it as data[i][1] and
# data[-1], so it is presumably a list of [name, text] pairs — TODO confirm.
data = config_data().run()

In [51]:
# Gather the second field of every record in `data`, then show the last one collected.
sentence = [data[idx][1] for idx in range(len(data))]
sentence[-1]

'ASSIA, Inc provides broadband and Wi-Fi performance monitoring, management and optimization software to internet service providers (ISP), communications regulators, and wholesalers so they can improve'

In [52]:
# Display the final record of `data`.
data[-1]

['ASSIA',
 'ASSIA, Inc provides broadband and Wi-Fi performance monitoring, management and optimization software to internet service providers (ISP), communications regulators, and wholesalers so they can improve']

## Labels

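Let's look at how many companies fall under each distinct value of `category_groups_list`. Note that every unique comma-separated combination of category groups is counted as its own value.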

In [4]:
# Count how many rows carry each distinct category_groups_list string.
df['category_groups_list'].value_counts()

Health Care                                                                                          18855
Financial Services                                                                                   15815
Software                                                                                             12806
Real Estate                                                                                          12166
Financial Services,Lending and Investments                                                           10576
                                                                                                     ...  
Apps,Commerce and Shopping,Community and Lifestyle,Events,Media and Entertainment,Software               1
Data and Analytics,Education,Gaming,Mobile                                                               1
Apps,Data and Analytics,Design,Education,Information Technology,Mobile,Software                          1
Commerce and Shopping,Consumer Goods,

## Data Preparation
### Encoding the labels

In [4]:
# Give each distinct category_groups_list string an integer id (numbered in the order
# the strings are returned by unique()) and store every row's id in a "label" column.
# NOTE(review): each comma-joined *combination* of groups becomes its own class, so the
# label space is very large and many classes have a single example — confirm that a
# single-label formulation is really intended here.
labels = df['category_groups_list'].unique()
label_dict = {label: idx for idx, label in enumerate(labels)}
_col = df['category_groups_list'].apply(lambda x: label_dict.get(x, x))
df["label"] = _col

### Train and validation split

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the row indices (and their labels) for validation; the fixed
# random_state keeps the split repeatable between runs.
x_train, x_val, y_train, y_val = train_test_split(
    df.index.values,
    df.label.values,
    test_size=0.15,
    random_state=42,
)

In [11]:
# Inspect the index values that are handed to train_test_split as the items to split.
df.index.values

array([     0,      1,      2, ..., 655215, 655216, 655217], dtype=int64)

In [6]:
# Tag every row with the split it landed in; 'not_set' would remain only on rows whose
# index appears in neither x_train nor x_val.
df['data_type'] = 'not_set'
df.loc[x_train, 'data_type'] = 'train'
df.loc[x_val, 'data_type'] = 'val'

In [7]:
# Per (category_groups_list, label, data_type) group, count the non-null entries of each
# remaining column — a quick view of how each class is spread over train and val.
df.groupby(['category_groups_list', 'label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,name,short_description,category_list,description
category_groups_list,label,data_type,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Administrative Services,256,train,999,999,999,999
Administrative Services,256,val,203,203,203,203
"Administrative Services,Advertising,Agriculture and Farming,Consumer Electronics,Energy,Events,Media and Entertainment,Natural Resources,Professional Services,Real Estate,Sales and Marketing,Sustainability,Transportation",47575,train,1,1,1,1
"Administrative Services,Advertising,Apps,Artificial Intelligence,Data and Analytics,Health Care,Mobile,Sales and Marketing,Science and Engineering,Software",46443,train,1,1,1,1
"Administrative Services,Advertising,Apps,Community and Lifestyle,Information Technology,Mobile,Sales and Marketing,Software",49246,train,1,1,1,1
...,...,...,...,...,...,...
Transportation,121,val,1029,1029,1029,1029
"Transportation,Travel and Tourism",1535,train,408,408,408,408
"Transportation,Travel and Tourism",1535,val,63,63,63,63
Travel and Tourism,8,train,4040,4040,4040,4040


# Import packages to do analysis

In [8]:
%load_ext autoreload
%autoreload 2

#from Model.model import *


In [9]:
import torch
from tqdm.notebook import tqdm
from transformers import BertTokenizer
from transformers import BertTokenizer
from torch.utils.data import TensorDataset
from transformers import BertForSequenceClassification
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW, get_linear_schedule_with_warmup

from Settings import settings
# Load the pretrained tokenizer that matches the bert-base-uncased checkpoint;
# input text is lower-cased before tokenisation.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

In [10]:
def _encode_descriptions(split):
    """Tokenise the short descriptions of one data split.

    Parameters
    ----------
    split : str
        Value of the ``data_type`` column to select ('train' or 'val').

    Returns
    -------
    The tokenizer's batch encoding with PyTorch tensors, including the
    'input_ids' and 'attention_mask' entries used by the following cells.
    """
    texts = df.loc[df['data_type'] == split, 'short_description'].tolist()
    return tokenizer.batch_encode_plus(
        texts,
        add_special_tokens=True,
        return_attention_mask=True,
        # `pad_to_max_length=True` is deprecated. 'longest' pads every sequence to the
        # longest one in this call, which is what the old flag did when no max_length
        # was supplied.
        padding='longest',
        # Clip inputs that exceed the model's maximum length instead of letting an
        # over-long description fail later in the forward pass.
        truncation=True,
        return_tensors='pt',
    )


encoded_data_train = _encode_descriptions('train')
encoded_data_val = _encode_descriptions('val')



In [12]:
# Bundle token ids, attention masks and integer labels of each split into a
# TensorDataset; the label rows are selected with the same data_type filter that was
# used when the texts were encoded, so the three tensors line up row by row.
input_ids_train, attention_masks_train = (
    encoded_data_train[key] for key in ('input_ids', 'attention_mask')
)
labels_train = torch.tensor(df.loc[df['data_type'] == 'train', 'label'].to_numpy())

input_ids_val, attention_masks_val = (
    encoded_data_val[key] for key in ('input_ids', 'attention_mask')
)
labels_val = torch.tensor(df.loc[df['data_type'] == 'val', 'label'].to_numpy())

dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

In [100]:
# Number of examples in the training and validation datasets.
len(dataset_train), len(dataset_val)

(556935, 98283)

In [13]:
# Pretrained BERT encoder with a fresh classification head sized to one output per
# entry of label_dict; attention maps and hidden states are not returned.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_dict),
    output_attentions=False,
    output_hidden_states=False,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [14]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# NOTE(review): the training cell at the end of this notebook ran on CPU and stopped
# with an out-of-memory error; a smaller batch size may be needed on that hardware.
batch_size = 100

# Training batches are drawn in random order; validation keeps the dataset order.
dataloader_train = DataLoader(
    dataset_train,
    batch_size=batch_size,
    sampler=RandomSampler(dataset_train),
)

dataloader_validation = DataLoader(
    dataset_val,
    batch_size=batch_size,
    sampler=SequentialSampler(dataset_val),
)

In [15]:
from transformers import AdamW, get_linear_schedule_with_warmup

# AdamW over all model parameters with a small fine-tuning learning rate.
# NOTE(review): the AdamW class exported by transformers is deprecated in newer
# releases in favour of torch.optim.AdamW — check the installed version before upgrading.
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)



In [16]:
epochs = 5

# Linear learning-rate schedule with no warm-up phase, spanning every optimisation
# step of the run (batches per epoch times number of epochs).
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=epochs * len(dataloader_train),
)

In [17]:
# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU,
# then move the model's parameters onto the chosen device.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

print(device)

cpu


In [21]:
import random
import numpy as np
# Seed Python's, NumPy's and PyTorch's (CPU and all-GPU) random generators with the
# same value.
# NOTE(review): this cell runs after the model has been created, so the random
# initialisation of the classification head is not covered by this seed.
seed_val = 17
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(seed_val)

In [22]:
def evaluate(dataloader_val):
    """Run the global ``model`` over a dataloader without gradient tracking.

    Each batch is expected to hold (input_ids, attention_mask, labels) tensors in
    that order; they are moved to the global ``device`` before the forward pass.

    Returns a tuple ``(mean_loss, logits, labels)``: the loss averaged over the
    number of batches, and the per-batch logits and label ids concatenated along
    the first axis as NumPy arrays.
    """
    model.eval()

    running_loss = 0
    collected_logits = []
    collected_labels = []

    for batch in dataloader_val:
        on_device = tuple(tensor.to(device) for tensor in batch)
        input_ids, attention_mask, labels = on_device[0], on_device[1], on_device[2]

        with torch.no_grad():
            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            labels=labels)

        # outputs[0] is the batch loss, outputs[1] the raw class scores.
        running_loss += outputs[0].item()
        collected_logits.append(outputs[1].detach().cpu().numpy())
        collected_labels.append(labels.cpu().numpy())

    mean_loss = running_loss / len(dataloader_val)

    all_logits = np.concatenate(collected_logits, axis=0)
    all_labels = np.concatenate(collected_labels, axis=0)

    return mean_loss, all_logits, all_labels

In [23]:
import os

# Create the checkpoint folder up front so torch.save cannot fail on a missing
# directory after a whole epoch of compute has already been spent.
os.makedirs('data_volume', exist_ok=True)

# Fine-tuning loop: one pass over dataloader_train per epoch, a checkpoint after each
# epoch, then the average training loss and the validation loss are reported.
for epoch in tqdm(range(1, epochs+1)):
    
    model.train()
    
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        # Clear gradients left over from the previous step.
        model.zero_grad()
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }       

        outputs = model(**inputs)
        
        # With labels supplied, the first output is the loss for this batch.
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        # Cap the global gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        
        # Show the batch loss as-is: it is already a mean over the examples, and
        # len(batch) is the number of tensors in the tuple (3), not the batch size,
        # so dividing by it under-reported the loss.
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item())})
         
        
    torch.save(model.state_dict(), f'data_volume/finetuned_BERT_epoch_{epoch}.model')
        
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)            
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    tqdm.write(f'Validation loss: {val_loss}')

    

  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/5570 [00:00<?, ?it/s]

RuntimeError: [enforce fail at ..\c10\core\CPUAllocator.cpp:76] data. DefaultCPUAllocator: not enough memory: you tried to allocate 56217600 bytes.