In [109]:
!unzip "../input/home-depot-product-search-relevance/train.csv.zip"
!unzip "../input/home-depot-product-search-relevance/test.csv.zip"
!unzip "../input/home-depot-product-search-relevance/product_descriptions.csv.zip"

<IPython.core.display.Javascript object>

In [2]:
import pandas as pd

# Read the three extracted CSVs. train/test are decoded as ISO-8859-1;
# the product descriptions use read_csv's default encoding.
home_df = pd.read_csv("./train.csv", encoding="ISO-8859-1")
home_test_df = pd.read_csv("./test.csv", encoding="ISO-8859-1")
home_product_desc = pd.read_csv("./product_descriptions.csv")

<IPython.core.display.Javascript object>

In [None]:
# Display the frame read from train.csv.
home_df

In [None]:
# Stack the train rows on top of the test rows and renumber the index from 0.
home_df = pd.concat(
    [home_df, home_test_df],
    axis=0,
    ignore_index=True,
)

In [None]:
# Attach the description of each product_uid; the left join keeps every row of home_df.
home_df = home_df.merge(
    home_product_desc,
    how="left",
    on="product_uid",
)

In [None]:
# Text shown to the model for a product: title, one space, then the description.
home_df["product_info"] = (
    home_df["product_title"]
    + " "
    + home_df["product_description"]
)

In [None]:
import numpy as np

# Rows with a missing (NaN) relevance receive the sentinel value -1.
home_df["relevance"] = home_df["relevance"].replace(np.nan, -1)

In [None]:
# Display the combined frame after the merge and the relevance fill.
home_df

In [None]:
# Rows carrying the -1 sentinel relevance are the ones to predict.
test_home_df = home_df.loc[home_df["relevance"].eq(-1)]

In [None]:
# Rows with a positive relevance are the labelled training examples.
train_home_df = home_df.loc[home_df["relevance"].gt(0)]

In [None]:
# product_info already holds title + description; drop those columns and the product id.
train_home_df = train_home_df.drop(
    columns=["product_uid", "product_title", "product_description"],
)

In [None]:
# Distinct relevance values present in the training rows.
train_home_df['relevance'].unique()

In [None]:
!pip install tez

In [None]:
!pip install transformers

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from transformers import BertModel,BertTokenizer,get_linear_schedule_with_warmup,AdamW,AutoTokenizer,AutoModel
import tez
from torch.utils.data import Dataset, DataLoader
from tez.datasets import GenericDataset
from tez import Model
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F
from sklearn import metrics,model_selection,preprocessing
import torchvision
import os
import sys

In [None]:
# Hold out 26% of the labelled rows for validation, stratified on the relevance value.
train_home_df, valid_home_df = train_test_split(
    train_home_df,
    test_size=0.26,
    stratify=train_home_df["relevance"].to_numpy(),
    random_state=42,
)

In [None]:
# Discard the old row labels so both splits are indexed 0..n-1.
train_home_df, valid_home_df = (
    split.reset_index(drop=True) for split in (train_home_df, valid_home_df)
)

In [None]:
class home_depot_Dataset(Dataset):
    """Tokenizes (search term, product text) pairs and pairs them with a relevance target.

    Args:
        df: frame exposing ``search_term``, ``product_info`` and ``relevance`` columns.
        tokenizer: object providing ``encode_plus`` (e.g. a Hugging Face tokenizer).
        max_len: length each encoded pair is padded or truncated to.
    """

    def __init__(self,df,tokenizer,max_len = 256):
        self.search_term = df.search_term.values
        self.product_info = df.product_info.values
        self.target = df.relevance.values
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of examples (one per row of the source frame)."""
        return len(self.target)

    @staticmethod
    def _collapse_whitespace(text):
        """Stringify ``text`` and squash every run of whitespace into a single space."""
        return ' '.join(str(text).split())

    def __getitem__(self,item):
        """Return the encoded pair and float target for row ``item`` as a dict of tensors."""
        search_term = self._collapse_whitespace(self.search_term[item])
        product_info = self._collapse_whitespace(self.product_info[item])

        inputs = self.tokenizer.encode_plus(search_term,
                                            product_info,
                                            add_special_tokens=True,
                                            max_length=self.max_len,
                                            padding="max_length",
                                            return_token_type_ids=True,
                                            return_tensors="pt",truncation=True)

        # With return_tensors="pt" the tensors carry a leading batch axis of size 1.
        # Remove only that axis: a bare squeeze() would also drop the sequence axis
        # when max_len == 1 and hand the DataLoader 0-d tensors.
        return {
            "input_ids" : inputs["input_ids"].squeeze(0),
            "token_type_ids" : inputs["token_type_ids"].squeeze(0),
            "attention_mask" : inputs["attention_mask"].squeeze(0),
            "targets" : torch.tensor(self.target[item], dtype = torch.float)
        }

In [None]:
# Tokenizer loaded from the "roberta-base" checkpoint name.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

In [None]:
# Wrap both splits in the tokenizing dataset (max_len left at its default).
train_dataset = home_depot_Dataset(df=train_home_df, tokenizer=tokenizer)
valid_dataset = home_depot_Dataset(df=valid_home_df, tokenizer=tokenizer)

In [None]:
# Sanity check: fetch one encoded training example and decode its token ids back to text.
data = train_dataset[1]
tokenizer.decode(data["input_ids"])

In [None]:
class home_depot__Model(Model):
    """RoBERTa encoder followed by dropout and a single-unit linear head.

    ``forward`` returns ``(output, loss, metrics)``; the loss is MSE against the
    targets and the metric is the R^2 of the raw predictions.
    """

    def __init__(self):
        super().__init__()
        self.base_model =  AutoModel.from_pretrained("roberta-base")
        self.dropout = nn.Dropout(0.3)
        self.out = nn.Linear(768,1)
        # NOTE(review): the scheduler is stepped once per epoch, while fetch_scheduler
        # sizes the schedule as len(self.train_loader) steps -- confirm the intended horizon.
        self.step_scheduler_after = "epoch"

    def monitor_metrics(self, outputs,targets):
        """Return ``{"r2_score": ...}`` for one batch, or ``{}`` when there are no targets."""
        if targets is None:
            return {}
        # The head is trained as a regressor (MSE), so score the raw predictions.
        # Passing them through sigmoid and a 0.5 threshold would feed booleans to r2_score.
        predictions = outputs.detach().float().cpu().numpy().reshape(-1)
        truth = targets.detach().float().cpu().numpy().reshape(-1)
        return {"r2_score": metrics.r2_score(truth, predictions)}

    def fetch_optimizer(self):
        """Build AdamW over every parameter of the module, without weight decay on bias/LayerNorm."""
        # Use the whole module, not just self.base_model: otherwise the regression
        # head (self.out) is absent from the optimizer and never updated.
        named_params = list(self.named_parameters())
        no_decay = ["bias","LayerNorm.bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in named_params if not any(nd in n for nd in no_decay)],
                "weight_decay": 0.001,
            },
            {
                "params": [p for n, p in named_params if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        return AdamW(optimizer_grouped_parameters, lr=3e-5)

    def fetch_scheduler(self):
        """Linear decay schedule without warmup, spanning ``len(self.train_loader)`` steps."""
        scheduler = get_linear_schedule_with_warmup(
            self.optimizer, num_warmup_steps=0, num_training_steps=len(self.train_loader)
        )
        return scheduler

    def forward(self,input_ids,token_type_ids = None,attention_mask = None,targets = None):
        """Encode the batch and regress one value per example.

        Returns:
            ``(output, loss, metrics)`` when ``targets`` is given, else ``(output, None, None)``.
        """
        encoder_out = self.base_model(input_ids=input_ids,
                                      attention_mask=attention_mask,
                                      token_type_ids=token_type_ids)
        # Index instead of tuple-unpacking: this works for a plain tuple return and for
        # a ModelOutput, whose iteration yields its string keys rather than tensors.
        pooled = encoder_out[1]
        output = self.out(self.dropout(pooled))
        if targets is not None:
            loss = nn.MSELoss()(output, targets.view(-1,1))
            batch_metrics = self.monitor_metrics(output, targets)
            return output, loss, batch_metrics

        return output, None, None

In [None]:
# Instantiate the model class defined in the previous cell.
model = home_depot__Model()

In [None]:
from tez.callbacks import EarlyStopping

# Early stopping on valid_loss (lower is better) with a patience of 3;
# model.bin is the path handed to the callback for its checkpoint.
es = EarlyStopping(monitor="valid_loss",mode= "min" ,model_path="model.bin",patience=3)

model.fit(
    train_dataset=train_dataset,
    valid_dataset=valid_dataset,
    train_bs=16,
    valid_bs=8,
    device="cuda",
    epochs=5,
    callbacks= [es],
    fp16=True

)
# Save the final-epoch weights under their own name. Writing them to model.bin
# would overwrite the checkpoint file owned by the EarlyStopping callback.
model.save("model_last.bin")

In [None]:
# Same tokenizing wrapper for the rows to predict (their relevance is the -1 sentinel).
test_dataset = home_depot_Dataset(df=test_home_df, tokenizer=tokenizer)

In [None]:
# Pull one training example to inspect.
data = train_dataset[102]
# NOTE(review): this is not the cell's last expression, so the decoded text is not rendered.
tokenizer.decode(data["input_ids"])
# Target stored for this example (the value the cell displays).
data["targets"]

In [None]:
def predict_sentence(data,model):
    """Run ``model`` on a single dataset item and return its scalar prediction.

    Args:
        data: mapping with ``input_ids``, ``token_type_ids``, ``attention_mask`` and
            ``targets`` tensors for one example; a batch axis is added here.
        model: callable returning ``(output, loss, metrics)``.

    Returns:
        ``output[0][0]``, the first element of the first row of the model output.
    """
    # Put the inputs wherever the model's weights live (training may have moved it to a GPU).
    try:
        device = next(model.parameters()).device
    except (AttributeError, StopIteration):
        device = None  # plain callable or parameter-free module: leave tensors where they are

    def _as_batch(key):
        batched = data[key].unsqueeze(0)
        return batched if device is None else batched.to(device)

    # Inference only: switch off dropout and gradient tracking, then restore the mode.
    was_training = getattr(model, "training", False)
    if was_training:
        model.eval()
    try:
        with torch.no_grad():
            output,loss,metr = model(_as_batch("input_ids"),
                                     attention_mask=_as_batch("attention_mask"),
                                     token_type_ids=_as_batch("token_type_ids"),
                                     targets=_as_batch("targets"))
    finally:
        if was_training:
            model.train()

    print(output)
    # The original returned `out[0][0]`; `out` is undefined here and raised NameError.
    return output[0][0]

In [None]:
# Try the helper on the example currently held in `data`.
predict_sentence(data,model)

In [None]:
# Accumulate test-set predictions over the prediction rounds (currently one round).
final_preds = None
for j in range(1):
    preds = model.predict(test_dataset, batch_size=256, n_jobs=-1, device="cuda")
    # Gather every batch first and stack once; calling np.vstack inside the
    # batch loop re-copies the growing array on every iteration.
    batch_preds = list(preds)
    if not batch_preds:
        temp_preds = None
    elif len(batch_preds) == 1:
        temp_preds = batch_preds[0]
    else:
        temp_preds = np.vstack(batch_preds)
    if final_preds is None:
        final_preds = temp_preds
    else:
        final_preds += temp_preds