In [1]:
import requests
import tqdm
import pandas as pd
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification,Trainer, TrainingArguments,BertForSequenceClassification,BertTokenizerFast, GPT2TokenizerFast,GPT2Tokenizer,GPT2LMHeadModel

2023-06-18 10:06:57.605361: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


In [2]:
# Link for dataset download.
# https://github.com/commonsense/conceptnet5/wiki/Downloads

# DATA IS TO HEAVY WE SHOULD LOAD IT OURSELVES

In [3]:
# !wget https://s3.amazonaws.com/conceptnet/downloads/2019/edges/conceptnet-assertions-5.7.0.csv.gz

In [4]:
# !gzip -d 'conceptnet-assertions-5.7.0.csv.gz'

In [5]:
# s=0
# with open('conceptnet-assertions-5.7.0.csv') as f:
#     for line in f:
#         s+=1
#     if s%1000000==0:
#         print(s)


In [6]:
def read_english(cnt,nrows=1000000,types=['RelatedTo','IsA','ObstructedBy',
    'HasProperty','HasPrerequisite','Causes','UsedFor','HasA','MadeOf',
    'CreatedBy','AtLocation']):
    data=pd.read_csv('conceptnet-assertions-5.7.0.csv',sep='\t', \
    skiprows=cnt*nrows,nrows=nrows,names=[0,1,2,3,4])

    data=data[(data[2].apply(lambda x: '/c/en/' in x)) & \
     (data[3].apply(lambda x: '/c/en/' in x))]

    data['t']=False

    for t in types:
            data['t']=(data[1].apply(lambda x:t in x)) | data['t']

    data=data[data['t']]
    data=data.drop(['t'], axis=1)
    if data.shape[0]>0:
        print(data.shape[0])
    data=data[[1,2,3]]
    data[1]=data[1].apply(lambda x:x.replace('/r/',''))
    data[2]=data[2].apply(lambda x:x.replace('/c/en/',''))
    data[3]=data[3].apply(lambda x:x.replace('/c/en/',''))
    data[2]=data[2].apply(lambda x:x.lower())
    data[3]=data[3].apply(lambda x:x.lower())
    return data
def read_all():
    dfs=[]
    for cnt in tqdm.tqdm(range(35)):
        dfs.append(read_english(cnt))
    dfs=pd.concat(dfs,axis=0)
    dfs=dfs.sort_values(by=[2])
    dfs=dfs.reset_index(drop=True)
    return dfs

In [7]:
data=read_all()

100%|██████████| 35/35 [09:50<00:00, 16.89s/it]


49549
32075
5545
31143
231009
241815
950692
511075
17524
22266


In [8]:
max_length=5
tokenizer = GPT2TokenizerFast.from_pretrained('gpt2',max_length=max_length,add_prefix_space=True)
vocab = tokenizer.get_vocab()
for token in data[1].unique():
  tokenizer.add_tokens([f"<{token}>"])
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
num_added_tokens=data[1].unique().shape[0]+1
v_max=len(tokenizer)
print(v_max)

50271


In [9]:
def find_in_db(word):
    left=data[2].searchsorted(word,'left')
    right=data[2].searchsorted(word,'right')
    sel=data.iloc[left:right]
    return sel

In [10]:
def find_in_db_2(word):
    sel=find_in_db(word)
    sel=sel[sel[3].apply(lambda x: ('Ġ'+x in vocab))]
    return sel

In [11]:
df=[]
for word in tqdm.tqdm(vocab.keys()):
    df.append(find_in_db_2(word))
df=pd.concat(df,axis=0)

100%|██████████| 50257/50257 [00:41<00:00, 1199.49it/s]


In [12]:
df

Unnamed: 0,1,2,3
934432,RelatedTo,info,information
934433,EtymologicallyRelatedTo,info,information
115595,RelatedTo,at,rate
115596,RelatedTo,at,regarding
115597,RelatedTo,at,shift
...,...,...,...
1561599,RelatedTo,represent,gift
1561600,RelatedTo,represent,behalf
1561601,RelatedTo,represent,again
1561603,RelatedTo,represent,act


In [13]:
df = df.sample(frac=1)
train=df.iloc[:int(df.shape[0]*0.8)]
test=df.iloc[int(df.shape[0]*0.8):]

In [14]:
class KnowledgeDataset(Dataset):
    def __init__(self, df):
        self.df = df
    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row=self.df.iloc[idx]
        input_=row[2]+f" <{row[1]}> "+row[3]
        input_ids=torch.tensor(tokenizer.encode(input_,padding='max_length',max_length=max_length,truncation=True))
        # mask_s=torch.where(input_ids>=v_max-num_added_tokens,1,0).nonzero()[0].item()
        mask1=torch.where(input_ids!=v_max-1,1,0)
        # l=int(mask1.sum().item())
        # mask2=torch.tensor([float(i>=mask_s) for i in range(max_length)])
        # attention_mask=mask1*mask2
        # attention_mask[l-1]=0
        return {'input_ids':input_ids,
         'attention_mask': mask1}




In [15]:
train_p=KnowledgeDataset(train)
test_p=KnowledgeDataset(test)

In [16]:
train_p[1]

{'input_ids': tensor([ 4656,   220, 50259,  6427, 50270]),
 'attention_mask': tensor([1, 1, 1, 1, 0])}

In [17]:
def create_mini_batch(samples):
    input_ids = [s['input_ids'] for s in samples]
    attention_mask = [(s['attention_mask']) for s in samples]
    l=max_length
    input_ids=torch.stack(input_ids)[:,:l]
    attention_mask=torch.stack(attention_mask)[:,:l]

    return {'input_ids':input_ids, 'attention_mask':attention_mask}

In [18]:
batch_size_gpt2 = 128
train_loader = DataLoader(train_p, batch_size=batch_size_gpt2, shuffle=True,collate_fn=create_mini_batch)
test_loader = DataLoader(test_p, batch_size=batch_size_gpt2,collate_fn=create_mini_batch)

In [19]:
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.resize_token_embeddings(len(tokenizer))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device='cpu'
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50271, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

In [67]:
def add_response_full(input_,num=8 ):
    input_ids=torch.tensor(tokenizer.encode(input_,padding='max_length',max_length=max_length,truncation=True)).to(device)
    # mask_s=torch.where(input_ids>=v_max-num_added_tokens,1,0).nonzero()[0].item()
    mask1=torch.where(input_ids!=v_max-1,1,0).to(device)
    # l=int(mask1.sum().item())
    # mask2=torch.tensor([int(i>=mask_s) for i in range(max_length)]).to(device)
    # attention_mask=mask1*mask2
    # attention_mask[l-1]=0
    # attention_mask=attention_mask.to(device)
    outputs = model(input_ids=input_ids, attention_mask=mask1)
    outputs= outputs.logits[mask1.sum().item()-1]
    sorted, indices = torch.sort(outputs,descending=True)
    choice=np.random.choice(np.arange(0,num), p=F.softmax(sorted[:num].type(torch.FloatTensor), dim=-1).detach().numpy())
    print(tokenizer.convert_ids_to_tokens(torch.topk(outputs, num)[1]))
    # print(choice)

    outputs=tokenizer.convert_ids_to_tokens([indices[choice]])[0]
    new_token=outputs.replace('Ġ','')
    input_=input_+' '+new_token
    return input_
def generate_response(input_=' hello <IsA>',l=1,num=1):
    for _ in range(l):
        # input_=add_response_old(input_)
        # input_=add_response(input_)
        input_=add_response_full(input_,num)

    return input_

In [21]:
def eval():
    with torch.no_grad():
        cnt=0
        loss=0
        for batch in tqdm.tqdm(test_loader):
            # input_ids,attention_mask,labels=batch
            input_ids=batch['input_ids']
            attention_mask=batch['attention_mask']
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask,labels=input_ids)
            loss+=outputs.loss.item()
            cnt+=1
        loss=loss/cnt
        print(generate_response(" man <HasA>",1,1))
        return loss

In [22]:
eval()

100%|██████████| 88/88 [00:14<00:00,  6.12it/s]


 man <HasA> <CreatedBy>


43.61714714223688

In [23]:
def train(num_epochs=200):
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
    best_valid_loss = float('inf')
    loss_fn=nn.CrossEntropyLoss()
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        corrects=0.0
        total=0.0
        for batch in tqdm.tqdm(train_loader):
            input_ids=batch['input_ids']
            attention_mask=batch['attention_mask']
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)

            optimizer.zero_grad()

            outputs = model(input_ids=input_ids,attention_mask=attention_mask,labels=input_ids)
            loss = outputs.loss
            train_loss += loss.item()
            loss.backward()
            optimizer.step()
        avg_train_loss = train_loss / len(train_loader)

        model.eval()
        avg_valid_loss = eval()
        if avg_valid_loss < best_valid_loss:
            best_valid_loss = avg_valid_loss
            torch.save(model.state_dict(), "connet.pt")

        print(f"Epoch {epoch+1}/{num_epochs}: "
              f"Train Loss: {avg_train_loss:.4f}, Valid Loss: {avg_valid_loss:.4f}")








In [24]:
train()

100%|██████████| 351/351 [02:30<00:00,  2.33it/s]
100%|██████████| 88/88 [00:14<00:00,  6.11it/s]
100%|██████████| 351/351 [02:32<00:00,  2.30it/s]
100%|██████████| 88/88 [00:14<00:00,  5.99it/s]
100%|██████████| 351/351 [02:32<00:00,  2.30it/s]
100%|██████████| 88/88 [00:14<00:00,  6.05it/s]
100%|██████████| 351/351 [02:32<00:00,  2.30it/s]
100%|██████████| 88/88 [00:14<00:00,  6.07it/s]
100%|██████████| 351/351 [02:32<00:00,  2.31it/s]
100%|██████████| 88/88 [00:14<00:00,  6.17it/s]
100%|██████████| 351/351 [02:32<00:00,  2.30it/s]
100%|██████████| 88/88 [00:14<00:00,  6.01it/s]
100%|██████████| 351/351 [02:32<00:00,  2.31it/s]
100%|██████████| 88/88 [00:14<00:00,  6.03it/s]
100%|██████████| 351/351 [02:32<00:00,  2.31it/s]
100%|██████████| 88/88 [00:14<00:00,  6.03it/s]
100%|██████████| 351/351 [02:31<00:00,  2.32it/s]
100%|██████████| 88/88 [00:14<00:00,  5.94it/s]
100%|██████████| 351/351 [02:31<00:00,  2.31it/s]
100%|██████████| 88/88 [00:14<00:00,  6.18it/s]
100%|██████████| 351

 man <HasA> person
Epoch 1/200: Train Loss: 5.2404, Valid Loss: 2.7086
 man <HasA> person
Epoch 2/200: Train Loss: 2.8330, Valid Loss: 2.2695
 man <HasA> person
Epoch 3/200: Train Loss: 2.5882, Valid Loss: 2.2030
 man <HasA> man
Epoch 4/200: Train Loss: 2.4610, Valid Loss: 2.1917
 man <HasA> man
Epoch 5/200: Train Loss: 2.3672, Valid Loss: 2.1855
 man <HasA> man
Epoch 6/200: Train Loss: 2.2805, Valid Loss: 2.1723
 man <HasA> man
Epoch 7/200: Train Loss: 2.2042, Valid Loss: 2.1552
 man <HasA> man
Epoch 8/200: Train Loss: 2.1290, Valid Loss: 2.1278
 man <HasA> man
Epoch 9/200: Train Loss: 2.0798, Valid Loss: 2.1109
 man <HasA> man
Epoch 10/200: Train Loss: 2.0487, Valid Loss: 2.1062
 man <HasA> man
Epoch 11/200: Train Loss: 2.0244, Valid Loss: 2.0958
 man <HasA> man
Epoch 12/200: Train Loss: 2.0076, Valid Loss: 2.0914
 man <HasA> man
Epoch 13/200: Train Loss: 1.9872, Valid Loss: 2.0858
 man <HasA> man
Epoch 14/200: Train Loss: 1.9688, Valid Loss: 2.0722
 man <HasA> man
Epoch 15/200: Trai

KeyboardInterrupt: 

In [50]:
torch.save(model.state_dict(), "connet_2.pt")

In [119]:
address="connet_t.pt"
model.load_state_dict(torch.load(address))

<All keys matched successfully>

In [120]:
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50271, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

In [80]:
generate_response(" school <CreatedBy>",1,5)

['Ġbuilding', 'Ġchildren', 'Ġclasses', 'Ġplace', 'Ġshelter']


' school <CreatedBy> children'

In [74]:
generate_response(" male <UsedFor>",1,5)

['Ġsex', 'Ġlove', 'Ġreproduction', 'Ġfun', 'Ġpleasure']


' male <UsedFor> reproduction'