In [1]:
import os
import csv
import torch
import argparse
import numpy as np
from mlp import mlp
import pandas as pd
from tqdm import tqdm
from torch.utils.data import DataLoader, TensorDataset
from transformers import InputExample, InputFeatures
from transformers import BertConfig, BertForSequenceClassification, BertTokenizer, BertModel
from transformers import glue_convert_examples_to_features as convert_examples_to_features

  from .autonotebook import tqdm as notebook_tqdm


In [41]:
# Shared inference setup: config, tokenizer and encoder all come from one local checkpoint directory.
MODEL_DIR = './model'
# Preferred compute device. NOTE: it is only computed here; the model is not moved onto it in this cell.
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
config = BertConfig.from_pretrained(MODEL_DIR)
tokenizer = BertTokenizer.from_pretrained(MODEL_DIR)
# Bare BERT encoder without a classification head (the checkpoint's classifier.* weights are not used).
model = BertModel.from_pretrained(MODEL_DIR, config=config)

Some weights of the model checkpoint at ./model were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [42]:
def create_examples(lines, set_type):
    """Build unlabeled ``InputExample`` objects from parsed CSV rows.

    Args:
        lines: Sequence of CSV rows, each a sequence of column strings. The
            first row is treated as the header and skipped; the text is read
            from column 1 of every remaining row.
        set_type: Prefix used for each example's guid, formatted as
            ``"<set_type>-<i>"`` with ``i`` counted from 0 over the data rows.

    Returns:
        A list of ``InputExample`` whose ``text_b`` and ``label`` are None.
        An empty ``lines`` yields an empty list.
    """
    # Slice past the header instead of `del lines[0]`: the caller's list is
    # left untouched and an empty input no longer raises IndexError.
    data_rows = lines[1:]
    examples = []
    for i, line in enumerate(data_rows):
        guid = "%s-%s" % (set_type, i)
        # In available.csv the text sits in column 1 and needs no "YZYHUST"
        # placeholder replacement; no label is attached at prediction time.
        text_a = line[1]
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=None, label=None))
    return examples

def Load_data(tokenizer,file_path,max_length=256):
    """Tokenize the text column of a CSV file into a ``TensorDataset``.

    Args:
        tokenizer: Tokenizer handed to ``convert_examples_to_features``.
        file_path: Path of the CSV file. Its first row is skipped as a header
            and the text is taken from column 1 (see ``create_examples``).
        max_length: Sequence length passed to the feature converter.
            Defaults to 256, the value that used to be hard-coded.

    Returns:
        A ``TensorDataset`` whose items are
        ``(input_ids, attention_mask, token_type_ids)`` long tensors, in file
        order. Despite the function name this is a dataset, not a DataLoader.
    """
    # Raise the csv field size cap so very long text cells can be parsed.
    # NOTE: this is a process-wide setting of the csv module.
    csv.field_size_limit(500 * 1024 * 1024)
    # newline='' is what the csv module requires so that quoted fields with
    # embedded line breaks are parsed correctly. UTF-8 matches pandas'
    # read_csv default, which the notebook uses to read these same files.
    with open(file_path, 'r', newline='', encoding='utf-8') as f:
        examples = create_examples(list(csv.reader(f)), 'predict')
    # The examples carry no labels; the 13-entry label list is only supplied
    # because the converter takes one in classification mode.
    label_list = list(range(13))
    features = convert_examples_to_features(
        examples,
        tokenizer,
        label_list=label_list,
        max_length=max_length,
        output_mode="classification",
    )
    # Stacking into tensors presumes every feature has the same length
    # (presumably padded/truncated to max_length by the converter - confirm).
    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features],
                                      dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features],
                                      dtype=torch.long)
    dataset = TensorDataset(all_input_ids, all_attention_mask,
                            all_token_type_ids)
    return dataset

In [43]:
# Tokenize the IP test split, then export its `label` column as a standalone CSV.
file_path = './url/ip/ip_test.csv'
# NOTE: despite the name, this holds the TensorDataset returned by Load_data, not a DataLoader.
pred_dataloader = Load_data(tokenizer, file_path=file_path)
file = pd.read_csv(file_path)
label = file.loc[:, 'label']
label.to_csv('./url/ip/test_label.csv', index=False)



In [55]:
# Scratch check: unsqueeze(0) prepends an axis, turning shape (3, 2) into (1, 3, 2).
a = torch.randn((3, 2)).unsqueeze(0)
a.shape

torch.Size([1, 3, 2])

In [58]:
# Smoke test: encode the first sample with BERT and save its pooled embedding.
feature_list=[]
model.eval()
# Send the inputs to whichever device the model actually lives on. The visible
# cells never move the model to `device`, so using `device` here would fail
# with a device mismatch whenever CUDA is available.
model_device=next(model.parameters()).device
batch=tuple(t.to(model_device) for t in pred_dataloader[0])

with torch.no_grad():
    # Each dataset item is a single unbatched sequence; add the batch axis.
    inputs = {
        'input_ids': batch[0].unsqueeze(0),
        'attention_mask': batch[1].unsqueeze(0),
        'token_type_ids':batch[2].unsqueeze(0)
    }
    # With return_dict=False the model returns a tuple; the second element is
    # taken as the pooled sentence embedding.
    _,pool_outputs = model(**inputs,return_dict=False)
    print(pool_outputs.squeeze().shape)
    # Keep stored features on CPU so the saved file loads without a GPU.
    feature_list.append(pool_outputs.cpu())
features=torch.concat(feature_list,dim=0)
torch.save(features,'./url/ip/features_test.pt')

torch.Size([768])


# 层次分类器

In [104]:
import joblib
class hierarchy_cls():
    """Two-stage classifier: an MLP picks a class, then class 2 is refined.

    Samples the MLP assigns to class 2 are re-examined by ``LR24``. Those it
    labels 2 are then relabelled by ``LR26``, and those it labels 4 are
    relabelled by ``LR48``. All other MLP predictions are returned unchanged.
    """
    def __init__(self,use_ip=False) -> None:
        """Load the MLP weights and the three secondary classifiers.

        Args:
            use_ip: When truthy, load the ``*_ip`` model files and build the
                MLP with 778 input features; otherwise load the plain files
                and use 775 input features.
        """
        suffix='_ip' if use_ip else ''
        self.mlp=mlp(in_features=778 if use_ip else 775)
        # map_location='cpu' lets a checkpoint saved from a GPU load on a
        # machine without CUDA; the MLP is used on CPU tensors in predict().
        st=torch.load(f'./classifier_model/best_mlp{suffix}.pkl',map_location='cpu')
        self.mlp.load_state_dict(st)
        # Inference only: switch off training-time behaviour (dropout,
        # batch-norm updates) if the MLP has any.
        self.mlp.eval()
        # NOTE(security): joblib.load unpickles arbitrary objects; only load
        # model files from a trusted source.
        self.LR24=joblib.load(f'./classifier_model/LR24{suffix}.pkl')
        self.LR26=joblib.load(f'./classifier_model/LR26{suffix}.pkl')
        self.LR48=joblib.load(f'./classifier_model/LR48{suffix}.pkl')
    def predict(self,x):
        """Predict a class id for every row of ``x``.

        Args:
            x: 2-D CPU float tensor with one row of features per sample.

        Returns:
            A numpy integer array with one class id per row of ``x``.
        """
        # First stage: argmax over the MLP outputs. No gradients are needed.
        with torch.no_grad():
            output=torch.argmax(self.mlp(x),dim=-1).cpu().numpy()
        idx_gp2=output==2
        features_2=x[idx_gp2]
        # No sample landed in class 2, so there is nothing to refine.
        if not features_2.shape[0]:
            return output

        # Second stage: split the class-2 samples into the 2- and 4-groups.
        second_division=self.LR24.predict(features_2)
        idx_gp26=second_division==2
        idx_gp48=second_division==4
        ft_26=features_2[idx_gp26]
        ft_48=features_2[idx_gp48]
        # Boolean indexing returns a copy; it is written back at the end.
        gp2=output[idx_gp2]
        # Third stage: each group gets its final label from its own model.
        if ft_26.shape[0]:
            gp2[idx_gp26]=self.LR26.predict(ft_26)
        if ft_48.shape[0]:
            gp2[idx_gp48]=self.LR48.predict(ft_48)
        output[idx_gp2]=gp2
        return output


### 拼接特征(text+URL+IP)，进行最后的预测

In [94]:
# dataset used to preprocess data
#TODO:use or not use ip feature
from torch.utils.data import Dataset
class url_data(Dataset):
    """Dataset yielding ``(url, feature)`` pairs for the final classifier.

    The feature vector is the BERT pooled text embedding followed by the URL
    columns (column 6 onward) and then the IP columns (columns 3-5) of the
    CSV row. The BERT forward pass runs lazily, one row per ``__getitem__``.

    Relies on the notebook-level ``tokenizer`` and ``model`` objects.
    """
    def __init__(self,file_name) -> None:
        """Read the CSV with pandas and tokenize its text column.

        Raises:
            ValueError: If the tokenized rows and the pandas rows differ in
                count, since features would then be paired with wrong URLs.
        """
        super().__init__()
        self.file=pd.read_csv(file_name)
        self.text_data=Load_data(tokenizer,file_path=file_name)
        # The same file is parsed twice (csv.reader and pandas) and the two
        # views are joined purely by position, so they must line up.
        if len(self.text_data)!=len(self.file):
            raise ValueError(
                "row count mismatch for %s: %d tokenized rows vs %d table rows"
                % (file_name, len(self.text_data), len(self.file)))
    def __getitem__(self, index):
        """Return ``(url, feature)`` for row ``index``."""
        batch=self.text_data[index]
        # Run the encoder on whichever device it currently lives on.
        model_device=next(model.parameters()).device
        with torch.no_grad():
            # Items are unbatched sequences; add the batch axis for BERT.
            inputs = {
                'input_ids': batch[0].unsqueeze(0).to(model_device),
                'attention_mask': batch[1].unsqueeze(0).to(model_device),
                'token_type_ids':batch[2].unsqueeze(0).to(model_device)
            }
            _,pool_outputs = model(**inputs,return_dict=False)
            # Back to CPU so it can be concatenated with the tabular features.
            text_feature=pool_outputs.squeeze().cpu()
        row=self.file.iloc[index]
        ip_feature=torch.tensor(row.iloc[3:6].to_numpy(dtype=float))
        url_feature=torch.tensor(row.iloc[6:].to_numpy(dtype=float))
        # Order matters for the downstream models: text, then URL, then IP.
        feature=torch.concat([text_feature,url_feature,ip_feature],dim=-1)
        url=row.iloc[0]
        return url,feature
    def __len__(self):
        """Number of rows in the CSV table."""
        return len(self.file)
        

In [95]:
# Build the inference dataset; the BERT forward pass runs per item inside __getitem__.
test = url_data('./test/ip_encode.csv')



In [123]:
# Run the hierarchical classifier over the whole dataset in batches of 32,
# collecting URLs and predicted labels in matching order.
data=DataLoader(test,batch_size=32)
cls=hierarchy_cls(use_ip=True)
url_col=[]
label=[]
# The stray `url,ft=test[0]` probe was dropped: it ran one extra BERT forward
# pass and its results were overwritten by the loop variables straight away.
for url,ft in tqdm(data):
    url_col+=list(url)
    # The numeric columns are built with dtype=float in url_data, so the batch
    # may arrive as float64; cast to float32 before the MLP.
    pred=cls.predict(ft.float())
    label+=pred.tolist()

100%|██████████| 303/303 [51:53<00:00, 10.27s/it]


In [124]:
# Pair every URL with its predicted label and write the table without an index column.
url_csv = pd.Series(url_col)
label_csv = pd.Series(label)
# `keys` names the two resulting columns directly.
prediction = pd.concat([url_csv, label_csv], axis=1, keys=['url', 'label'])
prediction.to_csv('./final_text_ip_prediction.csv', index=False)

In [125]:
# Sanity check: reload the exported predictions and show how many URLs fall in each class.
fp = pd.read_csv('./final_text_ip_prediction.csv')
fp.loc[:, 'label'].value_counts()

Unnamed: 0,url,label
0,02f.sxdljx.cn,9
1,03nlwcfs.uydszml.cn,2
2,007svu3i.gmqxisj.cn,2
3,054msgo1.zjx1314.top,6
4,010544.com,6
