In [2]:
import pandas as pd
import random
import numpy as np
import torch
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
set_seed(7)

In [3]:
numerical_data = pd.read_csv('all_after_preprocessingStopwords.csv')

In [4]:
len(numerical_data)

10251

In [5]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from transformers import BertTokenizer,BertModel,get_linear_schedule_with_warmup, RobertaTokenizer, RobertaModel
from torch.utils.data import Dataset, DataLoader
def Data_preprocessing(df):
    scaler = MinMaxScaler()
    
    # Numerical Feature
    numerical_features = ["bedroom","bedroomAboveGrade","bedroomBelowGrade","bathroom", "bathroomTotal","bathroomPartial", 
                        "totalParkingSpaces", "storeys", "maintenanceFees",  'landSize']
    
    X_num = df[numerical_features]
    X_num = scaler.fit_transform(X_num)
    X_num = pd.DataFrame(X_num)
    df["longitude"] = df["longitude"] *0.01
    df["latitude"] = df["latitude"] *0.01
    df[numerical_features] = X_num
          
    # Boolean Feature
    boolean_features=['parkingAttachedGarage',
       'parkingUnderground', 'parkingInsideEntry', 'parkingSurfaced',
       'parkingOversize', 'parkingGravel', 'parkingGarage', 'parkingShared',
       'parkingDetachedGarage', 'parkingCarport', 'parkingInterlocked',
       'parkingVisitorParking','amenityClubhouse', 'amenityCarWash', 'amenityMusicRoom',
       'amenityStorageLocker', 'amenitySauna', 'amenityPartyRoom',
       'amenityRecreationCentre', 'amenityGuestSuite', 'amenityFurnished',
       'amenityLaundryFacility', 'amenityExerciseCentre',
       'amenityLaundryInSuite', 'amenitySecurity', 'amenityWhirlpool',
       'efinishWood', 'efinishBrick', 'efinishHardboard', 'efinishWoodsiding',
       'efinishLog', 'efinishMetal', 'efinishSteel', 'efinishStone',
       'efinishWoodshingles', 'efinishStucco', 'efinishSiding',
       'efinishConcrete', 'efinishShingles', 'efinishAluminumsiding',
       'efinishCedarshingles', 'efinishVinyl', 'efinishVinylsiding',
       'featurePetNotAllowed', 'AirportNearby',
       'GolfNearby', 'MarinaNearby', 'ShoppingNearby', 'WaterNearby',
       'WorshipPlaceNearby', 'RecreationNearby', 'PlaygroundNearby',
       'PublicTransitNearby', 'ParkNearby', 'SchoolsNearby', 'HospitalNearby',
       'HighwayNearby', 'SkiAreaNearby']
    
    # Category Feature
    cate_features = ['city', 'typeBuilding', 'title', 'styleAttach', 
                   'cooling',  'basementType', 'basementFinish',
                   'heatingType1', 'heatingType2', 'heatingEnergy1', 'heatingEnergy2', 
                   'featureLotSlope', 'featureDriveway', 'featureLotPositionType',
       'featureOutdoorAreaType', 'featureOutdoorLandscape',
       'featureAdditionalFacility']
    
    X_category=df[cate_features]
    for col in cate_features:
        X_category[col] = X_category[col].astype('category')
        X_category[col] = X_category[col].cat.codes
    df[cate_features] = X_category
    
    # Label Price
    price_range = []
    
    for price in df["price"]:
        if price < 5e5:
            price_range.append(0)
        elif 5e5<=price < 15e5:
            price_range.append(1)
        elif 15e5<=price < 25e5:
            price_range.append(2)
        elif 25e5<=price < 35e5:
            price_range.append(3)
        elif 35e5<=price < 80e5:
            price_range.append(4)
        else:
            price_range.append(5)
            
#     df = df.reset_index(drop=True)
    df['price_range'] = price_range
    df = df.dropna()
    return df, boolean_features, cate_features, numerical_features

In [6]:
from sklearn.model_selection import train_test_split
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
preprocessed_data, boolean_features, cate_features, numerical_features = Data_preprocessing(numerical_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_category[col] = X_category[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_category[col] = X_category[col].cat.codes


In [7]:
cate_num = len(cate_features)
bool_num =  len(boolean_features)
num_num = len(numerical_features)
print('The number of categorical feature is : {}, boolean feature is : {}, numerical feature is : {}'.format(cate_num, bool_num, num_num))

The number of categorical feature is : 17, boolean feature is : 58, numerical feature is : 10


In [8]:
class Generate_dataset(Dataset):
    def __init__(self, data_df, tokenizer, boolean_features,cate_features, numerical_features):
        self.MAX_SEQ_LEN = 128
        self.data = data_df
        self.tokenizer = tokenizer
        self.boolean = boolean_features
        self.cate = cate_features
        self.numerical = numerical_features

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx): 
        if torch.is_tensor(idx):
            idx = idx.tolist()

        boolean_feature = torch.tensor(list(self.data.iloc[idx][self.boolean]))
        cate_feature = torch.tensor(list(self.data.iloc[idx][self.cate]))
        numerical_feature = torch.tensor(list(self.data.iloc[idx][self.numerical]))
        description = self.data.iloc[idx]['description']
        word_encode = torch.tensor(self.tokenizer.encode(text= description,max_length=self.MAX_SEQ_LEN,padding='max_length', truncation=True))
        label = torch.tensor(float(self.data.iloc[idx]['price']))
        label_range =  torch.tensor(int(self.data.iloc[idx]['price_range']))
        
        return {'description': word_encode, 'numerical_feature':numerical_feature, 'cate_feature':cate_feature, 'boolean_feature':boolean_feature, 'label':label, 'label_range':label_range}

In [9]:
all_dataset = Generate_dataset(preprocessed_data, tokenizer, boolean_features,cate_features, numerical_features)

In [10]:
all_dataloader = DataLoader(all_dataset, batch_size = 16)

In [11]:
if torch.cuda.is_available():
    device = torch.device('cuda:0')
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
else:
    device = torch.device('cpu')
BERT_org = BertModel.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
import math
class BERT_Predictor(torch.nn.Module):
    def __init__(self, pretrained_model, tokenizer, predictor_size = 1, multi_mode = None):
        super(BERT_Predictor, self).__init__()
        
        self.bert = pretrained_model
        self.PAD_INDEX = tokenizer.pad_token_id
    
        self.final_embed = 36
        
        if multi_mode is not None:
            self.cat_num = multi_mode['cate_num']
            self.bool_num = multi_mode['bool_num']
            self.num_num = multi_mode['num_num']
        
        self.cat_encoder = torch.nn.Linear(self.cat_num,self.final_embed)
        self.bool_encoder = torch.nn.Linear(self.bool_num,self.final_embed)
        self.num_encoder = torch.nn.Linear(self.num_num,self.final_embed)
        self.des_encoder =  torch.nn.Linear(768,self.final_embed)
        
        self.multihead_attn = torch.nn.MultiheadAttention(self.final_embed, 12, dropout = 0.2, batch_first = True)
        self.layer_norm = torch.nn.LayerNorm(self.final_embed)
        self.dropout = torch.nn.Dropout(0.2)
        
        self.predictor = torch.nn.ModuleList()
        for _ in range(predictor_size):
            self.predictor.append(torch.nn.Linear(self.final_embed,self.final_embed))
            self.predictor.append(torch.nn.ReLU())
            self.predictor.append(torch.nn.Dropout(0.2))
            
        self.prediction = torch.nn.Linear(self.final_embed, 6)


    def forward(self,input_ids, cat_feature = None, numerical_feature = None, bool_feature = None, extract_feature = False):
        attention_mask = (input_ids != self.PAD_INDEX).type(torch.uint8)

        outputs = self.bert(input_ids,attention_mask=attention_mask)
        final_output = outputs[0]
        
        cat_feature = cat_feature.float()
        numerical_feature = numerical_feature.float()
        bool_feature = bool_feature.float()
        
        
        cat_feature_final = self.cat_encoder(cat_feature).unsqueeze(1)
        numerical_feature_final = self.num_encoder(numerical_feature).unsqueeze(1)
        bool_feature_final = self.bool_encoder(bool_feature).unsqueeze(1)
        des_output = self.des_encoder(final_output)
        
        numerical = torch.cat((cat_feature_final, numerical_feature_final, bool_feature_final, des_output), 1)
        
        attn_output, _ = self.multihead_attn(numerical, numerical, numerical)
        attn_output  = self.dropout(attn_output)
        attn_output = attn_output
        attn_output = self.layer_norm(attn_output + numerical)
        
        if extract_feature:
            return attn_output
        
        pooled_output = torch.mean(attn_output, dim = 1)
        for layer in self.predictor:
            pooled_output = layer(pooled_output)

        prediction = self.prediction(pooled_output)
        return prediction

In [13]:
multi_mode = {'cate_num': cate_num, 'bool_num':bool_num, 'num_num':num_num}
Bert_regressor = BERT_Predictor(BERT_org, tokenizer,multi_mode = multi_mode).to(device)

In [14]:
Bert_regressor.load_state_dict(torch.load('Bert_self_attention.pt'))

<All keys matched successfully>

In [15]:
from tqdm import tqdm
def data_extraction(model, test_iter):
    data_embeddings = []
    data_labels = []
    with torch.no_grad():
        model.eval()
        for data in tqdm(test_iter):
            source = data['description'].to(device)
            target_range = data['label_range'].to(device).unsqueeze(-1)
            cat_feature = data['cate_feature'].to(device)
            numerical_feature = data[ 'numerical_feature'].to(device)
            bool_feature = data['boolean_feature'].to(device)
            embeddings = model(input_ids=source,cat_feature = cat_feature, numerical_feature = numerical_feature, bool_feature = bool_feature,extract_feature = True)
            data_embeddings.extend(embeddings.tolist())
            data_labels.extend(target_range.tolist())
    
    data_labels = list(np.array(data_labels).reshape(-1))
    return data_embeddings, data_labels

In [16]:
data_embeddings, data_labels = data_extraction(Bert_regressor, all_dataloader)

  boolean_feature = torch.tensor(list(self.data.iloc[idx][self.boolean]))
100%|████████████████████████████████████████████████████████████████████████████████| 641/641 [01:42<00:00,  6.23it/s]


In [17]:
np.shape(data_embeddings)

(10243, 131, 36)

In [24]:
data_embeddings = torch.tensor(data_embeddings)

  data_embeddings = torch.tensor(data_embeddings)


In [18]:
data_labels = torch.tensor(data_labels)

In [19]:
data_labels.shape

torch.Size([10243])

In [26]:
torch.save(data_embeddings, 'sample_embeddding.pt')

In [20]:
torch.save(data_labels, 'data_labels.pt')