In [1]:
import os
import io
import requests
import numpy as np
import pandas as pd
import re
import zipfile
import random
import time
import csv
import datetime
from itertools import compress
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from transformers import AutoTokenizer, AutoConfig, AutoModelForPreTraining, \
                         AdamW, get_linear_schedule_with_warmup, \
                         TrainingArguments, BeamScorer, Trainer

import torch
from torch.utils.data import Dataset, random_split, DataLoader, \
                             RandomSampler, SequentialSampler

In [2]:
DEBUG           = False

INPUT_DIR       = 'articles'

USE_APEX        = True
APEX_OPT_LEVEL  = 'O1'

MODEL           = 'gpt2' #{gpt2, gpt2-medium, gpt2-large, gpt2-xl}

UNFREEZE_LAST_N = 6 #The last N layers to unfreeze for training

SPECIAL_TOKENS  = { "bos_token": "<|BOS|>",
                    "eos_token": "<|EOS|>",
                    "unk_token": "<|UNK|>",                    
                    "pad_token": "<|PAD|>",
                    "sep_token": "<|SEP|>"}
                    
MAXLEN          = 384  #{768, 1024, 1280, 1600}

TRAIN_SIZE      = 0.8

if USE_APEX:
    TRAIN_BATCHSIZE = 4
    BATCH_UPDATE    = 16
else:
    TRAIN_BATCHSIZE = 2
    BATCH_UPDATE    = 32

EPOCHS          = 4
LR              = 5e-4
EPS             = 1e-8
WARMUP_STEPS    = 1e2

SEED            = 2020

In [3]:
def get_tokenier(special_tokens=None):
    tokenizer = AutoTokenizer.from_pretrained(MODEL) #GPT2Tokenizer

    if special_tokens:
        tokenizer.add_special_tokens(special_tokens)
        print("Special tokens added")
    return tokenizer

def get_model(tokenizer, special_tokens=None, load_model_path=None):

    #GPT2LMHeadModel
    if special_tokens:
        config = AutoConfig.from_pretrained(MODEL, 
                                            bos_token_id=tokenizer.bos_token_id,
                                            eos_token_id=tokenizer.eos_token_id,
                                            sep_token_id=tokenizer.sep_token_id,
                                            pad_token_id=tokenizer.pad_token_id,
                                            output_hidden_states=False)
    else: 
        config = AutoConfig.from_pretrained(MODEL,                                     
                                            pad_token_id=tokenizer.eos_token_id,
                                            output_hidden_states=False)    

    #----------------------------------------------------------------#
    model = AutoModelForPreTraining.from_pretrained(MODEL, config=config)

    if special_tokens:
        #Special tokens added, model needs to be resized accordingly
        model.resize_token_embeddings(len(tokenizer))

    if load_model_path:
        model.load_state_dict(torch.load(load_model_path))

    model.cuda()
    return model

In [4]:
tokenizer = get_tokenier(special_tokens=SPECIAL_TOKENS)
model = get_model(tokenizer, 
                  special_tokens=SPECIAL_TOKENS,
                #   load_model_path='pytorch_model.bin'
                 )

Special tokens added


In [5]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(SEED)

In [6]:
numerical_data = pd.read_csv('all_after_preprocessingStopwords.csv')

In [7]:
numerical_data['description'][0]

'beautiful bright spacious home heart lake community right highway 410 approximately 2450 space 670 basement 4 4 separate entrance garage basement full potential great rental income hardwood pot light california shutter throughout freshly painted open concept floor plan eat stainless steel appliance minute trinity common mall park heart lake conservation much stainless steel stove fridge dishwasher washer dryer water heater rental existing light hardwood throughout concrete pad leading front backyard fit upto 4 car 1 largest semi detached model'

In [8]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from transformers import BertTokenizer,BertModel,get_linear_schedule_with_warmup, RobertaTokenizer, RobertaModel
from torch.utils.data import Dataset, DataLoader
def Data_preprocessing(df):
    scaler = MinMaxScaler()
    
    # Numerical Feature
    numerical_features = ["bedroom","bedroomAboveGrade","bedroomBelowGrade","bathroom", "bathroomTotal","bathroomPartial", 
                        "totalParkingSpaces", "storeys", "maintenanceFees",  'landSize']
    
    X_num = df[numerical_features]
    X_num = scaler.fit_transform(X_num)
    X_num = pd.DataFrame(X_num)
    df["longitude"] = df["longitude"] *0.01
    df["latitude"] = df["latitude"] *0.01
    df[numerical_features] = X_num
          
    # Boolean Feature
    boolean_features=['parkingAttachedGarage',
       'parkingUnderground', 'parkingInsideEntry', 'parkingSurfaced',
       'parkingOversize', 'parkingGravel', 'parkingGarage', 'parkingShared',
       'parkingDetachedGarage', 'parkingCarport', 'parkingInterlocked',
       'parkingVisitorParking','amenityClubhouse', 'amenityCarWash', 'amenityMusicRoom',
       'amenityStorageLocker', 'amenitySauna', 'amenityPartyRoom',
       'amenityRecreationCentre', 'amenityGuestSuite', 'amenityFurnished',
       'amenityLaundryFacility', 'amenityExerciseCentre',
       'amenityLaundryInSuite', 'amenitySecurity', 'amenityWhirlpool',
       'efinishWood', 'efinishBrick', 'efinishHardboard', 'efinishWoodsiding',
       'efinishLog', 'efinishMetal', 'efinishSteel', 'efinishStone',
       'efinishWoodshingles', 'efinishStucco', 'efinishSiding',
       'efinishConcrete', 'efinishShingles', 'efinishAluminumsiding',
       'efinishCedarshingles', 'efinishVinyl', 'efinishVinylsiding',
       'featurePetNotAllowed', 'AirportNearby',
       'GolfNearby', 'MarinaNearby', 'ShoppingNearby', 'WaterNearby',
       'WorshipPlaceNearby', 'RecreationNearby', 'PlaygroundNearby',
       'PublicTransitNearby', 'ParkNearby', 'SchoolsNearby', 'HospitalNearby',
       'HighwayNearby', 'SkiAreaNearby']
    
    # Category Feature
    cate_features = ['city', 'typeBuilding', 'title', 'styleAttach', 
                   'cooling',  'basementType', 'basementFinish',
                   'heatingType1', 'heatingType2', 'heatingEnergy1', 'heatingEnergy2', 
                   'featureLotSlope', 'featureDriveway', 'featureLotPositionType',
       'featureOutdoorAreaType', 'featureOutdoorLandscape',
       'featureAdditionalFacility']
    
    X_category=df[cate_features]
    for col in cate_features:
        X_category[col] = X_category[col].astype('category')
        X_category[col] = X_category[col].cat.codes
    df[cate_features] = X_category
    
    # Label Price
    price_range = []
    
    for price in df["price"]:
        if price < 5e5:
            price_range.append('tinny size apartment with shared public infrastructure')
        elif 5e5<=price < 15e5:
            price_range.append('bachelor apartment with limited private infrastructure')
        elif 15e5<=price < 25e5:
            price_range.append('family applicable apartments with standard community service and private infrastructure')
        elif 25e5<=price < 35e5:
            price_range.append('large and well furnished apartment at prosperous district')
        elif 35e5<=price < 80e5:
            price_range.append('superior apartment with upscale customization services at commerce center area')
        else:
            price_range.append('luxury villa with large private yard')
            
#     df = df.reset_index(drop=True)
    df['price_range'] = price_range
    df = df.dropna()
    return df, boolean_features, cate_features, numerical_features

In [9]:
from sklearn.model_selection import train_test_split
preprocessed_data, boolean_features, cate_features, numerical_features = Data_preprocessing(numerical_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_category[col] = X_category[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_category[col] = X_category[col].cat.codes


In [10]:
X_train, X_test = train_test_split(preprocessed_data, test_size=0.1, random_state=13) 
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

In [11]:
class Generate_dataset(Dataset):
    def __init__(self, data_df, tokenizer):
        self.MAX_SEQ_LEN = 128
        self.data = data_df
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx): 
        if torch.is_tensor(idx):
            idx = idx.tolist()

        description = self.data.iloc[idx]['description']
        label_range =  self.data.iloc[idx]['price_range']
        description_final = SPECIAL_TOKENS['bos_token'] + label_range + SPECIAL_TOKENS['sep_token'] + description + SPECIAL_TOKENS['eos_token']
        encodings_dict = self.tokenizer(description_final,                                   
                           truncation=True, 
                           max_length=MAXLEN, 
                           padding="max_length")   
        
        input_ids = encodings_dict['input_ids']
        attention_mask = encodings_dict['attention_mask']
        
        return {'label': torch.tensor(input_ids),
                'input_ids': torch.tensor(input_ids), 
                'attention_mask': torch.tensor(attention_mask)}

In [12]:
Train_dataset = Generate_dataset(X_train, tokenizer)
Test_dataset = Generate_dataset(X_test, tokenizer)

In [13]:
training_args = TrainingArguments(
    output_dir="GPT_models/",
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=TRAIN_BATCHSIZE,
    per_device_eval_batch_size=TRAIN_BATCHSIZE,
    gradient_accumulation_steps=BATCH_UPDATE,
    evaluation_strategy="epoch",
    save_strategy = "epoch",
    fp16=True,
    fp16_opt_level=APEX_OPT_LEVEL,
    warmup_steps=WARMUP_STEPS,    
    learning_rate=LR,
    adam_epsilon=EPS,
    weight_decay=0.01,        
    save_total_limit=1,
    load_best_model_at_end=True,     
)

In [14]:
#---------------------------------------------------#
trainer = Trainer(
    model=model,
    args=training_args,    
    train_dataset=Train_dataset,
    eval_dataset=Test_dataset,
    tokenizer=tokenizer
)

#---------------------------------------------------#
trainer.train()
trainer.save_model()    

Using amp fp16 backend
***** Running training *****
  Num examples = 9218
  Num Epochs = 4
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 16
  Total optimization steps = 576
  nn.utils.clip_grad_norm_(


Epoch,Training Loss,Validation Loss
0,No log,0.912832
1,No log,0.845357
2,No log,0.820636
3,1.736200,0.814085


***** Running Evaluation *****
  Num examples = 1025
  Batch size = 4
Saving model checkpoint to GPT_models/checkpoint-144
Configuration saved in GPT_models/checkpoint-144\config.json
Model weights saved in GPT_models/checkpoint-144\pytorch_model.bin
tokenizer config file saved in GPT_models/checkpoint-144\tokenizer_config.json
Special tokens file saved in GPT_models/checkpoint-144\special_tokens_map.json
Deleting older checkpoint [GPT_models\checkpoint-432] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 1025
  Batch size = 4
Saving model checkpoint to GPT_models/checkpoint-288
Configuration saved in GPT_models/checkpoint-288\config.json
Model weights saved in GPT_models/checkpoint-288\pytorch_model.bin
tokenizer config file saved in GPT_models/checkpoint-288\tokenizer_config.json
Special tokens file saved in GPT_models/checkpoint-288\special_tokens_map.json
Deleting older checkpoint [GPT_models\checkpoint-576] due to args.save_total_limit
***** Running Ev

In [16]:
tokenizer = get_tokenier(special_tokens=SPECIAL_TOKENS)
model = get_model(tokenizer, 
                  special_tokens=SPECIAL_TOKENS,
                  load_model_path='GPT_models/pytorch_model.bin')

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at C:\Users\yli62/.cache\huggingface\transformers\fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51
Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "resid_pdrop": 0.1,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true

Special tokens added


loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at C:\Users\yli62/.cache\huggingface\transformers\fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51
Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50257,
  "embd_pdrop": 0.1,
  "eos_token_id": 50258,
  "gradient_checkpointing": false,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "pad_token_id": 50260,
  "resid_pdrop": 0.1,
  "scale_attn_weights": true,
  "sep_token_id": 50261,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-gen

In [17]:
# if torch.cuda.is_available():
#     device = torch.device('cuda:0')
#     torch.backends.cudnn.deterministic = True
#     torch.backends.cudnn.benchmark = False
# else:
device = torch.device('cpu')

In [18]:
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50262, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )


In [19]:
from tqdm import tqdm
title = 'superior apartment with upscale customization services at commerce center area'
prompt = SPECIAL_TOKENS['bos_token'] + title +SPECIAL_TOKENS['sep_token']

generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
generated = generated.to(device)

generated_sentences = []


for _ in tqdm(range(500)):
    sample_outputs = model.generate(generated, 
                                    do_sample=True,   
                                    min_length=50, 
                                    max_length=MAXLEN,
                                    top_k=30,                                 
                                    top_p=0.7,        
                                    temperature=0.9,
                                    repetition_penalty=2.0,
                                    num_return_sequences=1
                                    )

    for i, sample_output in enumerate(sample_outputs):
        text = tokenizer.decode(sample_output, skip_special_tokens=True)
        generated_sentences.append(text[len(title):])
#     a = len(title) + len(','.join(keywords))    
#     print("{}: {}\n\n".format(i+1,  text[a:]))

100%|████████████████████████████████████████████████████████████████████████████████| 500/500 [21:46<00:00,  2.61s/it]


In [20]:
with open('good_sentence.txt','w') as f:
    for line in generated_sentences:
        f.write(line+'\n')

In [23]:
from tqdm import tqdm
title = 'luxury villa with large private yard'
prompt = SPECIAL_TOKENS['bos_token'] + title +SPECIAL_TOKENS['sep_token']

generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
generated = generated.to(device)

generated_sentences = []


for _ in tqdm(range(500)):
    sample_outputs = model.generate(generated, 
                                    do_sample=True,   
                                    min_length=50, 
                                    max_length=MAXLEN,
                                    top_k=30,                                 
                                    top_p=0.7,        
                                    temperature=0.9,
                                    repetition_penalty=2.0,
                                    num_return_sequences=1
                                    )

    for i, sample_output in enumerate(sample_outputs):
        text = tokenizer.decode(sample_output, skip_special_tokens=True)
        generated_sentences.append(text[len(title):])
#     a = len(title) + len(','.join(keywords))    
#     print("{}: {}\n\n".format(i+1,  text[a:]))

100%|████████████████████████████████████████████████████████████████████████████████| 500/500 [19:10<00:00,  2.30s/it]


In [24]:
with open('exce_sentence.txt','w') as f:
    for line in generated_sentences:
        f.write(line+'\n')