This file is used to calculate the coutnerfactual review differences based on the new descriptions.

In [1]:
!pip install transformers
!pip install miceforest
!pip install sentence_transformers
!pip install accelerate


Collecting transformers
  Downloading transformers-4.33.1-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m43.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m47.5 MB/s[0m eta [36m0:00:0

In [2]:
import torch
import numpy as np
import torchvision.transforms as transforms
from torch.utils.data.sampler import SubsetRandomSampler
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pandas as pd
import numpy as np
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report
import json
import cv2
from tqdm.auto import tqdm
from accelerate import Accelerator


In [3]:
### functions ###

import random

def permutation_test(list_1, list_2, num_permutations):

  counter = 0

  real_diff = np.mean(list_1 - list_2)

  for i in range(num_permutations):

    added_list = list(list_1) +(list(list_2))

    # shufflig in place: https://stackoverflow.com/questions/47516428/cant-get-random-shuffle-to-work-python-random-shuffle
    random.shuffle(added_list)

    cut_off = 0.5*np.floor(len(added_list))
    first_part = added_list[:int(cut_off)]
    second_part = added_list[int(cut_off):]

    if np.abs(np.mean(first_part)-np.mean(second_part)) > real_diff:
      counter += 1

    #print(i)

  emp_p_val =  counter / num_permutations

  return emp_p_val


## should be runing for around 10-15 minutes on V100 GPU

def get_counterfactuals(model, dl, cont: True):

    """ This function computes the couterfactuals given the generated names
        cont = True if continuous model, if false categorical model is expected
    """

    device = "cuda"

    ids = []
    preds_ = []
    Ys = []

    count_idx = 1

    with torch.no_grad():
      for batch in dl:

        #print(count_idx/len(dl))
        count_idx += 1

        img = batch[0].to(device)
        name = torch.tensor(batch[1]).to(device)
        joint_des = torch.tensor(batch[2]).to(device)
        X = batch[3].to(device)
        if cont == True:
          Y = torch.tensor(batch[5])
        else:
          Y = [int(i+1) for i in np.argmax(batch[2], axis = 1)]

        pred = model(img, name, joint_des, X)

        ids.extend(batch[6])

        if cont == True:
          preds_.extend(pred.cpu().numpy() )
        else:
          cat_pred = [i+1 for i in np.argmax(pred.cpu().numpy()[0], axis = 1)]
          preds_.extend(cat_pred)

        if cont == True:
          Ys.extend(Y.cpu().numpy() )
        else:
          Ys.extend(Y)


    return Ys, preds_, ids





class Simple_Dataset(Dataset):

    ''' This class transforms the input data to a useable dataset, which can be passed
    to PyTorch's dataloader class in a very generic fashion. The collate function needs to
    actually define the variables here!'''

    def __init__(self, df):
        # also defining default value #

        # data as Pytorch tensors via the collate batches function
        self.data = np.array(df)

    def __getitem__(self, idx):
        return (self.data[idx])

    def __len__(self):
        return(self.data.shape[0])


In [4]:
# connecting to drive
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)


Mounted at /content/gdrive


### Loading in the prepared DF for the dicriminator models and the DF containing the generated titles

In [5]:
## reading in tabuar data


airbnb_london_filtered_images_imp_var = pd.read_csv("/content/gdrive/My Drive/Thesis/London_Data/airbnb_london_filtered_images_counterfactual_prep.csv")


In [None]:
pred_disc_2_cont_real_name = pd.read_csv("/content/gdrive/My Drive/Thesis/Discriminator_Predictions/pred_disc_2_cont_real_name_all_2.csv")


I shall now check for a bias in the coutnerfactuals (as compared to the true titles) for proxy classes 1 & 2.
Note that even if the means are not different (statistically no bias), a bias towards the mean or too high/too small variances might be present which alter the results to be calculated below.
Thus, using the coutnerfactuals based on the true titles makes more sense than simply using the true review difference counts.

In [None]:
pred_disc_2_cont_real_name_proxy = pd.merge(pred_disc_2_cont_real_name, airbnb_london_filtered_images_imp_var[["id", "proxy"]], how = "inner", on = "id")
np.mean(pred_disc_2_cont_real_name_proxy.pred[pred_disc_2_cont_real_name_proxy.proxy != 3]    -   pred_disc_2_cont_real_name_proxy.Y[pred_disc_2_cont_real_name_proxy.proxy != 3])

## pred is slightly lower than true value!


-0.030215809788416138

In [None]:
gen_titles_summ_models = pd.read_csv("/content/gdrive/My Drive/Thesis/London_Data/gen_titles_summ_models.csv")

In [None]:
gen_titles_summ_models.head()

Unnamed: 0,id,gen_titles_distilbart,gen_titles_bart,gen_titles_pegasus
0,13913,Bright Double Bedroom in Finsbury Park,Lovely Double Bedroom in Finsbury Park London,Lovely double bedroom in Finsbury Park
1,17402,Superb 3-bed 2bath in Fitzrovia W1,Superb Fitzrovia 3-bed 2-bath w/,Modern Fitzrovia Apartment with Elevator
2,25123,Lovely double room in clean house,Room to let up to 6 months or more.,Large double room in Golders Green
3,36299,"3 Bed House near the river, Kew Gardens","3 Bed House by Thames River, Kew Gardens,",3 Bed House with garden close to Thames river
4,39387,Private bedsit room in quiet street,BEDSIT ROOM IN Euston W14,Private lockable room in bright flat


In [None]:
## merging both

airbnb_london_filtered_images_imp_var_titles = pd.merge(airbnb_london_filtered_images_imp_var, gen_titles_summ_models, how = "left", on  = "id" )

In [None]:
airbnb_london_filtered_images_imp_var_titles.head()

Unnamed: 0,number_of_reviews_ltm,room_type,reviews_per_month,has_amenity_Iron,Tower Hamlets,price,minimum_nights,has_amenity_Cooking basics,Enfield,Kensington and Chelsea,...,has_amenity_Essentials,has_amenity_Dryer,id,name,joint_description,proxy,review_diff,gen_titles_distilbart,gen_titles_bart,gen_titles_pegasus
0,5.0,1,0.18,1,0,49.0,1.0,1,0,0,...,1,1,13913,Holiday London DB Room Let-on going,My bright double bedroom with a large window h...,3,15.0,Bright Double Bedroom in Finsbury Park,Lovely Double Bedroom in Finsbury Park London,Lovely double bedroom in Finsbury Park
1,7.0,0,0.36,1,0,379.0,4.0,1,0,0,...,1,1,17402,Superb 3-Bed/2 Bath & Wifi: Trendy W1,You'll have a wonderful stay in this superb mo...,3,5.0,Superb 3-bed 2bath in Fitzrovia W1,Superb Fitzrovia 3-bed 2-bath w/,Modern Fitzrovia Apartment with Elevator
2,0.0,1,0.87,1,0,29.0,10.0,0,0,0,...,1,0,25123,Clean big Room in London (Room 1),Big room with double bed clean sheets clean to...,1,0.0,Lovely double room in clean house,Room to let up to 6 months or more.,Large double room in Golders Green
3,13.0,0,0.65,1,0,195.0,3.0,1,0,0,...,1,1,36299,Kew Gardens 3BR house in cul-de-sac,3 Bed House with garden close to Thames river ...,3,7.0,"3 Bed House near the river, Kew Gardens","3 Bed House by Thames River, Kew Gardens,",3 Bed House with garden close to Thames river
4,0.0,1,0.1,1,0,42.0,5.0,1,0,0,...,1,1,39387,Stylish bedsit in Notting Hill ish flat.,Private lockable bedsit room available within ...,1,0.0,Private bedsit room in quiet street,BEDSIT ROOM IN Euston W14,Private lockable room in bright flat


### Embedding generated titles into JSON-based vector database

Does not need to be repeated, hence commented out!

In [None]:

### loading in retrained SBERT
#model_save_name = 'sbert_tuned.pth'
#path ="/content/gdrive/My Drive/Thesis/Models/{}".format(model_save_name)

#text_model = torch.load(path)
#text_model = text_model.to("cuda")


#### For BART

In [None]:
#dict_titles_bart = {}


In [None]:
#for idx in range(gen_titles_summ_models.shape[0]):

#  print(idx/gen_titles_summ_models.shape[0])

#  id = gen_titles_summ_models.id[idx]

#  enc1 = text_model.encode(gen_titles_summ_models.gen_titles_bart[idx])
#  enc = enc1.tolist()

#  dict_titles_bart[int(id)] = enc

#  if idx % 200 == 0:
#    x = json.dumps(dict_titles_bart)
#    with open("/content/gdrive/My Drive/Thesis/Discriminator_Predictions/json_bart_encoded.json", 'w') as f:
#      f.write(x)

#x = json.dumps(dict_titles_bart)
#with open("/content/gdrive/My Drive/Thesis/Discriminator_Predictions/json_bart_encoded.json", 'w') as f:
#  f.write(x)



#### For DistilBART

In [None]:
#dict_titles_distilbart = {}


In [None]:
#for idx in range(gen_titles_summ_models.shape[0]):

#  print(idx/gen_titles_summ_models.shape[0])

#  id = gen_titles_summ_models.id[idx]

#  enc1 = text_model.encode(gen_titles_summ_models.gen_titles_distilbart[idx])
#  enc = enc1.tolist()

#  dict_titles_distilbart[int(id)] = enc

#  if idx % 200 == 0:
#    x = json.dumps(dict_titles_distilbart)
#    with open("/content/gdrive/My Drive/Thesis/Discriminator_Predictions/json_distilbart_encoded.json", 'w') as f:
#      f.write(x)

#x = json.dumps(dict_titles_distilbart)
#with open("/content/gdrive/My Drive/Thesis/Discriminator_Predictions/json_distilbart_encoded.json", 'w') as f:
#  f.write(x)



#### For PEGASUS

In [None]:
#dict_titles_pegasus = {}


In [None]:
#for idx in range(gen_titles_summ_models.shape[0]):

#  print(idx/gen_titles_summ_models.shape[0])

#  id = gen_titles_summ_models.id[idx]

#  enc1 = text_model.encode(gen_titles_summ_models.gen_titles_pegasus[idx])
#  enc = enc1.tolist()

#  dict_titles_pegasus[int(id)] = enc

#  if idx % 200 == 0:
#    x = json.dumps(dict_titles_pegasus)
#    with open("/content/gdrive/My Drive/Thesis/Discriminator_Predictions/json_pegasus_encoded.json", 'w') as f:
#     f.write(x)

#x = json.dumps(dict_titles_pegasus)
#with open("/content/gdrive/My Drive/Thesis/Discriminator_Predictions/json_pegasus_encoded.json", 'w') as f:
#  f.write(x)



### Llama-Genenration 1

#### For Llama (Low-Rank Adaptation)

In [None]:
#llama_lora = pd.read_csv("/content/gdrive/My Drive/Thesis/loss_data/gen_titles_llama.csv")
#dict_titles_llama_lora = {}


In [None]:
#for idx in range(llama_lora.shape[0]):

#  print(idx/llama_lora.shape[0])

#  id = llama_lora.id[idx]

#  enc1 = text_model.encode(llama_lora.gen_title[idx])
#  enc = enc1.tolist()

#  dict_titles_llama_lora[int(id)] = enc

#  if idx % 200 == 0:
#    x = json.dumps(dict_titles_llama_lora)
#    with open("/content/gdrive/My Drive/Thesis/Discriminator_Predictions/json_llama_lora_encoded.json", 'w') as f:
#      f.write(x)

#x = json.dumps(dict_titles_llama_lora)
#with open("/content/gdrive/My Drive/Thesis/Discriminator_Predictions/json_llama_lora_encoded.json", 'w') as f:
#  f.write(x)



#### For Llama-Adapter

In [None]:
#llama_adapter = pd.read_csv("/content/gdrive/My Drive/Thesis/loss_data/gen_titles_llama_adapter.csv")
#dict_titles_llama_adapter = {}


In [None]:
#for idx in range(llama_adapter.shape[0]):

#  print(idx/llama_adapter.shape[0])

#  id = llama_adapter.id[idx]

#  enc1 = text_model.encode(llama_adapter.gen_titles[idx])
#  enc = enc1.tolist()

#  dict_titles_llama_adapter[int(id)] = enc

#  if idx % 200 == 0:
#    x = json.dumps(dict_titles_llama_adapter)
#    with open("/content/gdrive/My Drive/Thesis/Discriminator_Predictions/json_llama_adapter_encoded.json", 'w') as f:
#      f.write(x)

#x = json.dumps(dict_titles_llama_adapter)
#with open("/content/gdrive/My Drive/Thesis/Discriminator_Predictions/json_llama_adapter_encoded.json", 'w') as f:
#  f.write(x)



#### For Llama - no PEFT

In [None]:
#llama_no_peft = pd.read_csv("/content/gdrive/My Drive/Thesis/loss_data/gen_titles_llama_no_peft.csv")
#dict_titles_llama_no_peft = {}


In [None]:
#for idx in range(llama_no_peft.shape[0]):

#  print(idx/llama_no_peft.shape[0])

#  id = llama_no_peft.id[idx]

#  enc1 = text_model.encode(llama_no_peft.gen_title[idx])
#  enc = enc1.tolist()

#  dict_titles_llama_no_peft[int(id)] = enc

#  if idx % 200 == 0:
#    x = json.dumps(dict_titles_llama_no_peft)
#    with open("/content/gdrive/My Drive/Thesis/Discriminator_Predictions/json_llama_no_peft_encoded.json", 'w') as f:
#      f.write(x)

#x = json.dumps(dict_titles_llama_no_peft)
#with open("/content/gdrive/My Drive/Thesis/Discriminator_Predictions/json_llama_no_peft_encoded.json", 'w') as f:
#  f.write(x)



### Llama-Genenration 2

#### For Llama (Low-Rank Adaptation)

In [None]:
#llama_lora_2 = pd.read_csv("/content/gdrive/My Drive/Thesis/loss_data/gen_titles_llama_v2.csv")
#dict_titles_llama_lora_2 = {}


In [None]:
#for idx in range(llama_lora_2.shape[0]):

#  print(idx/llama_lora_2.shape[0])

#  id = llama_lora_2.id[idx]

#  enc1 = text_model.encode(llama_lora_2.gen_title[idx])
#  enc = enc1.tolist()

#  dict_titles_llama_lora_2[int(id)] = enc

#  if idx % 200 == 0:
#    x = json.dumps(dict_titles_llama_lora_2)
#    with open("/content/gdrive/My Drive/Thesis/Discriminator_Predictions/json_llama_lora_encoded_2.json", 'w') as f:
#      f.write(x)

#x = json.dumps(dict_titles_llama_lora_2)
#with open("/content/gdrive/My Drive/Thesis/Discriminator_Predictions/json_llama_lora_encoded_2.json", 'w') as f:
#  f.write(x)



#### For Llama-Adapter

In [None]:
#llama_adapter_2 = pd.read_csv("/content/gdrive/My Drive/Thesis/loss_data/gen_titles_llama_adapter_v2.csv")
#dict_titles_llama_adapter_2 = {}


In [None]:
#for idx in range(llama_adapter_2.shape[0]):

#  print(idx/llama_adapter_2.shape[0])

#  id = llama_adapter_2.id[idx]

#  enc1 = text_model.encode(llama_adapter_2.gen_titles[idx])
#  enc = enc1.tolist()

#  dict_titles_llama_adapter_2[int(id)] = enc

#  if idx % 200 == 0:
#    x = json.dumps(dict_titles_llama_adapter_2)
#    with open("/content/gdrive/My Drive/Thesis/Discriminator_Predictions/json_llama_adapter_encoded_2.json", 'w') as f:
#      f.write(x)

#x = json.dumps(dict_titles_llama_adapter_2)
#with open("/content/gdrive/My Drive/Thesis/Discriminator_Predictions/json_llama_adapter_encoded_2.json", 'w') as f:
#  f.write(x)



#### For Llama - no PEFT

In [None]:
#llama_no_peft_2 = pd.read_csv("/content/gdrive/My Drive/Thesis/loss_data/gen_titles_llama_no_peft_v2.csv")
#dict_titles_llama_no_peft_2 = {}


In [None]:
#for idx in range(llama_no_peft_2.shape[0]):

#  print(idx/llama_no_peft_2.shape[0])

#  id = llama_no_peft_2.id[idx]

#  enc1 = text_model.encode(llama_no_peft_2.gen_title[idx])
#  enc = enc1.tolist()

#  dict_titles_llama_no_peft_2[int(id)] = enc

#  if idx % 200 == 0:
#    x = json.dumps(dict_titles_llama_no_peft_2)
#    with open("/content/gdrive/My Drive/Thesis/Discriminator_Predictions/json_llama_no_peft_encoded_2.json", 'w') as f:
#      f.write(x)

#x = json.dumps(dict_titles_llama_no_peft_2)
#with open("/content/gdrive/My Drive/Thesis/Discriminator_Predictions/json_llama_no_peft_encoded_2.json", 'w') as f:
#  f.write(x)



## Loading in important stuff

Including the encoded names (the ones generated with the code just above) and encoded true names, descriptions and images


In [None]:
# reading in json vector data
# sometimes needs to be executed twice

import json


with open("/content/gdrive/My Drive/Thesis/London_Data/json_names.json") as json_data: ## not really needed here
    dict_names = json.load(json_data)

with open("/content/gdrive/My Drive/Thesis/London_Data/json_des.json") as json_data:
    dict_des = json.load(json_data)

with open("/content/gdrive/My Drive/Thesis/Image_data/json_images.json") as json_data:
    dict_images = json.load(json_data)


In [None]:
with open("/content/gdrive/My Drive/Thesis/Discriminator_Predictions/json_distilbart_encoded.json") as json_data:
  dict_titles_distilbart = json.load(json_data)

with open("/content/gdrive/My Drive/Thesis/Discriminator_Predictions/json_bart_encoded.json") as json_data:
  dict_titles_bart = json.load(json_data)


with open("/content/gdrive/My Drive/Thesis/Discriminator_Predictions/json_pegasus_encoded.json") as json_data:
  dict_titles_pegasus = json.load(json_data)

with open("/content/gdrive/My Drive/Thesis/Discriminator_Predictions/json_llama_lora_encoded.json") as json_data:
  dict_titles_llama_lora = json.load(json_data)

with open("/content/gdrive/My Drive/Thesis/Discriminator_Predictions/json_llama_adapter_encoded.json") as json_data:
  dict_titles_llama_adapter = json.load(json_data)

with open("/content/gdrive/My Drive/Thesis/Discriminator_Predictions/json_llama_no_peft_encoded.json") as json_data:
  dict_titles_llama_no_peft = json.load(json_data)

In [None]:
with open("/content/gdrive/My Drive/Thesis/Discriminator_Predictions/json_llama_lora_encoded_2.json") as json_data:
  dict_titles_llama_lora_2 = json.load(json_data)

with open("/content/gdrive/My Drive/Thesis/Discriminator_Predictions/json_llama_adapter_encoded_2.json") as json_data:
  dict_titles_llama_adapter_2 = json.load(json_data)

with open("/content/gdrive/My Drive/Thesis/Discriminator_Predictions/json_llama_no_peft_encoded_2.json") as json_data:
  dict_titles_llama_no_peft_2 = json.load(json_data)

In [None]:

import torch
from torch import nn
from torch import optim

hidden_dim_1 = 60
hidden_dim_2 = 60
hidden_dim_3 = 50
hidden_dim_4 = 20

output_dim = 1


## name, des / text model


class NeuralNetwork2(nn.Module): ## without extractor
    def __init__(self, tab_input_dim, final_output_dim_visual , final_output_dim_text_name, final_output_dim_text_des, hidden_dim_1, hidden_dim_2, hidden_dim_3, hidden_dim_4, output_dim):
        super(NeuralNetwork2, self).__init__()


        # layers build on top of visual model before concatination
        self.visual_add_layer_1 = nn.Linear(768, 100)
        self.visual_add_layer_2 = nn.Linear(100, 100)
        self.visual_add_layer_3 = nn.Linear(100, 50)
        self.visual_add_layer_4 = nn.Linear(50, final_output_dim_visual)

        # Layers added after Name/Title
        self.name_added_layer_1 = nn.Linear(768, 100)
        self.name_added_layer_2 = nn.Linear(100, 100)
        ### dropout ##
        self.name_added_layer_3 = nn.Linear(100, 50)
        self.name_added_layer_4 = nn.Linear(50, final_output_dim_text_name)

        # Layers added after Description
        self.des_added_layer_1 = nn.Linear(768, 100)
        self.des_added_layer_2 = nn.Linear(100, 100)
        ### dropout ##
        self.des_added_layer_3 = nn.Linear(100, 50)
        self.des_added_layer_4 = nn.Linear(50, final_output_dim_text_des)


        ### Layers added after Joint Description ###

        # dimension after concat operation
        self.concat_dim = tab_input_dim + final_output_dim_visual + final_output_dim_text_name + final_output_dim_text_des

        # layers processing input after concatination
        self.layer_1 = nn.Linear(self.concat_dim, hidden_dim_1)
        self.layer_2 = nn.Linear(hidden_dim_1, hidden_dim_2)
        self.layer_3 = nn.Linear(hidden_dim_2, hidden_dim_3)
        self.layer_4 = nn.Linear(hidden_dim_3, hidden_dim_4)
        self.layer_5 = nn.Linear(hidden_dim_4, output_dim)

        # Regu features
        self.dropout = nn.Dropout(0.03)
        self.batch_norm_1 = nn.BatchNorm1d(hidden_dim_2) # 1-dim batch norm with covariance-shift params activated
        self.batch_norm_2 = nn.BatchNorm1d(hidden_dim_3)



    def forward(self, img_encoded, name_encoded, des_encoded, X):  ## name, des

        ## firstly processing outputs of visual and textual model ##

        # visual model
        output_img = torch.nn.functional.tanh(self.visual_add_layer_1(img_encoded))
        output_img = self.dropout(output_img)
        output_img = torch.nn.functional.tanh(self.visual_add_layer_2(output_img))
        output_img = torch.nn.functional.tanh(self.visual_add_layer_3(output_img))
        final_output_img = torch.nn.functional.tanh(self.visual_add_layer_4(output_img))

        # textual - description
        encoded_des = torch.tensor(des_encoded)
        encoded_des = torch.nn.functional.tanh(self.des_added_layer_1(encoded_des))
        encoded_des = self.dropout(encoded_des)
        encoded_des = torch.nn.functional.tanh(self.des_added_layer_2(encoded_des))
        encoded_des = self.dropout(encoded_des)
        encoded_des = torch.nn.functional.tanh(self.des_added_layer_3(encoded_des))
        final_encoded_des = torch.nn.functional.tanh(self.des_added_layer_4(encoded_des))

        # textual - Name
        encoded_name = torch.tensor(name_encoded)
        encoded_name = torch.nn.functional.tanh(self.name_added_layer_1(encoded_name))
        encoded_name = self.dropout(encoded_name)
        encoded_name = torch.nn.functional.tanh(self.name_added_layer_2(encoded_name))
        encoded_name = torch.nn.functional.tanh(self.name_added_layer_3(encoded_name))
        final_encoded_name = torch.nn.functional.relu(self.name_added_layer_4(encoded_name))

        # concatination #
        x = torch.cat((final_output_img, final_encoded_name, final_encoded_des, X), 1)

        ## processing of joint representation ##    --- CONV/FC -> BatchNorm -> ReLu(or other activation) -> Dropout
        x = torch.nn.functional.tanh(self.layer_1(x))
        x = self.layer_2(x)
        x = self.batch_norm_1(x)
        x = torch.nn.functional.tanh(self.dropout(x))
        x = self.layer_3(x)
        x = self.batch_norm_2(x)
        x = torch.nn.functional.tanh(self.dropout(x))
        x = torch.nn.functional.tanh(self.layer_4(x))
        x = self.layer_5(x)

        return x



In [None]:
model_save_name = "discriminator_sparse_2_cont_again.pth"

path = "/content/gdrive/My Drive/Thesis/Models/{}".format(model_save_name)
model_cont = torch.load(path) ## map_location=torch.device('cpu'))

## Setting up a dataloader and creating predictions for DistilBART


In [None]:
conterfac_dataset = Simple_Dataset(airbnb_london_filtered_images_imp_var_titles)

In [None]:
for i,j in enumerate(airbnb_london_filtered_images_imp_var_titles.columns):
  print(f"At position {str(i)} : variable {str(j)}")

At position 0 : variable number_of_reviews_ltm
At position 1 : variable room_type
At position 2 : variable reviews_per_month
At position 3 : variable has_amenity_Iron
At position 4 : variable Tower Hamlets
At position 5 : variable price
At position 6 : variable minimum_nights
At position 7 : variable has_amenity_Cooking basics
At position 8 : variable Enfield
At position 9 : variable Kensington and Chelsea
At position 10 : variable Islington
At position 11 : variable Wandsworth
At position 12 : variable Southwark
At position 13 : variable has_amenity_Elevator
At position 14 : variable number_of_reviews
At position 15 : variable has_amenity_Kitchen
At position 16 : variable has_amenity_Hair dryer
At position 17 : variable Lambeth
At position 18 : variable bedrooms
At position 19 : variable has_amenity_Refrigerator
At position 20 : variable Newham
At position 21 : variable has_amenity_Heating
At position 22 : variable has_amenity_Hot water
At position 23 : variable len_description
At pos

In [None]:
def collate_batch_counterfactual_distilbart(batch):

  """
  Idx_model refers to the location of the genrated names for a specific model
  """

  list_images = []   # for the images passed through extractor function
  tabular_list = []
  list_proxies = []
  list_review_diff = []
  list_distilbart_name = []
  list_joint_description = []
  list_ids = []

  for data in batch:

    # indexing the image pixels form the dict
    list_images.append(dict_images[str(data[62])])

    list_ids.append(data[62]) # for saving the predictions

    tabular_list.append(data[:62]) ## locations of tabular data
    if int(data[65]) == 1:  ## location of categorical proxy variable
      list_proxies.append([1,0,0])
    elif int(data[65]) == 2:
      list_proxies.append([0,1,0])
    elif int(data[65]) == 3:
      list_proxies.append([0,0,1])

    list_review_diff.append(data[66])  ## location of cont. review diff variable

    list_joint_description.append(dict_des[str(data[62])])

    list_distilbart_name.append(dict_titles_distilbart[str(data[62])])

  list_images  = torch.tensor(list_images, dtype=torch.float32)
  list_joint_description = torch.tensor(list_joint_description, dtype=torch.float32)
  list_distilbart_name = torch.tensor(list_distilbart_name, dtype=torch.float32)
  tabular_list = torch.tensor(tabular_list, dtype=torch.float32)
  list_proxies = torch.tensor(list_proxies, dtype=torch.float32)

  return list_images, list_distilbart_name, list_joint_description, tabular_list, list_proxies, list_review_diff, list_ids


# images, names, des, tabular data, proxies, review_diff, ids


### Calculating counterfactuals for DistilBART generated titles

In [None]:
dl_counterfactuals_distilbart = DataLoader(conterfac_dataset, collate_fn=collate_batch_counterfactual_distilbart, batch_size=32, shuffle=False)


In [None]:
Ys, preds_, ids = get_counterfactuals(model_cont, dl_counterfactuals_distilbart, True)
preds = [a[0] for a in preds_]

  name = torch.tensor(batch[1]).to(device)
  joint_des = torch.tensor(batch[2]).to(device)
  encoded_des = torch.tensor(des_encoded)
  encoded_name = torch.tensor(name_encoded)


In [None]:
cf_distilbart = pd.DataFrame({"id": ids, "pred": preds, "Y": Ys})
#cf_distilbart.to_csv("/content/gdrive/My Drive/Thesis/Discriminator_Predictions/cf_distilbart.csv", index = False)

In [None]:
#cf_distilbart = pd.read_csv("/content/gdrive/My Drive/Thesis/Discriminator_Predictions/cf_distilbart.csv")


In [None]:
df_all_preds_distilbart = pd.merge(airbnb_london_filtered_images_imp_var[["id", "proxy"]], cf_distilbart, how = "left", on = "id")
df_all_preds_distilbart["pred_real_name"] = pred_disc_2_cont_real_name.pred


#### Checking differences of preidcted review diff based on true titles and predicted review diff based on generated titles

In [None]:

np.mean(df_all_preds_distilbart[df_all_preds_distilbart.proxy != 3].pred - df_all_preds_distilbart[df_all_preds_distilbart.proxy != 3].pred_real_name)


-0.019353811836787346


In [None]:
permutation_test(df_all_preds_distilbart[df_all_preds_distilbart.proxy != 3].pred, df_all_preds_distilbart.pred_real_name[df_all_preds_distilbart.proxy != 3], 1000)


0.708


## Setting up a dataloader and creating predictions for BART


In [None]:
conterfac_dataset  = Simple_Dataset(airbnb_london_filtered_images_imp_var_titles)

In [None]:
for i,j in enumerate(airbnb_london_filtered_images_imp_var_titles.columns):
  print(f"At position {str(i)} : variable {str(j)}")

At position 0 : variable number_of_reviews_ltm
At position 1 : variable room_type
At position 2 : variable reviews_per_month
At position 3 : variable has_amenity_Iron
At position 4 : variable Tower Hamlets
At position 5 : variable price
At position 6 : variable minimum_nights
At position 7 : variable has_amenity_Cooking basics
At position 8 : variable Enfield
At position 9 : variable Kensington and Chelsea
At position 10 : variable Islington
At position 11 : variable Wandsworth
At position 12 : variable Southwark
At position 13 : variable has_amenity_Elevator
At position 14 : variable number_of_reviews
At position 15 : variable has_amenity_Kitchen
At position 16 : variable has_amenity_Hair dryer
At position 17 : variable Lambeth
At position 18 : variable bedrooms
At position 19 : variable has_amenity_Refrigerator
At position 20 : variable Newham
At position 21 : variable has_amenity_Heating
At position 22 : variable has_amenity_Hot water
At position 23 : variable len_description
At pos

In [None]:
def collate_batch_counterfactual_bart(batch):

  """
  Idx_model refers to the location of the genrated names for a specific model
  """

  list_images = []   # for the images passed through extractor function
  tabular_list = []
  list_proxies = []
  list_review_diff = []
  list_distilbart_name = []
  list_joint_description = []
  list_ids = []

  for data in batch:

    # indexing the image pixels form the dict
    list_images.append(dict_images[str(data[62])])

    list_ids.append(data[62]) # for saving the predictions

    tabular_list.append(data[:62]) ## locations of tabular data
    if int(data[65]) == 1:  ## location of categorical proxy variable
      list_proxies.append([1,0,0])
    elif int(data[65]) == 2:
      list_proxies.append([0,1,0])
    elif int(data[65]) == 3:
      list_proxies.append([0,0,1])

    list_review_diff.append(data[66])  ## location of cont. review diff variable

    list_joint_description.append(dict_des[str(data[62])])

    list_distilbart_name.append(dict_titles_bart[str(data[62])])

  list_images  = torch.tensor(list_images, dtype=torch.float32)
  list_joint_description = torch.tensor(list_joint_description, dtype=torch.float32)
  list_distilbart_name = torch.tensor(list_distilbart_name, dtype=torch.float32)
  tabular_list = torch.tensor(tabular_list, dtype=torch.float32)
  list_proxies = torch.tensor(list_proxies, dtype=torch.float32)

  return list_images, list_distilbart_name, list_joint_description, tabular_list, list_proxies, list_review_diff, list_ids


# images, names, des, tabular data, proxies, review_diff, ids


### Calculating counterfactuals for BART generated titles




In [None]:
dl_counterfactuals_bart = DataLoader(conterfac_dataset, collate_fn=collate_batch_counterfactual_bart, batch_size=32, shuffle=False)


In [None]:
Ys, preds_, ids = get_counterfactuals(model_cont, dl_counterfactuals_bart, True)
preds = [a[0] for a in preds_]

  tabular_list = torch.tensor(tabular_list, dtype=torch.float32)
  name = torch.tensor(batch[1]).to(device)
  joint_des = torch.tensor(batch[2]).to(device)
  encoded_des = torch.tensor(des_encoded)
  encoded_name = torch.tensor(name_encoded)


In [None]:
cf_bart = pd.DataFrame({"id": ids, "pred": preds, "Y": Ys})
#cf_bart.to_csv("/content/gdrive/My Drive/Thesis/Discriminator_Predictions/cf_bart.csv", index = False)

In [None]:
df_all_preds_bart = pd.merge(airbnb_london_filtered_images_imp_var[["id", "proxy"]], cf_bart, how = "left", on = "id")
df_all_preds_bart["pred_real_name"] = pred_disc_2_cont_real_name.pred


#### Checking differences of preidcted review diff based on true titles and predicted review diff based on generated titles

In [None]:
###

np.mean(df_all_preds_bart[df_all_preds_bart.proxy != 3].pred - df_all_preds_bart[df_all_preds_bart.proxy != 3].pred_real_name)


-0.019314348749710336


In [None]:
permutation_test(df_all_preds_bart[df_all_preds_bart.proxy != 3].pred, df_all_preds_bart.pred_real_name[df_all_preds_bart.proxy != 3], 1000)


0.824


## Setting up a dataloader and creating predictions for Pegasus


In [None]:
conterfac_dataset  = Simple_Dataset(airbnb_london_filtered_images_imp_var_titles)

In [None]:
#for i,j in enumerate(airbnb_london_filtered_images_imp_var_titles.columns):
  #print(f"At position {str(i)} : variable {str(j)}") # so above!

In [None]:
def collate_batch_counterfactual_pegasus(batch):

  """
  Idx_model refers to the location of the genrated names for a specific model
  """

  list_images = []   # for the images passed through extractor function
  tabular_list = []
  list_proxies = []
  list_review_diff = []
  list_distilbart_name = []
  list_joint_description = []
  list_ids = []

  for data in batch:

    # indexing the image pixels form the dict
    list_images.append(dict_images[str(data[62])])

    list_ids.append(data[62]) # for saving the predictions

    tabular_list.append(data[:62]) ## locations of tabular data
    if int(data[65]) == 1:  ## location of categorical proxy variable
      list_proxies.append([1,0,0])
    elif int(data[65]) == 2:
      list_proxies.append([0,1,0])
    elif int(data[65]) == 3:
      list_proxies.append([0,0,1])

    list_review_diff.append(data[66])  ## location of cont. review diff variable

    list_joint_description.append(dict_des[str(data[62])])

    list_distilbart_name.append(dict_titles_pegasus[str(data[62])])    # only change here!

  list_images  = torch.tensor(list_images, dtype=torch.float32)
  list_joint_description = torch.tensor(list_joint_description, dtype=torch.float32)
  list_distilbart_name = torch.tensor(list_distilbart_name, dtype=torch.float32)
  tabular_list = torch.tensor(tabular_list, dtype=torch.float32)
  list_proxies = torch.tensor(list_proxies, dtype=torch.float32)

  return list_images, list_distilbart_name, list_joint_description, tabular_list, list_proxies, list_review_diff, list_ids


# images, names, des, tabular data, proxies, review_diff, ids


### Calculating counterfactuals for PEGASUS generated titles

In [None]:
dl_counterfactuals_pegasus = DataLoader(conterfac_dataset, collate_fn=collate_batch_counterfactual_pegasus, batch_size=32, shuffle=False)


In [None]:
Ys, preds_, ids = get_counterfactuals(model_cont, dl_counterfactuals_pegasus, True)
preds = [a[0] for a in preds_]

  name = torch.tensor(batch[1]).to(device)
  joint_des = torch.tensor(batch[2]).to(device)
  encoded_des = torch.tensor(des_encoded)
  encoded_name = torch.tensor(name_encoded)


In [None]:
cf_pegasus = pd.DataFrame({"id": ids, "pred": preds, "Y": Ys})
#cf_pegasus.to_csv("/content/gdrive/My Drive/Thesis/Discriminator_Predictions/cf_distilbart.csv", index = False)

In [None]:
df_all_preds_peg = pd.merge(airbnb_london_filtered_images_imp_var[["id", "proxy"]], cf_pegasus, how = "left", on = "id")
df_all_preds_peg["pred_real_name"] = pred_disc_2_cont_real_name.pred


#### Checking differences of preidcted review diff based on true titles and predicted review diff based on generated titles

In [None]:

np.mean(df_all_preds_peg[df_all_preds_peg.proxy != 3].pred - df_all_preds_peg[df_all_preds_peg.proxy != 3].pred_real_name)


-0.019454615010690904


In [None]:
permutation_test(df_all_preds_peg[df_all_preds_peg.proxy != 3].pred, df_all_preds_peg.pred_real_name[df_all_preds_peg.proxy != 3], 1000)


0.331

## LlaMa Generation 1

### LLama LORA

In [None]:
gen_titles_llama_lora = pd.read_csv("/content/gdrive/My Drive/Thesis/loss_data/gen_titles_llama.csv")

In [None]:
# as I have fewer generated titles with LLaMa LLora
airbnb_london_filtered_images_imp_var_titles_with_llama_lora = pd.merge(airbnb_london_filtered_images_imp_var_titles,  gen_titles_llama_lora, how = "inner", on = "id")

conterfac_dataset  = Simple_Dataset(airbnb_london_filtered_images_imp_var_titles_with_llama_lora)


In [None]:
def collate_batch_counterfactual_llama_lora(batch):

  """
  Idx_model refers to the location of the genrated names for a specific model
  """

  list_images = []   # for the images passed through extractor function
  tabular_list = []
  list_proxies = []
  list_review_diff = []
  list_distilbart_name = []
  list_joint_description = []
  list_ids = []

  for data in batch:

    # indexing the image pixels form the dict
    list_images.append(dict_images[str(data[62])])

    list_ids.append(data[62]) # for saving the predictions

    tabular_list.append(data[:62]) ## locations of tabular data
    if int(data[65]) == 1:  ## location of categorical proxy variable
      list_proxies.append([1,0,0])
    elif int(data[65]) == 2:
      list_proxies.append([0,1,0])
    elif int(data[65]) == 3:
      list_proxies.append([0,0,1])

    list_review_diff.append(data[66])  ## location of cont. review diff variable

    list_joint_description.append(dict_des[str(data[62])])

    list_distilbart_name.append(dict_titles_llama_lora[str(data[62])])    # only change here!

  list_images  = torch.tensor(list_images, dtype=torch.float32)
  list_joint_description = torch.tensor(list_joint_description, dtype=torch.float32)
  list_distilbart_name = torch.tensor(list_distilbart_name, dtype=torch.float32)
  tabular_list = torch.tensor(tabular_list, dtype=torch.float32)
  list_proxies = torch.tensor(list_proxies, dtype=torch.float32)

  return list_images, list_distilbart_name, list_joint_description, tabular_list, list_proxies, list_review_diff, list_ids


# images, names, des, tabular data, proxies, review_diff, ids


### Calculating counterfactuals for LLAMA (LoRa) generated titles

In [None]:
dl_counterfactuals_llama_lora = DataLoader(conterfac_dataset, collate_fn=collate_batch_counterfactual_llama_lora, batch_size=32, shuffle=False)


In [None]:
Ys, preds_, ids = get_counterfactuals(model_cont, dl_counterfactuals_llama_lora, True)
preds = [a[0] for a in preds_]

  tabular_list = torch.tensor(tabular_list, dtype=torch.float32)
  name = torch.tensor(batch[1]).to(device)
  joint_des = torch.tensor(batch[2]).to(device)
  encoded_des = torch.tensor(des_encoded)
  encoded_name = torch.tensor(name_encoded)


In [None]:
cf_llama_lora = pd.DataFrame({"id": ids, "pred": preds, "Y": Ys})
#cf_llama_lora.to_csv("/content/gdrive/My Drive/Thesis/Discriminator_Predictions/cf_distilbart.csv", index = False)

In [None]:
#cf_distilbart = pd.read_csv("/content/gdrive/My Drive/Thesis/Discriminator_Predictions/cf_distilbart.csv")


In [None]:
df_all_preds_llama_lora = pd.merge(airbnb_london_filtered_images_imp_var_titles_with_llama_lora[["id", "proxy"]], cf_llama_lora, how = "left", on = "id")


In [None]:
df_all_preds_llama_lora = pd.merge(df_all_preds_llama_lora, pred_disc_2_cont_real_name[["id", "pred"]], on = "id", how = "inner")

#### Checking differences of preidcted review diff based on true titles and predicted review diff based on generated titles

In [None]:
###  ## x is is generated  tittle, y is true title


np.mean(df_all_preds_llama_lora[df_all_preds_llama_lora.proxy != 3].pred_x - df_all_preds_llama_lora[df_all_preds_llama_lora.proxy != 3].pred_y)


0.033025576656099606

Also checking result of permutation test based on proxy value

In [None]:
permutation_test(df_all_preds_llama_lora[df_all_preds_llama_lora.proxy != 3].pred_x, df_all_preds_llama_lora[df_all_preds_llama_lora.proxy != 3].pred_y, 1000)


0.103

### LLama-Adapter

In [None]:
gen_titles_llama_adapter = pd.read_csv("/content/gdrive/My Drive/Thesis/loss_data/gen_titles_llama_adapter.csv")


In [None]:
gen_titles_llama_adapter

Unnamed: 0,id,gen_titles
0,19443194,Explore this stunning two bed flat in Fulham
1,14071318,"Bright, spacious, central London flat"
2,23847732,Central 2 Bedroom Flat with a lovely terrace!
3,38260808,Cozy room in a welcoming home
4,9531664,Double close to Leyton ST/Leyton STC
...,...,...
995,6704485,Private room with ensuite in Clapham High St
996,53035936,Modern Self Contained Studio Nestled in Privat...
997,3305771,East London with private space
998,34339339,Lovely Size Double Room in Stratford


In [None]:
# as I have fewer generated titles with LLaMa LLora
airbnb_london_filtered_images_imp_var_titles_with_llama_adapter = pd.merge(airbnb_london_filtered_images_imp_var_titles,  gen_titles_llama_adapter, how = "inner", on = "id")
conterfac_dataset  = Simple_Dataset(airbnb_london_filtered_images_imp_var_titles_with_llama_adapter)


In [None]:
def collate_batch_counterfactual_llama_adapter(batch):


  """
  Idx_model refers to the location of the genrated names for a specific model
  """

  list_images = []   # for the images passed through extractor function
  tabular_list = []
  list_proxies = []
  list_review_diff = []
  list_llama_adapter_name = []
  list_joint_description = []
  list_ids = []

  for data in batch:

    # indexing the image pixels form the dict
    list_images.append(dict_images[str(data[62])])

    list_ids.append(data[62]) # for saving the predictions

    tabular_list.append(data[:62]) ## locations of tabular data
    if int(data[65]) == 1:  ## location of categorical proxy variable
      list_proxies.append([1,0,0])
    elif int(data[65]) == 2:
      list_proxies.append([0,1,0])
    elif int(data[65]) == 3:
      list_proxies.append([0,0,1])

    list_review_diff.append(data[66])  ## location of cont. review diff variable

    list_joint_description.append(dict_des[str(data[62])])

    list_llama_adapter_name.append(dict_titles_llama_adapter[str(data[62])])    # only change here!

  list_images  = torch.tensor(list_images, dtype=torch.float32)
  list_joint_description = torch.tensor(list_joint_description, dtype=torch.float32)
  list_llama_adapter_name = torch.tensor(list_llama_adapter_name, dtype=torch.float32)
  tabular_list = torch.tensor(tabular_list, dtype=torch.float32)
  list_proxies = torch.tensor(list_proxies, dtype=torch.float32)

  return list_images, list_llama_adapter_name, list_joint_description, tabular_list, list_proxies, list_review_diff, list_ids


# images, names, des, tabular data, proxies, review_diff, ids


### Calculating counterfactuals for LLAMA-Adapter generated titles

In [None]:
dl_counterfactuals_llama_adapter = DataLoader(conterfac_dataset, collate_fn=collate_batch_counterfactual_llama_adapter, batch_size=32, shuffle=False)


In [None]:
Ys, preds_, ids = get_counterfactuals(model_cont, dl_counterfactuals_llama_adapter, True)
preds = [a[0] for a in preds_]

  tabular_list = torch.tensor(tabular_list, dtype=torch.float32)
  name = torch.tensor(batch[1]).to(device)
  joint_des = torch.tensor(batch[2]).to(device)
  encoded_des = torch.tensor(des_encoded)
  encoded_name = torch.tensor(name_encoded)


In [None]:
cf_llama_adapter = pd.DataFrame({"id": ids, "pred": preds, "Y": Ys})
#cf_llama_adapter.to_csv("/content/gdrive/My Drive/Thesis/Discriminator_Predictions/cf_distilbart.csv", index = False)

In [None]:
df_all_preds_llama_adapter = pd.merge(airbnb_london_filtered_images_imp_var_titles_with_llama_adapter[["id", "proxy"]], cf_llama_adapter, how = "left", on = "id")


In [None]:
df_all_preds_llama_adapter = pd.merge(df_all_preds_llama_adapter, pred_disc_2_cont_real_name[["id", "pred"]], on = "id", how = "inner")

#### Checking differences of preidcted review diff based on true titles and predicted review diff based on generated titles

In [None]:
###  ## x is is generated  tittle, y is true title

np.mean(df_all_preds_llama_adapter[df_all_preds_llama_adapter.proxy != 3].pred_x - df_all_preds_llama_adapter[df_all_preds_llama_adapter.proxy != 3].pred_y)


0.031121540145025196

In [None]:
permutation_test(df_all_preds_llama_adapter[df_all_preds_llama_adapter.proxy != 3].pred_x, df_all_preds_llama_adapter[df_all_preds_llama_adapter.proxy != 3].pred_y, 1000)


0.124

### LLama (no PEFT)

In [None]:
gen_titles_llama_no_peft = pd.read_csv("/content/gdrive/My Drive/Thesis/loss_data/gen_titles_llama_no_peft.csv")



In [None]:
# as I have fewer generated titles with LLaMa LLora
airbnb_london_filtered_images_imp_var_titles_with_llama_no_peft = pd.merge(airbnb_london_filtered_images_imp_var_titles,  gen_titles_llama_no_peft, how = "inner", on = "id")
conterfac_dataset  = Simple_Dataset(airbnb_london_filtered_images_imp_var_titles_with_llama_no_peft)


In [None]:
def collate_batch_counterfactual_llama_no_peft(batch):


  """
  Idx_model refers to the location of the genrated names for a specific model
  """

  list_images = []   # for the images passed through extractor function
  tabular_list = []
  list_proxies = []
  list_review_diff = []
  list_llama_adapter_name = []
  list_joint_description = []
  list_ids = []

  for data in batch:

    # indexing the image pixels form the dict
    list_images.append(dict_images[str(data[62])])

    list_ids.append(data[62]) # for saving the predictions

    tabular_list.append(data[:62]) ## locations of tabular data
    if int(data[65]) == 1:  ## location of categorical proxy variable
      list_proxies.append([1,0,0])
    elif int(data[65]) == 2:
      list_proxies.append([0,1,0])
    elif int(data[65]) == 3:
      list_proxies.append([0,0,1])

    list_review_diff.append(data[66])  ## location of cont. review diff variable

    list_joint_description.append(dict_des[str(data[62])])

    list_llama_adapter_name.append(dict_titles_llama_no_peft[str(data[62])])    # only change here!

  list_images  = torch.tensor(list_images, dtype=torch.float32)
  list_joint_description = torch.tensor(list_joint_description, dtype=torch.float32)
  list_llama_adapter_name = torch.tensor(list_llama_adapter_name, dtype=torch.float32)
  tabular_list = torch.tensor(tabular_list, dtype=torch.float32)
  list_proxies = torch.tensor(list_proxies, dtype=torch.float32)

  return list_images, list_llama_adapter_name, list_joint_description, tabular_list, list_proxies, list_review_diff, list_ids


# images, names, des, tabular data, proxies, review_diff, ids


### Calculating counterfactuals for LLAMA generated titles (no peft)

In [None]:
dl_counterfactuals_llama_no_peft = DataLoader(conterfac_dataset, collate_fn=collate_batch_counterfactual_llama_no_peft, batch_size=32, shuffle=False)


In [None]:
Ys, preds_, ids = get_counterfactuals(model_cont, dl_counterfactuals_llama_no_peft, True)
preds = [a[0] for a in preds_]

  name = torch.tensor(batch[1]).to(device)
  joint_des = torch.tensor(batch[2]).to(device)
  encoded_des = torch.tensor(des_encoded)
  encoded_name = torch.tensor(name_encoded)


In [None]:
cf_llama_no_peft = pd.DataFrame({"id": ids, "pred": preds, "Y": Ys})


In [None]:
df_all_preds_llama_no_peft = pd.merge(airbnb_london_filtered_images_imp_var_titles_with_llama_no_peft[["id", "proxy"]], cf_llama_no_peft, how = "left", on = "id")


In [None]:
df_all_preds_llama_no_peft = pd.merge(df_all_preds_llama_no_peft, pred_disc_2_cont_real_name[["id", "pred"]], on = "id", how = "inner")

#### Checking differences of preidcted review diff based on true titles and predicted review diff based on generated titles

In [None]:
###  ## x is is generated  tittle, y is true title

np.mean(df_all_preds_llama_no_peft[df_all_preds_llama_no_peft.proxy != 3].pred_x - df_all_preds_llama_no_peft[df_all_preds_llama_no_peft.proxy != 3].pred_y)


-0.0011023467394448393

In [None]:
permutation_test(df_all_preds_llama_no_peft[df_all_preds_llama_no_peft.proxy != 3].pred_x, df_all_preds_llama_no_peft[df_all_preds_llama_no_peft.proxy != 3].pred_y, 1000)


0.973


## LlaMa Generation 2

### LLama LORA

In [None]:
gen_titles_llama_lora_v2 = pd.read_csv("/content/gdrive/My Drive/Thesis/loss_data/gen_titles_llama_v2.csv")

In [None]:
# as I have fewer generated titles with LLaMa LLora
airbnb_london_filtered_images_imp_var_titles_with_llama_lora_v2 = pd.merge(airbnb_london_filtered_images_imp_var_titles,  gen_titles_llama_lora_v2, how = "inner", on = "id")

conterfac_dataset  = Simple_Dataset(airbnb_london_filtered_images_imp_var_titles_with_llama_lora_v2)


In [None]:
def collate_batch_counterfactual_llama_lora_v2(batch):

  """
  Idx_model refers to the location of the genrated names for a specific model
  """

  list_images = []   # for the images passed through extractor function
  tabular_list = []
  list_proxies = []
  list_review_diff = []
  list_distilbart_name = []
  list_joint_description = []
  list_ids = []

  for data in batch:

    # indexing the image pixels form the dict
    list_images.append(dict_images[str(data[62])])

    list_ids.append(data[62]) # for saving the predictions

    tabular_list.append(data[:62]) ## locations of tabular data
    if int(data[65]) == 1:  ## location of categorical proxy variable
      list_proxies.append([1,0,0])
    elif int(data[65]) == 2:
      list_proxies.append([0,1,0])
    elif int(data[65]) == 3:
      list_proxies.append([0,0,1])

    list_review_diff.append(data[66])  ## location of cont. review diff variable

    list_joint_description.append(dict_des[str(data[62])])

    list_distilbart_name.append(dict_titles_llama_lora_2[str(data[62])])    # only change here!

  list_images  = torch.tensor(list_images, dtype=torch.float32)
  list_joint_description = torch.tensor(list_joint_description, dtype=torch.float32)
  list_distilbart_name = torch.tensor(list_distilbart_name, dtype=torch.float32)
  tabular_list = torch.tensor(tabular_list, dtype=torch.float32)
  list_proxies = torch.tensor(list_proxies, dtype=torch.float32)

  return list_images, list_distilbart_name, list_joint_description, tabular_list, list_proxies, list_review_diff, list_ids


# images, names, des, tabular data, proxies, review_diff, ids


### Calculating counterfactuals for LLAMA (LoRa) generated titles

In [None]:
dl_counterfactuals_llama_lora_v2 = DataLoader(conterfac_dataset, collate_fn=collate_batch_counterfactual_llama_lora_v2, batch_size=32, shuffle=False)


In [None]:
Ys, preds_, ids = get_counterfactuals(model_cont, dl_counterfactuals_llama_lora_v2, True)
preds = [a[0] for a in preds_]

  tabular_list = torch.tensor(tabular_list, dtype=torch.float32)
  name = torch.tensor(batch[1]).to(device)
  joint_des = torch.tensor(batch[2]).to(device)
  encoded_des = torch.tensor(des_encoded)
  encoded_name = torch.tensor(name_encoded)


In [None]:
cf_llama_lora_v2 = pd.DataFrame({"id": ids, "pred": preds, "Y": Ys})
#cf_llama_lora.to_csv("/content/gdrive/My Drive/Thesis/Discriminator_Predictions/cf_distilbart.csv", index = False)

In [None]:
df_all_preds_llama_lora_v2 = pd.merge(airbnb_london_filtered_images_imp_var_titles_with_llama_lora_v2[["id", "proxy"]], cf_llama_lora_v2, how = "left", on = "id")


In [None]:
df_all_preds_llama_lora_v2 = pd.merge(df_all_preds_llama_lora_v2, pred_disc_2_cont_real_name[["id", "pred"]], on = "id", how = "inner")

#### Checking differences of preidcted review diff based on true titles and predicted review diff based on generated titles

In [None]:
###  ## x is is generated  tittle, y is true title


np.mean(df_all_preds_llama_lora_v2[df_all_preds_llama_lora_v2.proxy != 3].pred_x - df_all_preds_llama_lora_v2[df_all_preds_llama_lora_v2.proxy != 3].pred_y)


0.03048567114083427

Also checking result of permutation test based on proxy value

In [None]:
permutation_test(df_all_preds_llama_lora_v2[df_all_preds_llama_lora_v2.proxy != 3].pred_x, df_all_preds_llama_lora_v2[df_all_preds_llama_lora_v2.proxy != 3].pred_y, 1000)


0.123

### LLama-Adapter

In [None]:
gen_titles_llama_adapter_v2 = pd.read_csv("/content/gdrive/My Drive/Thesis/loss_data/gen_titles_llama_adapter_v2.csv")


In [None]:
# as I have fewer generated titles with LLaMa LLora
airbnb_london_filtered_images_imp_var_titles_with_llama_adapter_v2 = pd.merge(airbnb_london_filtered_images_imp_var_titles,  gen_titles_llama_adapter_v2, how = "inner", on = "id")
conterfac_dataset  = Simple_Dataset(airbnb_london_filtered_images_imp_var_titles_with_llama_adapter_v2)


In [None]:
def collate_batch_counterfactual_llama_adapter_v2(batch):


  """
  Idx_model refers to the location of the genrated names for a specific model
  """

  list_images = []   # for the images passed through extractor function
  tabular_list = []
  list_proxies = []
  list_review_diff = []
  list_llama_adapter_name = []
  list_joint_description = []
  list_ids = []

  for data in batch:

    # indexing the image pixels form the dict
    list_images.append(dict_images[str(data[62])])

    list_ids.append(data[62]) # for saving the predictions

    tabular_list.append(data[:62]) ## locations of tabular data
    if int(data[65]) == 1:  ## location of categorical proxy variable
      list_proxies.append([1,0,0])
    elif int(data[65]) == 2:
      list_proxies.append([0,1,0])
    elif int(data[65]) == 3:
      list_proxies.append([0,0,1])

    list_review_diff.append(data[66])  ## location of cont. review diff variable

    list_joint_description.append(dict_des[str(data[62])])

    list_llama_adapter_name.append(dict_titles_llama_adapter_2[str(data[62])])    # only change here!

  list_images  = torch.tensor(list_images, dtype=torch.float32)
  list_joint_description = torch.tensor(list_joint_description, dtype=torch.float32)
  list_llama_adapter_name = torch.tensor(list_llama_adapter_name, dtype=torch.float32)
  tabular_list = torch.tensor(tabular_list, dtype=torch.float32)
  list_proxies = torch.tensor(list_proxies, dtype=torch.float32)

  return list_images, list_llama_adapter_name, list_joint_description, tabular_list, list_proxies, list_review_diff, list_ids


# images, names, des, tabular data, proxies, review_diff, ids


#### Calculating counterfactuals for LLAMA-Adapter generated titles


In [None]:
dl_counterfactuals_llama_adapter_v2 = DataLoader(conterfac_dataset, collate_fn=collate_batch_counterfactual_llama_adapter_v2, batch_size=32, shuffle=False)


In [None]:
Ys, preds_, ids = get_counterfactuals(model_cont, dl_counterfactuals_llama_adapter_v2, True)
preds = [a[0] for a in preds_]

  name = torch.tensor(batch[1]).to(device)
  joint_des = torch.tensor(batch[2]).to(device)
  encoded_des = torch.tensor(des_encoded)
  encoded_name = torch.tensor(name_encoded)


In [None]:
cf_llama_adapter_v2 = pd.DataFrame({"id": ids, "pred": preds, "Y": Ys})


In [None]:
df_all_preds_llama_adapter_v2 = pd.merge(airbnb_london_filtered_images_imp_var_titles_with_llama_adapter_v2[["id", "proxy"]], cf_llama_adapter_v2, how = "left", on = "id")


In [None]:
df_all_preds_llama_adapter_v2 = pd.merge(df_all_preds_llama_adapter_v2, pred_disc_2_cont_real_name[["id", "pred"]], on = "id", how = "inner")

#### Checking differences of preidcted review diff based on true titles and predicted review diff based on generated titles

In [None]:
###  ## x is is generated  tittle, y is true title

np.mean(df_all_preds_llama_adapter_v2[df_all_preds_llama_adapter_v2.proxy != 3].pred_x - df_all_preds_llama_adapter_v2[df_all_preds_llama_adapter_v2.proxy != 3].pred_y)


0.027554424955467793

In [None]:
permutation_test(df_all_preds_llama_adapter_v2[df_all_preds_llama_adapter_v2.proxy != 3].pred_x, df_all_preds_llama_adapter_v2[df_all_preds_llama_adapter_v2.proxy != 3].pred_y, 1000)


0.182

### LLama (no PEFT)

In [None]:
gen_titles_llama_no_peft_v2 = pd.read_csv("/content/gdrive/My Drive/Thesis/loss_data/gen_titles_llama_no_peft_v2.csv")



In [None]:
# as I have fewer generated titles with LLaMa LLora
airbnb_london_filtered_images_imp_var_titles_with_llama_no_peft_v2 = pd.merge(airbnb_london_filtered_images_imp_var_titles,  gen_titles_llama_no_peft_v2, how = "inner", on = "id")
conterfac_dataset  = Simple_Dataset(airbnb_london_filtered_images_imp_var_titles_with_llama_no_peft_v2)


In [None]:
def collate_batch_counterfactual_llama_no_peft_v2(batch):


  """
  Idx_model refers to the location of the genrated names for a specific model
  """

  list_images = []   # for the images passed through extractor function
  tabular_list = []
  list_proxies = []
  list_review_diff = []
  list_llama_adapter_name = []
  list_joint_description = []
  list_ids = []

  for data in batch:

    # indexing the image pixels form the dict
    list_images.append(dict_images[str(data[62])])

    list_ids.append(data[62]) # for saving the predictions

    tabular_list.append(data[:62]) ## locations of tabular data
    if int(data[65]) == 1:  ## location of categorical proxy variable
      list_proxies.append([1,0,0])
    elif int(data[65]) == 2:
      list_proxies.append([0,1,0])
    elif int(data[65]) == 3:
      list_proxies.append([0,0,1])

    list_review_diff.append(data[66])  ## location of cont. review diff variable

    list_joint_description.append(dict_des[str(data[62])])

    list_llama_adapter_name.append(dict_titles_llama_no_peft_2[str(data[62])])    # only change here!

  list_images  = torch.tensor(list_images, dtype=torch.float32)
  list_joint_description = torch.tensor(list_joint_description, dtype=torch.float32)
  list_llama_adapter_name = torch.tensor(list_llama_adapter_name, dtype=torch.float32)
  tabular_list = torch.tensor(tabular_list, dtype=torch.float32)
  list_proxies = torch.tensor(list_proxies, dtype=torch.float32)

  return list_images, list_llama_adapter_name, list_joint_description, tabular_list, list_proxies, list_review_diff, list_ids


# images, names, des, tabular data, proxies, review_diff, ids


### Calculating counterfactuals for LLAMA generated titles (no peft)

In [None]:
dl_counterfactuals_llama_no_peft_v2 = DataLoader(conterfac_dataset, collate_fn=collate_batch_counterfactual_llama_no_peft_v2, batch_size=32, shuffle=False)


In [None]:
Ys, preds_, ids = get_counterfactuals(model_cont, dl_counterfactuals_llama_no_peft_v2, True)
preds = [a[0] for a in preds_]

  name = torch.tensor(batch[1]).to(device)
  joint_des = torch.tensor(batch[2]).to(device)
  encoded_des = torch.tensor(des_encoded)
  encoded_name = torch.tensor(name_encoded)


In [None]:
cf_llama_no_peft_v2 = pd.DataFrame({"id": ids, "pred": preds, "Y": Ys})


In [None]:
df_all_preds_llama_no_peft_v2 = pd.merge(airbnb_london_filtered_images_imp_var_titles_with_llama_no_peft_v2[["id", "proxy"]], cf_llama_no_peft_v2, how = "left", on = "id")


In [None]:
df_all_preds_llama_no_peft_v2 = pd.merge(df_all_preds_llama_no_peft_v2, pred_disc_2_cont_real_name[["id", "pred"]], on = "id", how = "inner")

#### Checking differences of preidcted review diff based on true titles and predicted review diff based on generated titles

In [None]:
###  ## x is is generated  tittle, y is true title

np.mean(df_all_preds_llama_no_peft_v2[df_all_preds_llama_no_peft_v2.proxy != 3].pred_x - df_all_preds_llama_no_peft_v2[df_all_preds_llama_no_peft_v2.proxy != 3].pred_y)


0.010117032491044224

In [None]:
permutation_test(df_all_preds_llama_no_peft_v2[df_all_preds_llama_no_peft_v2.proxy != 3].pred_x, df_all_preds_llama_no_peft_v2[df_all_preds_llama_no_peft_v2.proxy != 3].pred_y, 1000)


0.603

# Joining counterfactuals and titles

In [None]:
# renaming

df_all_preds_distilbart = df_all_preds_distilbart.rename(columns={'pred':'pred_distilbart'})
df_all_preds_peg = df_all_preds_peg.rename(columns={'pred':'pred_pegasus'})
df_all_preds_bart = df_all_preds_bart.rename(columns={'pred':'pred_bart'})

df_all_preds_llama_lora_v2 = df_all_preds_llama_lora_v2.rename(columns={'pred_x':'pred_llama_lora_v2'})
df_all_preds_llama_adapter_v2 = df_all_preds_llama_adapter_v2.rename(columns={'pred_x':'pred_llama_adapter_v2'})
df_all_preds_llama_no_peft_v2 = df_all_preds_llama_no_peft_v2.rename(columns={'pred_x':'pred_llama_no_peft_v2'})

df_all_preds_llama_lora = df_all_preds_llama_lora.rename(columns={'pred_x':'pred_llama_lora'})
df_all_preds_llama_adapter = df_all_preds_llama_adapter.rename(columns={'pred_x':'pred_llama_adapter'})
df_all_preds_llama_no_peft = df_all_preds_llama_no_peft.rename(columns={'pred_x':'pred_llama_no_peft'})

gen_titles_llama_lora_v2 = gen_titles_llama_lora_v2.rename(columns={'gen_title':'gen_title_llama_lora_v2'})
gen_titles_llama_adapter_v2 = gen_titles_llama_adapter_v2.rename(columns={'gen_titles':'gen_titles_llama_adapter_v2'})
gen_titles_llama_no_peft_v2 = gen_titles_llama_no_peft_v2.rename(columns={'gen_title':'gen_titles_llama_no_peft_v2'})


gen_titles_llama_lora = gen_titles_llama_lora.rename(columns={'gen_title':'gen_title_llama_lora'})
gen_titles_llama_adapter = gen_titles_llama_adapter.rename(columns={'gen_titles':'gen_titles_llama_adapter'})
gen_titles_llama_no_peft = gen_titles_llama_no_peft.rename(columns={'gen_title':'gen_titles_llama_no_peft'})


In [None]:
## merging couterfactual predictions

merge_prep = pd.merge(df_all_preds_distilbart[["id", "pred_distilbart", "proxy"]], df_all_preds_peg[["id", "pred_pegasus"]], how = "inner", on  = "id")
merge_prep = pd.merge(merge_prep,df_all_preds_bart[["id", "pred_bart"]], how = "inner", on  = "id")


merge_prep = pd.merge(merge_prep, df_all_preds_llama_lora[["id", "pred_llama_lora"]], how = "inner", on  = "id")
merge_prep = pd.merge(merge_prep, df_all_preds_llama_adapter[["id", "pred_llama_adapter"]], how = "inner", on  = "id")
merge_prep = pd.merge(merge_prep, df_all_preds_llama_no_peft[["id", "pred_llama_no_peft"]], how = "inner", on  = "id")

merge_prep = pd.merge(merge_prep, df_all_preds_llama_lora_v2[["id", "pred_llama_lora_v2"]], how = "inner", on  = "id")
merge_prep = pd.merge(merge_prep, df_all_preds_llama_adapter_v2[["id", "pred_llama_adapter_v2"]], how = "inner", on  = "id")
merge_prep = pd.merge(merge_prep, df_all_preds_llama_no_peft_v2[["id", "pred_llama_no_peft_v2"]], how = "inner", on  = "id")

merge_prep = pd.merge(merge_prep, airbnb_london_filtered_images_imp_var[["id", "name"]], how = "inner", on  = "id")

## merging together with generated titles using summarization methods
merge_prep = pd.merge(merge_prep, gen_titles_summ_models[['id','gen_titles_distilbart', 'gen_titles_bart', 'gen_titles_pegasus']], how = "inner", on  = "id")

## merging together with llama geenerated titles
merge_prep = pd.merge(merge_prep, gen_titles_llama_lora[["id", "gen_title_llama_lora"]], how = "inner", on  = "id")
merge_prep = pd.merge(merge_prep, gen_titles_llama_no_peft[["id", "gen_titles_llama_no_peft"]], how = "inner", on  = "id")
merge_prep = pd.merge(merge_prep, gen_titles_llama_adapter[["id", "gen_titles_llama_adapter"]], how = "inner", on  = "id")

merge_prep = pd.merge(merge_prep, gen_titles_llama_lora_v2[["id", "gen_title_llama_lora_v2"]], how = "inner", on  = "id")
merge_prep = pd.merge(merge_prep, gen_titles_llama_no_peft_v2[["id", "gen_titles_llama_no_peft_v2"]], how = "inner", on  = "id")


merged_counterfactuals_titles = pd.merge(merge_prep, gen_titles_llama_adapter_v2[["id", "gen_titles_llama_adapter_v2"]], how = "inner", on  = "id")


In [None]:
# saving to csv
merged_counterfactuals_titles.to_csv("/content/gdrive/My Drive/Thesis/Discriminator_Predictions/merged_counterfactuals_titles.csv", index = False)


#### Finally, printing examples to be added to the apppendix of the thesis:

In [None]:
from pandas import DataFrame
from IPython.display import HTML

In [73]:
merged_counterfactuals_titles = pd.read_csv("/content/gdrive/My Drive/Thesis/Discriminator_Predictions/merged_counterfactuals_titles.csv")
airbnb_london_filtered_images_imp_var = pd.read_csv("/content/gdrive/My Drive/Thesis/London_Data/airbnb_london_filtered_images_counterfactual_prep.csv")


In [74]:

merged_counterfactuals_titles_imp = merged_counterfactuals_titles[['id','name', 'gen_titles_distilbart', 'gen_titles_bart',
       'gen_titles_pegasus', 'gen_titles_llama_lora',
       'gen_titles_llama_adapter', 'gen_titles_llama_no_peft',
       'gen_title_llama_lora_v2', 'gen_titles_llama_no_peft_v2',
       'gen_titles_llama_adapter_v2']]

data_examples = pd.merge(merged_counterfactuals_titles_imp, airbnb_london_filtered_images_imp_var[['id', 'joint_description']], on = "id", how = "left").sample(n=1, axis = 0, random_state = 101)


In [76]:
from pandas import DataFrame
from IPython.display import HTML
HTML(data_examples.to_html())

Unnamed: 0,id,name,gen_titles_distilbart,gen_titles_bart,gen_titles_pegasus,gen_titles_llama_lora,gen_titles_llama_adapter,gen_titles_llama_no_peft,gen_title_llama_lora_v2,gen_titles_llama_no_peft_v2,gen_titles_llama_adapter_v2,joint_description
951,10807880,Double bedroom in a cosy house with garden,Cosy double bedroom in peaceful house,Cosy double bedroom in a 2 bedroom house,Cosy and warm double bedroom,"Cozy, warm and comfy double bedroom",Cosy double bedroom in a 2 bedroom house,Victoria Station Tranquil House,Peaceful and Tranquil Double Bedroom in Central London.,"Victoria Station, Tranquil Area, Private Garden.",Cozy and Warm Double Bedroom in a Tranquil Area with Nature Reserve Nearby.,"25 minute connection to Central London, Victoria station. Cosy and warm double bedroom. Two bedroom house located in a peaceful and tranquil area with nature reserve within 3 minute walk. Private garden, living room, 2 separate toilets and bathroom. We are social and like having chats, but if you don’t we don’t mind and leave you to relax. Guest access Access to garden as well. Other things to note We have a medium sized dog with us. But, he is friendly and calm. We also have a 2 year old boy. But, very sociable and playful. . Close to nature reserve woods and moated manor on one side. But on the other side the train station within 12 minutes walk and also a shopping centre from station another 4 minutes walk."
