# 0. Imports

In [None]:
#!pip install ceml
!pip install geonamescache
!pip install pyswarms

Collecting geonamescache
[?25l  Downloading https://files.pythonhosted.org/packages/ab/ba/b7939087621bfeb24c0f52c4b879865a9f902cda72efd119f4275400e692/geonamescache-1.2.0-py3-none-any.whl (2.6MB)
[K     |████████████████████████████████| 2.6MB 10.1MB/s 
[?25hInstalling collected packages: geonamescache
Successfully installed geonamescache-1.2.0
Collecting pyswarms
[?25l  Downloading https://files.pythonhosted.org/packages/d1/fd/5c2baba82425b75baf7dbec5af57219cd252aa8a1ace4f5cd1d88e472276/pyswarms-1.3.0-py2.py3-none-any.whl (104kB)
[K     |████████████████████████████████| 112kB 15.7MB/s 
Installing collected packages: pyswarms
Successfully installed pyswarms-1.3.0


In [None]:
import csv
import math
import numpy as np
import pandas as pd
import torch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
import torchvision
import torchvision.transforms as transforms
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR
from google.colab import drive
from tqdm import tqdm
from random import random, seed, shuffle
from sklearn.neighbors import radius_neighbors_graph
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from geopy.geocoders import Nominatim
from copy import deepcopy
from sklearn.tree import DecisionTreeRegressor
import pyswarms as ps
from pyswarms.utils.functions import single_obj as fx

seed(1)
torch.manual_seed(1)

## 0.1 Intermediate results
Load intermediate results if not running the entire notebook

In [None]:
# Mount Google Drive and optionally reload artefacts saved by earlier runs,
# so later sections can be executed without re-running the whole notebook.
drive.mount('/content/gdrive')

# Switches selecting which saved artefacts to reload.
load_data = True
load_models = True
load_M = True
load_transforms = True
load_augmented_instances = True
experiment_number = 2

folder_path = F"/content/gdrive/My Drive/Auto/"
data1_fp = folder_path+'data1.npy'
data2_fp = folder_path+'data2.npy'
model1_input_fp = folder_path+'model1_input.npy'
model2_input_fp = folder_path+'model2_input.npy'
model1_fp = folder_path + 'model1.h5'
model2_fp = folder_path + 'model2.h5'

if experiment_number == 1:
  m_fp = folder_path + 'M.npz'
  # No augmented-instances archive is configured for experiment 1. Define the
  # name anyway so the guarded load below cannot raise NameError.
  augmented_instances_fp = None
else:
  m_fp = folder_path + 'M2.npz'
  augmented_instances_fp = folder_path + 'Augmented_instances2.npz'

if load_data:
  data1 = np.load(data1_fp)
  data2 = np.load(data2_fp)
if load_transforms:
  model1_input = np.load(model1_input_fp)
  model2_input = np.load(model2_input_fp)
# NOTE(security): allow_pickle=True executes pickled code on load; only use it
# with archives produced by this notebook.
if load_M:
  M_ = np.load(m_fp, allow_pickle=True)
  M = [M_['arr_0'], M_['arr_1']]
if load_augmented_instances and augmented_instances_fp is not None:
  augmented_instances_ = np.load(augmented_instances_fp, allow_pickle=True)
  augmented_instances = [augmented_instances_['arr_0'], augmented_instances_['arr_1']]

Mounted at /content/gdrive


# 1. Load & Adjust Datasets

## 1.1 Load data and basic cleaning

Two different datasets under the same domain, with the same label, and overlapping features.

Dataset 1 is collected from AutoExport.com, and aggregated here:
https://www.kaggle.com/doaaalsenani/usa-cers-dataset

Dataset 2 is collected from Craigslist.com, and is aggregated here:
https://www.kaggle.com/austinreese/craigslist-carstrucks-data


In [None]:
# Location and file names of the raw CSV exports on the mounted Google Drive.
folder_path = F"/content/gdrive/My Drive/Auto/"
dataset1_save_name = 'Dataset1.csv'
dataset2_save_name = 'Dataset2.csv'
# Re-mounting only prints an "already mounted" notice when the drive is
# already available (see the cell output).
drive.mount('/content/gdrive')

dataset1 = pd.read_csv(folder_path + dataset1_save_name)
dataset2 = pd.read_csv(folder_path + dataset2_save_name)

# Drop nominal or redundant data, then drop every row that still has a
# missing value.
dataset1 = dataset1.drop(['vin', 'lot', 'condition', 'Unnamed: 0'], axis=1).dropna()
# (An earlier variant of the next line dropped 'state'/'region' and kept
# 'lat'/'long' instead.)
dataset2 = dataset2.drop(['id', 'county', 'VIN', 'image_url', 'description', 'lat', 'long', 'url', 'region_url'], axis=1).dropna()
# Drop unlabeled entries (rows whose price is not strictly positive)
dataset1 = dataset1[dataset1['price'] > 0]
dataset2 = dataset2[dataset2['price'] > 0]

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


2021-06-04 01:29:18,663 - numexpr.utils - INFO - NumExpr defaulting to 4 threads.


In [None]:
dataset1.head()

Unnamed: 0,price,brand,model,year,title_status,mileage,color,state,country
0,6300,toyota,cruiser,2008,clean vehicle,274117.0,black,new jersey,usa
1,2899,ford,se,2011,clean vehicle,190552.0,silver,tennessee,usa
2,5350,dodge,mpv,2018,clean vehicle,39590.0,silver,georgia,usa
3,25000,ford,door,2014,clean vehicle,64146.0,blue,virginia,usa
4,27700,chevrolet,1500,2018,clean vehicle,6654.0,red,florida,usa


In [None]:
dataset2.head()

Unnamed: 0,region,price,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,state
0,auburn,9500,2003.0,chrysler,town & country,excellent,6 cylinders,gas,30376.0,clean,automatic,fwd,mid-size,mini-van,blue,al
11,auburn,18250,2008.0,toyota,fj cruiser,good,6 cylinders,gas,201300.0,clean,manual,4wd,full-size,offroad,black,al
21,auburn,27995,2019.0,nissan,frontier,like new,6 cylinders,gas,4527.0,clean,automatic,rwd,full-size,truck,red,al
33,auburn,4000,2006.0,jeep,grand cherokee laredo,good,6 cylinders,gas,281000.0,clean,automatic,rwd,mid-size,SUV,black,al
83,auburn,5000,2014.0,honda,civic,good,4 cylinders,gas,170000.0,clean,manual,fwd,compact,coupe,silver,al


## 1.2 Transform Data

### 1.2.1 Dataset 1
___
Create one hot encodings for the categorical & interval data in dataset 1

In [None]:
us_state_abbrev = {
    'alabama': 'al',
    'alaska': 'ak',
    'american samoa': 'as',
    'arizona': 'az',
    'arkansas': 'ar',
    'california': 'ca',
    'colorado': 'co',
    'connecticut': 'ct',
    'delaware': 'de',
    'district of columbia': 'dc',
    'florida': 'fl',
    'georgia': 'ga',
    'guam': 'gu',
    'hawaii': 'hi',
    'idaho': 'id',
    'illinois': 'il',
    'indiana': 'in',
    'iowa': 'ia',
    'kansas': 'ks',
    'kentucky': 'ky',
    'louisiana': 'la',
    'maine': 'me',
    'maryland': 'md',
    'massachusetts': 'ma',
    'michigan': 'mi',
    'minnesota': 'mn',
    'mississippi': 'ms',
    'missouri': 'mo',
    'montana': 'mt',
    'nebraska': 'ne',
    'nevada': 'nv',
    'new hampshire': 'nh',
    'new jersey': 'nj',
    'new mexico': 'nm',
    'new york': 'ny',
    'north carolina': 'nc',
    'north dakota': 'nd',
    'northern mariana islands':'mp',
    'ohio': 'oh',
    'oklahoma': 'ok',
    'oregon': 'or',
    'pennsylvania': 'pa',
    'puerto rico': 'pr',
    'rhode island': 'ri',
    'south carolina': 'sc',
    'south dakota': 'sd',
    'tennessee': 'tn',
    'texas': 'tx',
    'utah': 'ut',
    'vermont': 'vt',
    'virgin islands': 'vi',
    'virginia': 'va',
    'washington': 'wa',
    'west virginia': 'wv',
    'wisconsin': 'wi',
    'wyoming': 'wy',
    'alberta'	: 'ab',
    'british columbia' : 'bc',
    'manitoba' :	'mb',
    'new brunswick'	: 'nb',
    'newfoundland and labrador'	: 'nl',
    'northwest territories'	: 'nt',
    'nova scotia' :	'ns',
    'nunavut':'nu',
    'ontario':'on',
    'prince edward island':'pe',
    'quebec':'qc',
    'saskatchewan':'sk',
    'yukon':'yt'
}

# Map full state/province names to their two-letter codes. Values that are
# already codes pass through unchanged so this cell can be re-run safely; any
# other unknown value still raises KeyError.
_state_codes = set(us_state_abbrev.values())
dataset1.state = dataset1.state.apply(
    lambda x: x if x in _state_codes else us_state_abbrev[x])

# State to latitude and longitude
# This is something to consider. See paper for justification of one-hot encoding vs quantitative variables
'''g = Nominatim()
lat = []
lon = []
state_to_lat_lon = {}
for index, row in tqdm(dataset1.iterrows()):
  state = row.state
  if state not in state_to_lat_lon.keys():
    gc = g.geocode(state)
    state_to_lat_lon[state] = (gc[1][0], gc[1][1])
  lat.append(state_to_lat_lon[state][0])
  lon.append(state_to_lat_lon[state][1])'''

# Descritize categorical & interval data
def descritize(col_name, var_to_one_hot=None, df=None):
  """Map every value of a column to an integer category index.

  Args:
    col_name: name of the column to encode.
    var_to_one_hot: optional existing value -> index vocabulary. It is
      extended in place with values not seen before, so a vocabulary built
      on one dataset can be reused for another.
    df: DataFrame to read the column from. Defaults to the notebook-level
      `dataset1`, which is what this function used before the parameter
      existed.

  Returns:
    A tuple `(indices, var_to_one_hot)`: the per-row category indices in row
    order, and the (possibly extended) vocabulary.
  """
  if var_to_one_hot is None:
    var_to_one_hot = {}
  if df is None:
    df = dataset1
  indices = []
  # Iterating the column directly avoids building a Series per row, which is
  # what made the iterrows() version slow on large frames.
  for var in df[col_name]:
    if var not in var_to_one_hot:
      # New values get the next free index.
      var_to_one_hot[var] = len(var_to_one_hot)
    indices.append(var_to_one_hot[var])
  return indices, var_to_one_hot

def one_hot(row_vals, val_to_index):
  """Expand category indices into one-hot rows.

  Args:
    row_vals: list of integer category indices, one per sample.
    val_to_index: value -> index vocabulary; its keys become the header row.

  Returns:
    A list of lists. The first element is the list of vocabulary keys; each
    following element is a 0/1 list with a single 1 at that sample's index.
  """
  labels = list(val_to_index.keys())
  width = len(labels)
  encoded = [labels]
  for category in row_vals:
    vector = [0] * width
    vector[category] = 1
    encoded.append(vector)
  return encoded

# Build one integer-index vocabulary per categorical/interval column of
# dataset 1. Each call returns (per-row indices, value -> index dict).
# Discretize state
state_desc, state_to_desc = descritize('state')
# Discretize country
country_desc, country_to_desc = descritize('country')
# Discretize color
color_desc, color_to_desc = descritize('color')
# Discretize title_status
title_status_desc, title_status_to_desc = descritize('title_status')
# Discretize year
year_desc, year_to_desc = descritize('year')
# Discretize make (the column is called 'brand' in dataset 1)
make_desc, make_to_desc = descritize('brand')
# Discretize model
model_desc, model_to_desc = descritize('model')

# Turn each index list into a header row plus one 0/1 row per sample.
# One hot encode states
state_one_hot = one_hot(state_desc, state_to_desc)
# One hot encode country
country_one_hot = one_hot(country_desc, country_to_desc)
# One hot encode color
color_one_hot = one_hot(color_desc, color_to_desc)
# One hot encode title_status
title_status_one_hot = one_hot(title_status_desc, title_status_to_desc)
# One hot encode year
year_one_hot = one_hot(year_desc, year_to_desc)
# One hot encode brand
make_one_hot = one_hot(make_desc, make_to_desc)
# One hot encode model
model_one_hot = one_hot(model_desc, model_to_desc)

# Scale mileage by the column maximum (values land in [0, 1] assuming no
# negative mileage).
dataset1.mileage = dataset1.mileage / dataset1.mileage.max()

### 1.2.2 Dataset 2
___
Process dataset 2 in the same way

In [None]:
# State to latitude and longitude
# This is something to consider. See paper for justification of one-hot encoding vs quantitative variables
# Descritize categorical & interval data
def descritize(col_name, var_to_one_hot=None, df=None):
  """Map every value of a column to an integer category index.

  Args:
    col_name: name of the column to encode.
    var_to_one_hot: optional existing value -> index vocabulary. It is
      extended in place with values not seen before, so a vocabulary built
      on one dataset can be reused for another.
    df: DataFrame to read the column from. Defaults to the notebook-level
      `dataset2`, which is what this definition used before the parameter
      existed.

  Returns:
    A tuple `(indices, var_to_one_hot)`: the per-row category indices in row
    order, and the (possibly extended) vocabulary.
  """
  if var_to_one_hot is None:
    var_to_one_hot = {}
  if df is None:
    df = dataset2
  indices = []
  # Iterating the column directly avoids building a Series per row, which is
  # what made the iterrows() version slow on large frames.
  for var in df[col_name]:
    if var not in var_to_one_hot:
      # New values get the next free index.
      var_to_one_hot[var] = len(var_to_one_hot)
    indices.append(var_to_one_hot[var])
  return indices, var_to_one_hot

# Build the category indices for dataset 2. The *_desc / *_to_desc names are
# the same ones used for dataset 1 and are overwritten here.
# Discretize state (fresh vocabulary, not shared with dataset 1)
state_desc, state_to_desc = descritize('state')
# Discretize condition
condition_desc, condition_to_desc = descritize('condition')
# Discretize cylinders
cylinders_desc, cylinders_to_desc = descritize('cylinders')
# Discretize region
region_desc, region_to_desc = descritize('region')
# Discretize fuel
fuel_desc, fuel_to_desc = descritize('fuel')
# Discretize transmission
transmission_desc, transmission_to_desc = descritize('transmission')
# Discretize drive
drive_desc, drive_to_desc = descritize('drive')
# Discretize size
size_desc, size_to_desc = descritize('size')
# Discretize type
type_desc, type_to_desc = descritize('type')
# The next four calls pass in the vocabulary built on dataset 1, so values
# present in both datasets keep the same index and new values are appended.
# Discretize color
color_desc, color_to_desc = descritize('paint_color', color_to_desc)
# Discretize title_status
title_status_desc, title_status_to_desc = descritize('title_status', title_status_to_desc)
# Discretize year
year_desc, year_to_desc = descritize('year', year_to_desc)
# Discretize make
make_desc, make_to_desc = descritize('manufacturer', make_to_desc)
# Discretize model (fresh vocabulary, not shared with dataset 1)
model_desc, model_to_desc = descritize('model')

# Expand every dataset 2 index list into a header row plus one 0/1 row per
# sample; results are stored under *2_one_hot names.
# One hot encode states
state2_one_hot = one_hot(state_desc, state_to_desc)
# One hot encode color
color2_one_hot = one_hot(color_desc, color_to_desc)
# One hot encode title_status
title_status2_one_hot = one_hot(title_status_desc, title_status_to_desc)
# One hot encode year
year2_one_hot = one_hot(year_desc, year_to_desc)
# One hot encode brand
make2_one_hot = one_hot(make_desc, make_to_desc)
# One hot encode model
model2_one_hot = one_hot(model_desc, model_to_desc)
# One hot encode transmission
transmission2_one_hot = one_hot(transmission_desc, transmission_to_desc)
# One hot encode fuel
fuel2_one_hot = one_hot(fuel_desc, fuel_to_desc)
# One hot encode size
size2_one_hot = one_hot(size_desc, size_to_desc)
# One hot encode type
type2_one_hot = one_hot(type_desc, type_to_desc)
# One hot encode condition
condition2_one_hot = one_hot(condition_desc, condition_to_desc)
# One hot encode cylinders
cylinders2_one_hot = one_hot(cylinders_desc, cylinders_to_desc)
# One hot encode region
region2_one_hot = one_hot(region_desc, region_to_desc)
# One hot encode drive
drive2_one_hot = one_hot(drive_desc, drive_to_desc)

# Scale the odometer reading by the column maximum.
dataset2.odometer = dataset2.odometer / dataset2.odometer.max()

In [None]:
# Remove less popular types (otherwise memory crashes, network bulky)
MAX_CATEGORIES = 128  # widest one-hot encoding kept per feature


def _drop_unpopular_columns(encoding, max_categories=MAX_CATEGORIES):
  """Trim a one-hot encoding in place to its most frequent columns.

  Args:
    encoding: list of lists as produced by `one_hot` -- a header row of
      labels followed by one 0/1 row per sample. Rows are modified in place
      so existing references to the encoding see the trimmed version.
    max_categories: number of columns to keep; encodings that are not wider
      than this are left untouched.
  """
  if len(encoding[0]) <= max_categories:
    return
  counts = np.sum(np.asarray(encoding[1:]), axis=0)
  # argsort is ascending, so everything before the last `max_categories`
  # positions is an unpopular column.
  unpopular = set(np.argsort(counts)[:len(counts) - max_categories].tolist())
  for row in encoding:
    # One pass per row instead of one O(width) `del` per removed column.
    row[:] = [value for position, value in enumerate(row)
              if position not in unpopular]


one_hots = [state2_one_hot, color2_one_hot, transmission2_one_hot,
               title_status2_one_hot, year2_one_hot, make2_one_hot,
               model2_one_hot, fuel2_one_hot, size2_one_hot, type2_one_hot,
               condition2_one_hot, cylinders2_one_hot, region2_one_hot,
               drive2_one_hot]
for encoding in one_hots:
  _drop_unpopular_columns(encoding)

### 1.2.3 Add encodings as columns to the dataset

In [None]:
def add_encodings(list_of_encodings, remove_columns, df):
  """Return a copy of `df` with raw columns swapped for one-hot columns.

  Args:
    list_of_encodings: encodings as produced by `one_hot`; each is a header
      row of column labels followed by one 0/1 row per sample.
    remove_columns: labels of the original columns to drop.
    df: source DataFrame; it is not modified.

  Returns:
    A new DataFrame without `remove_columns` and with one column per header
    label. A label that occurs more than once overwrites the earlier column.
  """
  result = df.copy().drop(remove_columns, axis=1)
  for encoding in list_of_encodings:
    # First row holds the labels, the remaining rows hold the 0/1 values.
    header = encoding[0]
    matrix = np.asarray(encoding[1:])
    for position in range(len(header)):
      result[header[position]] = matrix[:, position]
  return result

# Encoded frame for dataset 1: the seven raw categorical columns are dropped
# and replaced by their one-hot columns.
df1 = add_encodings([state_one_hot, country_one_hot, color_one_hot,
               title_status_one_hot, year_one_hot, make_one_hot,
               model_one_hot], ['state', 'country', 'color', 'title_status',
                                'year', 'brand', 'model'], dataset1)
# Encoded frame for dataset 2, built the same way from its fourteen
# categorical columns.
df2 = add_encodings([state2_one_hot, color2_one_hot, transmission2_one_hot,
               title_status2_one_hot, year2_one_hot, make2_one_hot,
               model2_one_hot, fuel2_one_hot, size2_one_hot, type2_one_hot,
               condition2_one_hot, cylinders2_one_hot, region2_one_hot,
               drive2_one_hot], ['state', 'paint_color', 'title_status',
                                 'transmission', 'year', 'manufacturer',
                                 'model', 'fuel', 'size', 'type', 'condition',
                                 'cylinders', 'region', 'drive'], dataset2)

View dataset headers

In [None]:
df1.head()

Unnamed: 0,price,mileage,nj,tn,ga,va,fl,tx,ca,nc,oh,ny,pa,sc,mi,wa,az,ky,ma,ne,on,mo,mn,ct,ar,co,il,ms,md,ut,wi,ok,or,in,wv,nv,ks,ri,la,al,...,fusion,mustang,passenger,volt,spark,cruze,ld,journey,transit,ranger,taurus,max,energi,expedition,bus,ecosport,f-750,d,dr,hybrid,suv,connect,f-650,sentra,altima,frontier,rogue,maxima,versa,note,armada,pathfinder,titan,sedan,juke,murano,xterra,kicks,xd,nvp
0,6300,0.269287,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2899,0.187194,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,5350,0.038892,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,25000,0.063016,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,27700,0.006537,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
df2.head()

Unnamed: 0,price,odometer,al,ak,az,ar,ca,co,ct,dc,de,fl,ga,hi,id,il,in,ia,ks,ky,la,me,md,ma,mi,mn,ms,mo,mt,nc,ne,nv,nj,nm,ny,nh,nd,oh,ok,or,...,akron / canton,cincinnati,cleveland,dayton / springfield,oklahoma city,tulsa,bend,eugene,medford-ashland,portland,salem,lehigh valley,philadelphia,pittsburgh,scranton / wilkes-barre,rhode island,charleston,greenville / upstate,knoxville,memphis,nashville,austin,dallas / fort worth,el paso,houston,mcallen / edinburg,san antonio,vermont,fredericksburg,norfolk / hampton roads,roanoke,seattle-tacoma,appleton-oshkosh-FDL,eau claire,green bay,madison,milwaukee,fwd,4wd,rwd
0,9500,0.003038,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
11,18250,0.02013,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
21,27995,0.000453,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
33,4000,0.0281,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
83,5000,0.017,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


### 1.2.4 Turn into numpy arrays

___

Facilitates training



In [None]:
# Convert the encoded frames to plain arrays; column 0 is the price label
# (see the df1/df2 headers above).
data1 = df1.values
data2 = df2.values
# np.save appends ".npy", so these files match data1_fp / data2_fp used by
# the interim-results loader.
np.save(folder_path+'data1',data1)
np.save(folder_path+'data2',data2)

# 2. Train Models

For experiment 1, we train two neural networks of different architectures for the binary classification task of determining if the price of a car is more or less than 20k

## 2.1 Define network architecture

For dataset 1 (feature size 275): one dense hidden layer of 512 neurons with a LeakyReLU activation, followed by a single output unit with a sigmoid.

For dataset 2 (feature size 549): one dense hidden layer of 1024 neurons with a LeakyReLU activation, followed by a single output unit with a sigmoid.

In [None]:
# For dataset 1
class Net1(nn.Module):
    """Binary classifier for dataset 1: one hidden layer and a sigmoid output.

    Args:
        in_features: number of input features (275 for the encoded dataset 1).
        hidden_units: width of the hidden layer.
    """

    def __init__(self, in_features=275, hidden_units=512):
        super(Net1, self).__init__()
        self.fc1 = nn.Linear(in_features, hidden_units)
        self.fc2 = nn.Linear(hidden_units, 1)

    def forward(self, x):
        """Return the predicted probability, shape (batch, 1), for input `x`."""
        x = F.leaky_relu(self.fc1(x.float()))
        # The output logit goes straight into the sigmoid, as in Net2. A
        # leaky ReLU here would shrink negative logits towards zero, so the
        # network could hardly ever predict a probability below 0.5.
        return torch.sigmoid(self.fc2(x))

# For dataset 2
class Net2(nn.Module):
    """Binary classifier for dataset 2: one hidden layer and a sigmoid output.

    Args:
        in_features: number of input features (549 for the encoded dataset 2).
        hidden_units: width of the hidden layer.
    """

    def __init__(self, in_features=549, hidden_units=1024):
        super(Net2, self).__init__()
        self.fc1 = nn.Linear(in_features, hidden_units)
        self.fc2 = nn.Linear(hidden_units, 1)

    def forward(self, x):
        """Return the predicted probability, shape (batch, 1), for input `x`."""
        hidden = F.leaky_relu(self.fc1(x.float()))
        return torch.sigmoid(self.fc2(hidden))


def train(model, device, train_loader, test_loader, optimizer, epochs,
          criterion, underperformance_threshold=20):
  """Train `model` with early stopping and restore its best weights.

  Args:
    model: network to train; it is updated in place.
    device: accepted for interface compatibility; not used by this function.
    train_loader: iterable of `(inputs, labels)` batches used for training.
    test_loader: iterable of `(inputs, labels)` batches used for validation.
    optimizer: optimizer already bound to `model`'s parameters.
    epochs: maximum number of epochs.
    criterion: loss called as `criterion(outputs, labels)`.
    underperformance_threshold: stop once more than this many consecutive
      epochs fail to improve on the best validation accuracy.

  Returns:
    `model`, holding the weights of the epoch with the best validation
    accuracy (or its initial weights if accuracy never exceeded 0).
  """
  was_training = model.training
  # Snapshot the weights rather than the whole module; they are loaded back
  # into the caller's model at the end.
  best_state = deepcopy(model.state_dict())
  best_acc = 0.0
  underperformance_counter = 0
  for epoch in range(epochs):
    model.train()
    for inputs, labels in train_loader:
      labels = labels.reshape(-1, 1)

      # zero the parameter gradients
      optimizer.zero_grad()

      # forward + backward + optimize
      outputs = model(inputs)
      loss = criterion(outputs, labels.float())
      loss.backward()
      optimizer.step()

    # Validation: no gradients are needed, so do not build the graph.
    model.eval()
    incorrect = 0.0
    total = 0
    with torch.no_grad():
      for inputs, labels in test_loader:
        labels = labels.reshape(-1, 1)
        outputs = torch.round(model(inputs))
        incorrect += torch.sum(torch.abs(labels - outputs)).item()
        total += outputs.size()[0]
    # An empty validation set counts as zero accuracy instead of dividing by 0.
    val_acc = (total - incorrect) / total if total else 0.0
    print('Validation set accuracy on epoch ', epoch, ': ', val_acc)
    if val_acc > best_acc:
      best_state = deepcopy(model.state_dict())
      best_acc = val_acc
      underperformance_counter = 0
    else:
      underperformance_counter += 1
      if underperformance_counter > underperformance_threshold:
        print('EARLY STOPPING')
        break
  # Rebinding a local name would not reach the caller, so copy the best
  # weights back into the model object that was passed in.
  model.load_state_dict(best_state)
  model.train(was_training)
  return model

## 2.2 Process data into training and testing sets

In [None]:
# Work on copies so the label thresholding below never touches data1/data2.
data1_ = torch.from_numpy(deepcopy(data1))
data2_ = torch.from_numpy(deepcopy(data2))
# Column 0 is the price (label); the remaining columns are features.
X1 = data1_[:,1:]
y1 = data1_[:,0]

X2 = data2_[:,1:]
y2 = data2_[:,0]

# Descritize labels: < or >= $20,000
price_threshold = 20000
for Y in [y1, y2]:
  # Vectorised in-place write: 1 where price >= threshold, else 0.
  Y[:] = (Y >= price_threshold).to(Y.dtype)

X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.33,
                                                        random_state=0)
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.33,
                                                        random_state=0)

batchsize = 32
# lr and gamma are not used by the cells shown in this part of the notebook.
lr = 1.0
gamma = 0.7
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def _make_batches(features, labels, batchsize):
  """Split features and labels into aligned (features, labels) batches."""
  # At least one section, so sets smaller than one batch still work.
  n_batches = max(1, features.shape[0] // batchsize)
  return list(zip(np.array_split(features, n_batches),
                  np.array_split(labels, n_batches)))


trainloader1 = _make_batches(X1_train, y1_train, batchsize)
trainloader2 = _make_batches(X2_train, y2_train, batchsize)
testloader1 = _make_batches(X1_test, y1_test, batchsize)
testloader2 = _make_batches(X2_test, y2_test, batchsize)

In [None]:
# Layout of each loader built above (a plain list of batches):
# [(data, label), (data, label), (data, label), ...]

In [None]:
# One network per dataset, each with its own Adam optimizer and loss.
model1 = Net1()
model2 = Net2()

# model1 uses a learning rate 100x smaller than model2.
optimizer1 = optim.Adam(model1.parameters(), lr=0.00001)
optimizer2 = optim.Adam(model2.parameters(), lr=0.001)

# Binary cross-entropy on the sigmoid outputs of the networks.
criterion1 = nn.BCELoss()
criterion2 = nn.BCELoss()

## 2.3 Train models

In [None]:
train(model1, device, trainloader1, testloader1, optimizer1, 1000, criterion1,
      underperformance_threshold=100)

Validation set accuracy on epoch  0 :  tensor(0.4599, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  1 :  tensor(0.4883, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  2 :  tensor(0.5203, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  3 :  tensor(0.5413, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  4 :  tensor(0.5524, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  5 :  tensor(0.5549, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  6 :  tensor(0.5573, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  7 :  tensor(0.5586, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  8 :  tensor(0.5672, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  9 :  tensor(0.5672, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation

In [None]:
train(model1, device, trainloader1, testloader1, optimizer1, 1000, criterion1,
      underperformance_threshold=100)

Validation set accuracy on epoch  0 :  tensor(0.5623, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  1 :  tensor(0.5647, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  2 :  tensor(0.5647, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  3 :  tensor(0.5672, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  4 :  tensor(0.5672, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  5 :  tensor(0.5672, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  6 :  tensor(0.5660, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  7 :  tensor(0.5647, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  8 :  tensor(0.5635, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  9 :  tensor(0.5721, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation

In [None]:
train(model2, device, trainloader2, testloader2, optimizer2, 1000, criterion2)

Validation set accuracy on epoch  0 :  tensor(0.9258, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  1 :  tensor(0.9356, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  2 :  tensor(0.9387, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  3 :  tensor(0.9395, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  4 :  tensor(0.9399, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  5 :  tensor(0.9410, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  6 :  tensor(0.9406, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  7 :  tensor(0.9405, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  8 :  tensor(0.9404, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  9 :  tensor(0.9397, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation

In [None]:
train(model2, device, trainloader2, testloader2, optimizer2, 1000, criterion2)

Validation set accuracy on epoch  0 :  tensor(0.9265, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  1 :  tensor(0.9357, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  2 :  tensor(0.9383, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  3 :  tensor(0.9395, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  4 :  tensor(0.9391, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  5 :  tensor(0.9384, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  6 :  tensor(0.9387, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  7 :  tensor(0.9395, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  8 :  tensor(0.9392, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  9 :  tensor(0.9381, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation

## 2.4 Test models

We bring in the test dataset to judge the generalizability of both of the models. Third (test) dataset is a random subset of 10,000 rows of the cargurus used car dataset.

### 2.4.1 Create testset
Import dataset, remove nominal data, drop NaN's

In [None]:
dataset3_save_name = 'Dataset3.csv'

dataset3 = pd.read_csv(folder_path + dataset3_save_name)

# Drop nominal or redundant data
dataset3 = dataset3.drop(['Unnamed: 0', 'vin','back_legroom','bed','bed_height','bed_length','cabin','city_fuel_economy','combine_fuel_economy','daysonmarket','dealer_zip','description','engine_displacement','engine_type','fleet','frame_damaged','franchise_dealer','franchise_make','front_legroom','fuel_tank_volume','has_accidents','height','highway_fuel_economy','horsepower','interior_color','isCab','is_certified','is_cpo','is_new','is_oemcpo','latitude','length','listed_date','listing_color','listing_id','longitude','main_picture_url','major_options','maximum_seating','owner_count','power','savings_amount','seller_rating','sp_id','sp_name','theft_title','torque','transmission_display','trimId','trim_name','vehicle_damage_category','wheel_system_display','wheelbase','width'], axis=1).dropna()
dataset3.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,body_type,city,engine_cylinders,exterior_color,fuel_type,make_name,mileage,model_name,price,salvage,transmission,wheel_system,year
0,SUV / Crossover,Woodbury,I6,Alpine White,Gasoline,BMW,87399.0,X6,16895.0,False,A,AWD,2013
1,Sedan,Bohemia,V8,Gray,Gasoline,Mercedes-Benz,55466.0,E-Class,46529.0,False,A,AWD,2015
2,Sedan,East Hartford,V6,Torred Clearcoat,Gasoline,Dodge,32812.0,Charger,26993.0,False,A,AWD,2017
5,Sedan,Great Neck,V6,Black,Gasoline,Mercedes-Benz,42213.0,E-Class,43495.0,False,A,AWD,2017
6,Sedan,Great Neck,I4,Black,Gasoline,Mercedes-Benz,30850.0,E-Class,29495.0,False,A,AWD,2017


In [None]:
# Conversion dictionaries to make the test set compatible with the train sets
body_type_conversion = {'SUV / Crossover': 'SUV', 'Sedan': 'sedan', 'Pickup Truck': 'pickup', 'Van': 'van', 'Coupe': 'coupe', 'Hatchback': 'hatchback', 'Convertible': 'convertible', 'Minivan': 'mini-van'}
# Known color names from both training sets. Sorted so that substring matching
# against this list gives the same result on every run (set iteration order
# is not stable across interpreter sessions).
colors = sorted(set(dataset1.color.unique().tolist()).union(dataset2.paint_color.unique().tolist()))
# Maps dataset 3 fuel labels to the training-set vocabulary. The key must be
# spelled 'Gasoline' to match the data; a misspelled key left it unmapped.
fuel_conversion = {'Gasoline': 'gas', 'Biodiesel':'other', 'Diesel':'diesel', 'Hybrid': 'hybrid', 'Flex Fuel Vehicle':'other'}
salvage_conversion = {False:'clean', True:'salvage'}
transmission_conversion = {'A': 'automatic', 'CVT':'other', 'M':'manual', 'Dual Clutch': 'other'}
'''AWD and 4WD are different, but grouped together here because they are similar
and this helps the datasets to be compatible'''
wheel_conversion = {'AWD': '4wd', 'FWD': 'fwd', '4WD': '4wd', 'RWD': 'rwd', '4x2':'fwd'}

def descritize(col_name, var_to_one_hot=None, df=None):
  """Map every value of a column to an integer category index.

  Args:
    col_name: name of the column to encode.
    var_to_one_hot: optional existing value -> index vocabulary. It is
      extended in place with values not seen before, so a vocabulary built
      on one dataset can be reused for another.
    df: DataFrame to read the column from. Defaults to the notebook-level
      `dataset3`, which is what this definition used before the parameter
      existed.

  Returns:
    A tuple `(indices, var_to_one_hot)`: the per-row category indices in row
    order, and the (possibly extended) vocabulary.
  """
  if var_to_one_hot is None:
    var_to_one_hot = {}
  if df is None:
    df = dataset3
  indices = []
  # Iterating the column directly avoids building a Series per row, which is
  # what made the iterrows() version slow on large frames.
  for var in df[col_name]:
    if var not in var_to_one_hot:
      # New values get the next free index.
      var_to_one_hot[var] = len(var_to_one_hot)
    indices.append(var_to_one_hot[var])
  return indices, var_to_one_hot

In [None]:
# Replace values in columns to align better with other datasets.
# Each column is listed once ('salvage' used to be given twice; a repeated
# dict key is silently collapsed).
dataset3 = dataset3.replace({'body_type': body_type_conversion,
                  'fuel_type': fuel_conversion,
                  'salvage': salvage_conversion,
                  'transmission': transmission_conversion,
                  'wheel_system': wheel_conversion})
# Make and model names are lower-cased to match the training sets.
dataset3.make_name = dataset3.make_name.str.lower()
dataset3.model_name = dataset3.model_name.str.lower()

def convert_cylinders(s):
  """Turn an engine code such as 'V6' into the form '6 cylinders'.

  Only the text before the first space is used, and its first character is
  dropped; whatever remains is followed by ' cylinders'.
  """
  engine_code = s.partition(' ')[0]
  return f"{engine_code[1:]} cylinders"

def convert_colors(c):
  """Map a free-text colour description onto the shared colour vocabulary.

  Returns the first entry of the global `colors` that occurs as a substring
  of the lower-cased description, or None when nothing matches.
  NOTE(review): the result depends on the order of `colors` when several
  entries match (e.g. 'black' and 'phantom black').
  """
  return next((known for known in colors if known in c.lower()), None)

# Rewrite the raw engine and colour labels of dataset3 with the helpers above
dataset3.engine_cylinders = dataset3.engine_cylinders.apply(convert_cylinders)
dataset3.exterior_color = dataset3.exterior_color.apply(convert_colors)

In [None]:
dataset3.head()

Unnamed: 0,body_type,city,engine_cylinders,exterior_color,fuel_type,make_name,mileage,model_name,price,salvage,transmission,wheel_system,year
0,SUV,Woodbury,6 cylinders,white,Gasoline,bmw,87399.0,x6,16895.0,clean,automatic,4wd,2013
1,sedan,Bohemia,8 cylinders,gray,Gasoline,mercedes-benz,55466.0,e-class,46529.0,clean,automatic,4wd,2015
2,sedan,East Hartford,6 cylinders,red,Gasoline,dodge,32812.0,charger,26993.0,clean,automatic,4wd,2017
5,sedan,Great Neck,6 cylinders,black,Gasoline,mercedes-benz,42213.0,e-class,43495.0,clean,automatic,4wd,2017
6,sedan,Great Neck,4 cylinders,black,Gasoline,mercedes-benz,30850.0,e-class,29495.0,clean,automatic,4wd,2017


In [None]:
# Step 1: map every categorical column of dataset3 to integer codes.
# Descritize body
body_desc, body_to_desc = descritize('body_type')
# Descritize cylinders
cylinders_desc, cylinders_to_desc = descritize('engine_cylinders')
# Descritize color
color_desc, color_to_desc = descritize('exterior_color')
# Descritize fuel_type
fuel_desc, fuel_to_desc = descritize('fuel_type')
# Descritize make
make_desc, make_to_desc = descritize('make_name')
# Descritize model
model_desc, model_to_desc = descritize('model_name')
# Descritize salvage
salvage_desc, salvage_to_desc = descritize('salvage')
# Descritize transmission
transmission_desc, transmission_to_desc = descritize('transmission')
# Descritize wheel_system
wheel_system_desc, wheel_system_to_desc = descritize('wheel_system')
# Descritize year
year_desc, year_to_desc = descritize('year')

# Step 2: expand the integer codes with one_hot (defined elsewhere in this notebook).
# One hot encode body
body_one_hot = one_hot(body_desc, body_to_desc)
# One hot encode cylinders
cylinders_one_hot = one_hot(cylinders_desc, cylinders_to_desc)
# One hot encode color
color_one_hot = one_hot(color_desc, color_to_desc)
# One hot encode fuel
fuel_one_hot = one_hot(fuel_desc, fuel_to_desc)
# One hot encode make
make_one_hot = one_hot(make_desc, make_to_desc)
# One hot encode model
model_one_hot = one_hot(model_desc, model_to_desc)
# One hot encode salvage
salvage_one_hot = one_hot(salvage_desc, salvage_to_desc)
# One hot encode transmission
transmission_one_hot = one_hot(transmission_desc, transmission_to_desc)
# One hot encode wheel system
wheel_one_hot = one_hot(wheel_system_desc, wheel_system_to_desc)
# One hot encode year
year_one_hot = one_hot(year_desc, year_to_desc)

In [None]:
# Collect the encodings and hand them to add_encodings (defined elsewhere in
# this notebook) together with the original categorical column names.
# NOTE(review): `one_hots` places year fifth while the name list places 'year'
# last — confirm add_encodings does not pair the two lists positionally.
one_hots = [body_one_hot, cylinders_one_hot, color_one_hot,
               fuel_one_hot, year_one_hot, make_one_hot,
               model_one_hot, salvage_one_hot, transmission_one_hot,
               wheel_one_hot]

df3 = add_encodings(one_hots, ['body_type', 'engine_cylinders',
                               'exterior_color', 'fuel_type', 'make_name',
                                'model_name', 'salvage', 'transmission',
                                'wheel_system', 'year'], dataset3)

Get states from cities. Code leveraged from Alexander Galea:
https://galeascience.wordpress.com/2016/03/23/us-city-to-state-python-dictionary/


In [None]:
# NOTE(review): `input_file` is not used in the visible part of the notebook and
# its file handle is never closed; kept only in case a later cell relies on it.
input_file = csv.DictReader(open(folder_path + 'city_to_state.csv'))

# Build the city -> state lookup. Each CSV row must hold exactly two fields.
# The context manager closes the file; newline='' is what the csv module expects.
city_to_state = {}
with open(folder_path + 'city_to_state.csv', newline='') as city_file:
  for row in csv.reader(city_file):
    k, v = row
    city_to_state[k] = v

In [None]:
states = []
unknowns = 0
for city in df3.city:
  # Explicit lookup instead of a bare `except:` so unrelated errors are not
  # silently turned into 'unknown state'.
  state = city_to_state.get(city)
  if state is None:
    # 19% of cities are of unknown states
    states.append('unknown state')
    unknowns += 1
  else:
    states.append(state.lower())
# NOTE(review): cities are read from df3 but the column is written to dataset3;
# this relies on both frames having the same rows in the same order.
dataset3['state'] = states

# Descritize states
state_desc, state_to_desc = descritize('state')
# One hot encode states
states_one_hot = one_hot(state_desc, state_to_desc)
# Descritize city
city_desc, city_to_desc = descritize('city')
# One hot encode city
city_one_hot = one_hot(city_desc, city_to_desc)

df3.city = df3.city.str.lower()
df3 = add_encodings([states_one_hot, city_one_hot], ['city'], df3)

In [None]:
df3.head()

Unnamed: 0,mileage,price,SUV,sedan,pickup,van,coupe,hatchback,mini-van,Wagon,convertible,6 cylinders,8 cylinders,4 cylinders,10 cylinders,12 cylinders,5 cylinders,3 cylinders,2 cylinders,white,gray,red,black,phantom black,blue,NaN,silver,gold,magnetic metallic,super black,green,grey,brown,charcoal,ingot silver metallic,black clearcoat,tan,yellow,ruby red metallic tinted clearcoat,burgundy,...,Hollister,Downey,La Crescenta,Seaside,Salinas,Turlock,Watsonville,Santa Barbara,Canyon Country,Atascadero,Santa Cruz,Modesto,Stockton,Tracy,Fremont,Pleasanton,Campbell,Livermore,Santa Clara,Rio Vista,Hayward,San Mateo,Walnut Creek,San Carlos,Pittsburg,Palo Alto,Antioch,Belmont,San Bruno,Colma,Daly City,Berkeley,San Rafael,Petaluma,Vacaville,Napa,Santa Rosa,Lakeport,Ukiah,Vallejo
0,87399.0,16895.0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,55466.0,46529.0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,32812.0,26993.0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,42213.0,43495.0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,30850.0,29495.0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### 2.4.2 Process dataset to propagate through both networks

In [None]:
# Column orders of the three encoded frames; columns are matched by name below.
order1 = list(df1.columns)
order2 = list(df2.columns)
# Rename position 1 of order2 so it matches the 'mileage' column of the other
# frames (presumably df2 calls it 'odometer' — confirm).
order2[1] = 'mileage'
order3 = list(df3.columns)

# In direction o1 -> o2
def make_order_conversion(o1, o2):
  """Map positions in `o1` to the positions of equal items in `o2`.

  Args:
    o1: source sequence of (hashable) column labels.
    o2: target sequence of (hashable) column labels.

  Returns:
    dict {index in o1: index in o2} for every item of `o1` that also occurs
    in `o2`. If an item occurs several times in `o2`, the last position wins.
  """
  # One pass over o2 replaces the nested scan (O(len(o1) + len(o2)) overall);
  # later duplicates overwrite earlier ones, as in the nested-loop version.
  last_position = {}
  for index2, item2 in enumerate(o2):
    last_position[item2] = index2
  return_dict = {}
  for index1, item1 in enumerate(o1):
    index2 = last_position.get(item1)
    # The explicit == check keeps plain equality semantics (a NaN label never
    # matches, even though a dict can find it by identity).
    if index2 is not None and o2[index2] == item1:
      return_dict[index1] = index2
  return return_dict

# Column-index mappings from the third (test) frame to each training frame
three_to_two = make_order_conversion(order3, order2)
three_to_one = make_order_conversion(order3, order1)

In [None]:
def convert_input(order, frame1, frame2):
  """Re-arrange the columns of `frame2` into the column layout of `frame1`.

  Args:
    order: dict {source column index in frame2: target column index}.
    frame1: object whose `.shape[1]` gives the number of output columns.
    frame2: 2-D array of rows to convert.

  Returns:
    float ndarray of shape (frame2.shape[0], frame1.shape[1]); target columns
    that no source column maps to stay 0.
  """
  converted_dataset = np.zeros((frame2.shape[0], frame1.shape[1]))
  if frame2.shape[0] == 0:
    # Nothing to copy (also covers a 1-D empty array built from an empty list).
    return converted_dataset
  # Copy whole columns at once instead of one cell at a time in Python.
  for source_col, target_col in order.items():
    converted_dataset[:, target_col] = frame2[:, source_col]
  return converted_dataset

# Test-set rows laid out in the column order of each training frame
model1_input = convert_input(three_to_one, df1, df3.values)
model2_input = convert_input(three_to_two, df2, df3.values)

In [None]:
# Work on copies so model1_input / model2_input stay untouched
test_data1_ = torch.from_numpy(deepcopy(model1_input))
test_data2_ = torch.from_numpy(deepcopy(model2_input))
# Scale column 1 by each training set's maximum mileage / odometer value
test_data1_[:,1] = test_data1_[:,1]/(dataset1.mileage.max())
test_data2_[:,1] = test_data2_[:,1]/(dataset2.odometer.max())

# Column 0 is the label, the remaining columns are the features
test_X1 = test_data1_[:,1:]
test_y1 = test_data1_[:,0]

test_X2 = test_data2_[:,1:]
test_y2 = test_data2_[:,0]

# Descritize labels: < or >= $20,000
# (the label tensors are overwritten in place, element by element)
for Y in [test_y1, test_y2]:
  for i, y in enumerate(Y):
    if y >= 20000:
      Y[i] = 1
    else:
      Y[i] = 0

batchsize = 32
gamma = 0.7
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Split into rows//batchsize chunks.
# NOTE(review): with fewer than `batchsize` rows the section count is 0.
testloader1 = list(zip(np.array_split(test_X1, test_X1.shape[0]//batchsize),
                        np.array_split(test_y1, test_y1.shape[0]//batchsize)))
testloader2 = list(zip(np.array_split(test_X2, test_X2.shape[0]//batchsize),
                        np.array_split(test_y2, test_y2.shape[0]//batchsize)))

In [None]:
# Persist the converted test inputs (np.save appends the '.npy' extension)
np.save(folder_path + 'model1_input', model1_input)
np.save(folder_path + 'model2_input', model2_input)

### 2.4.3 Test models

In [None]:
def test(model, loader):
  incorrect = 0
  total = 0
  for i, data in enumerate(loader, 0):
      # get the inputs; data is a list of [inputs, labels]
      inputs, labels = data
      labels = labels.reshape(-1, 1)
      outputs = torch.round(model(inputs))
      incorrect += torch.sum(torch.abs(labels - outputs))
      total += outputs.size()[0]
  return (total-incorrect)/(total)

In [None]:
print(test(model1, testloader1))
print(test(model2, testloader2))

tensor(0.6098, dtype=torch.float64, grad_fn=<DivBackward0>)
tensor(0.8604, dtype=torch.float64, grad_fn=<DivBackward0>)


## 2.5 Save models

In [None]:
# File names for the saved weights (torch state dicts, despite the '.h5' suffix)
model1_name = 'model1.h5'
model2_name = 'model2.h5'

torch.save(model1.state_dict(), folder_path + model1_name)
torch.save(model2.state_dict(), folder_path + model2_name)

# 3. Generate counterfactuals

## 3.1 Find data to make counterfactuals from

### 3.1.1 Configure data conversion so that models can accept the other's data

In [None]:
# Column-index mappings between the two training frames, in both directions
two_to_one = make_order_conversion(order2, order1)
one_to_two = make_order_conversion(order1, order2)

# df1 rows in df2's layout, and df2 rows in df1's layout
two_converted = convert_input(one_to_two, df2, df1.values)
one_converted = convert_input(two_to_one, df1, df2.values)

### 3.1.2 Find data using converted sets

In [None]:
data1_ = torch.from_numpy(deepcopy(data1))
data2_ = torch.from_numpy(deepcopy(data2))

# Stack both datasets in each layout. Dataset-1 rows come first in both
# results, so row i of one refers to the same source row as row i of the other.
concatenated_data1 = np.concatenate((data1_, one_converted), axis=0)
concatenated_data2 = np.concatenate((two_converted, data2_), axis=0)

In [None]:
data1_ = torch.from_numpy(deepcopy(concatenated_data1))
data2_ = torch.from_numpy(deepcopy(concatenated_data2))
# Column 0 is the label, the remaining columns are the features
X1 = data1_[:,1:]
y1 = data1_[:,0]

X2 = data2_[:,1:]
y2 = data2_[:,0]

# Descritize labels: < or >= $20,000
# (the label tensors are overwritten in place, element by element)
for Y in [y1, y2]:
  for i, y in enumerate(Y):
    if y >= 20000:
      Y[i] = 1
    else:
      Y[i] = 0

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Both loaders are split into the same number of chunks so that find_M can
# pair batch i of one with batch i of the other.
mloader1 = list(zip(np.array_split(X1, X1.shape[0]//batchsize),
                        np.array_split(y1, X1.shape[0]//batchsize)))
mloader2 = list(zip(np.array_split(X2, X2.shape[0]//batchsize),
                        np.array_split(y2, y2.shape[0]//batchsize)))

In [None]:
def find_M(model1, model2, loader1, loader2):
  """Collect the rows on which exactly one of the two models is wrong.

  Batch i of `loader1` is paired with batch i of `loader2`; the labels of
  `loader1` are used to score both models.

  Args:
    model1, model2: callables returning one output per input row.
    loader1, loader2: indexable sequences of (inputs, labels) batches.

  Returns:
    M = [rows_for_model2, rows_for_model1]:
      M[0] holds `loader2` inputs where model1's error exceeds model2's,
      M[1] holds `loader1` inputs where model2's error exceeds model1's.
    Rows are returned as numpy arrays.
  """
  M = [[],[]]
  # Pure inference: no autograd graph is needed.
  with torch.no_grad():
    for i, (inputs1, labels1) in enumerate(loader1):
      inputs2, _ = loader2[i]
      labels1 = labels1.reshape(-1, 1)
      outputs1 = torch.round(model1(inputs1))
      outputs2 = torch.round(model2(inputs2))
      incorrect1 = torch.abs(labels1 - outputs1)
      incorrect2 = torch.abs(labels1 - outputs2)
      incorrect_diff = (incorrect1 - incorrect2).cpu().detach().numpy()
      for j, el in enumerate(incorrect_diff):
        if el > 0:
          M[0].append(inputs2[j].cpu().detach().numpy())
        if el < 0:
          M[1].append(inputs1[j].cpu().detach().numpy())
  return M

M = find_M(model1, model2, mloader1, mloader2)

### 3.1.3 Save Data

In [None]:
# Store both lists of M in one .npz archive (saved as arr_0 and arr_1)
np.savez(folder_path+'M', M[0], M[1])

## 3.2 Generate counterfactuals

In [None]:
def generate_counterfactual(model, x, y_prime, epsilon=0.001, lambda_=0.00001,
                            max_iters = 500):
  """Gradient search for an input near `x` whose model output is `y_prime`.

  Minimises MSE(model(x'), y_prime) + lambda_ * ||x_original - x'||_1 with
  Adam, stopping early once |model(x') - y_prime| <= epsilon.

  Args:
    model: differentiable callable returning a single-element tensor.
    x: starting instance (array-like or tensor); it is copied, not modified.
    y_prime: target output (array-like or tensor); copied and detached.
    epsilon: tolerance on the output for early stopping.
    lambda_: weight of the L1 distance penalty.
    max_iters: maximum number of optimisation steps.

  Returns:
    The optimised tensor (requires_grad=True), or None when the instance was
    not changed at all.
  """
  # clone().detach() avoids the copy-construct warning that torch.tensor()
  # raises when it is handed an existing tensor.
  x = torch.as_tensor(x).clone().detach().requires_grad_(True)
  y_prime = torch.as_tensor(y_prime).clone().detach()
  x_original = x.clone().detach()
  optimizer = optim.Adam([x], lr=0.00001)
  criterion = nn.MSELoss()
  for _ in range(max_iters):
    optimizer.zero_grad()
    out = model(x)
    if torch.abs(out - y_prime) <= epsilon:
      break
    # L1 distance: sum of absolute differences. The previous abs(sum(...))
    # let positive and negative changes cancel each other out.
    loss = criterion(out, y_prime) + lambda_ * torch.sum(torch.abs(x_original - x))
    loss.backward()
    optimizer.step()
  if torch.all(x.eq(x_original)):
    return None
  return x

# Invert the test->train mappings to get train->test column indices
one_to_three = {v: k for k, v in three_to_one.items()}
two_to_three = {v: k for k, v in three_to_two.items()}

# Drop the entry with the largest source index from each mapping.
# NOTE(review): the mappings index the full column order (label in column 0),
# while the instances converted below are feature rows without column 0 — this
# looks like a workaround for an out-of-range index and suggests the indices
# are shifted by one; confirm.
one_to_three.pop(max(list(one_to_three.keys())))
two_to_three.pop(max(list(two_to_three.keys())))

# augmented_instances[i]: counterfactuals produced with model i of the loop below
# diffs[i][c]: converted (counterfactual - instance) differences, grouped by the
#              rounded model output c of the original instance
augmented_instances = [[],[]]
diffs = [[[],[]],[[],[]]]
conversion_dicts = [two_to_three, one_to_three]
successful_cfs = 0
unsuccessful_cfs = 0

for i, model in enumerate([model2, model1]):
  for j, instance in enumerate(M[i]):
    output = model(torch.tensor(instance))
    class_ = torch.round(output)
    # Target output: halfway between the current output and its rounded value
    y_prime = output + (class_ - output)/2
    cf = generate_counterfactual(model, instance, y_prime)
    if cf is None:
      unsuccessful_cfs += 1
    else:
      augmented_instances[i].append(cf.detach().numpy())
      diff = convert_input(conversion_dicts[i], df3, np.asarray([cf.detach().numpy() - instance]))
      diffs[i][int(class_)].append(diff)
      successful_cfs += 1
  # The counters are not reset between models, so the second printed ratio is
  # cumulative over both models.
  print('Counterfactual success ratio: ', successful_cfs/(successful_cfs+unsuccessful_cfs))


To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).



Counterfactual success ratio:  0.0961518884251247
Counterfactual success ratio:  0.16376634371178628


In [None]:
# Mean counterfactual shift, indexed as diff_averages[model][class]
diff_averages = [[np.sum(diffs[m][k], axis=0)/len(diffs[m][k]) for k in (0, 1)]
                 for m in (0, 1)]

In [None]:
import plotly.express as px

In [None]:
from sklearn.manifold import TSNE

# Bring everything into dataset 1's feature layout (label column dropped)
#trans1 = convert_input(one_to_three, df3, np.asarray(df1.values))
trans1 = df1.values[:,1:]
trans2 = convert_input(two_to_one, df1, np.asarray(df2.values))[:,1:]
cf1s = convert_input(two_to_one, df1, np.asarray(augmented_instances[0]))[:,1:]
#cf2s = convert_input(one_to_three, df3, np.asarray(augmented_instances[1]))
cf2s = np.asarray(augmented_instances[1])
TSNE_input = np.concatenate((trans1, trans2, cf1s, cf2s))
# One group label per embedded row. Kept under its own name: rebinding `colors`
# here would clobber the colour vocabulary that convert_colors reads, breaking
# any re-run of the preprocessing cells.
point_groups = (['Original 1'] * trans1.shape[0] + ['Original 2'] * trans2.shape[0]
                + ['Counterfactuals 1'] * cf1s.shape[0] + ['Counterfactuals 2'] * cf2s.shape[0])
TSNE_output = TSNE(n_components=2).fit_transform(TSNE_input)
# Keep the coordinates numeric: concatenating them with the string labels in
# one array would coerce x1/x2 to strings and make the axes categorical.
df_plot = pd.DataFrame(TSNE_output, columns = ['x1','x2'])
df_plot['color'] = point_groups
fig = px.scatter(df_plot, x='x1', y='x2', color='color')
fig.show()

In [None]:
augmented_instances_ = deepcopy(augmented_instances)

In [None]:
# Store both counterfactual lists in one .npz archive (arr_0 and arr_1)
np.savez(folder_path+'Augmented_instances', augmented_instances_[0],
         augmented_instances_[1])

## 3.3 Augment Counterfactuals For Other Model

In [None]:
# Convert each model's counterfactuals into the other model's column layout.
# NOTE(review): the counterfactuals are feature rows (no label column), while
# one_to_two / two_to_one index the full column order including the label at
# position 0 — confirm the columns are not shifted by one here.
two_converted_cfs = convert_input(one_to_two, df2, np.asarray(augmented_instances[1]))
one_converted_cfs = convert_input(two_to_one, df1, np.asarray(augmented_instances[0]))

# 4. Train with Generated Instances

## 4.1 Data preparation & Training

In [None]:
one_converted_cfs_ = deepcopy(one_converted_cfs)
two_converted_cfs_ = deepcopy(two_converted_cfs)

#one_converted_cfs_[:,1] = one_converted_cfs_[:,1] * dataset2.odometer.max() / dataset1.mileage.max()
two_converted_cfs_[:,1] = two_converted_cfs_[:,1] * dataset1.mileage.max() / dataset2.odometer.max()

# Append the converted counterfactuals to each original training set
data1_ = torch.from_numpy(np.concatenate((deepcopy(data1), one_converted_cfs_)))
data2_ = torch.from_numpy(np.concatenate((deepcopy(data2), two_converted_cfs_)))

# Shuffle the rows with a permutation index. random.shuffle() must not be used
# on a 2-D tensor: it swaps rows through views (x[i], x[j] = x[j], x[i]), which
# overwrites one row with the other and silently duplicates / loses samples.
data1_ = data1_[torch.randperm(data1_.shape[0])]
data2_ = data2_[torch.randperm(data2_.shape[0])]

X1 = data1_[:,1:]
y1 = data1_[:,0]

X2 = data2_[:,1:]
y2 = data2_[:,0]

# Descritize labels: < or >= $20,000
# NOTE(review): values that are exactly 1 are kept as label 1; for the appended
# counterfactual rows, column 0 holds whatever convert_input placed there —
# confirm it really is the intended class label.
for Y in [y1, y2]:
  for i, y in enumerate(Y):
    if y == 1:
      continue
    if y >= 20000:
      Y[i] = 1
    else:
      Y[i] = 0

X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.33,
                                                        random_state=0)
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.33,
                                                        random_state=0)

batchsize = 32
lr = 1.0
gamma = 0.7
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

trainloader1 = list(zip(np.array_split(X1_train, X1_train.shape[0]//batchsize),
                        np.array_split(y1_train, X1_train.shape[0]//batchsize)))
trainloader2 = list(zip(np.array_split(X2_train, X2_train.shape[0]//batchsize),
                        np.array_split(y2_train, X2_train.shape[0]//batchsize)))
testloader1 = list(zip(np.array_split(X1_test, X1_test.shape[0]//batchsize),
                       np.array_split(y1_test, X1_test.shape[0]//batchsize)))
testloader2 = list(zip(np.array_split(X2_test, X2_test.shape[0]//batchsize),
                       np.array_split(y2_test, X2_test.shape[0]//batchsize)))

In [None]:
# Fresh networks (Net1 / Net2 are defined elsewhere in this notebook) to be
# trained on the augmented data
model1_augmented_ = Net1()
model2_augmented_ = Net2()

# Note the different learning rates for the two models
optimizer1 = optim.Adam(model1_augmented_.parameters(), lr=0.00001)
optimizer2 = optim.Adam(model2_augmented_.parameters(), lr=0.001)

# Binary cross-entropy on the 0/1 price labels
criterion1 = nn.BCELoss()
criterion2 = nn.BCELoss()

In [None]:
train(model1_augmented_, device, trainloader1, testloader1, optimizer1, 1000, criterion1,
      underperformance_threshold=100)

Validation set accuracy on epoch  0 :  tensor(0.5514, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  1 :  tensor(0.6440, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  2 :  tensor(0.6596, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  3 :  tensor(0.6450, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  4 :  tensor(0.6413, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  5 :  tensor(0.6697, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  6 :  tensor(0.6569, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  7 :  tensor(0.6661, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  8 :  tensor(0.6670, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  9 :  tensor(0.6633, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation

In [None]:
train(model2_augmented_, device, trainloader2, testloader2, optimizer2, 1000, criterion2)

Validation set accuracy on epoch  0 :  tensor(0.9350, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  1 :  tensor(0.9526, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  2 :  tensor(0.9613, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  3 :  tensor(0.9667, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  4 :  tensor(0.9699, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  5 :  tensor(0.9708, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  6 :  tensor(0.9741, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  7 :  tensor(0.9749, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  8 :  tensor(0.9749, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  9 :  tensor(0.9727, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation

## 4.2 Test Augmented Models

In [None]:
# Rebuild the held-out test loaders (same steps as in section 2.4.2); this
# replaces the validation testloader1 / testloader2 created in section 4.1.
test_data1_ = torch.from_numpy(deepcopy(model1_input))
test_data2_ = torch.from_numpy(deepcopy(model2_input))
# Scale column 1 by each training set's maximum mileage / odometer value
test_data1_[:,1] = test_data1_[:,1]/(dataset1.mileage.max())
test_data2_[:,1] = test_data2_[:,1]/(dataset2.odometer.max())

# Column 0 is the label, the remaining columns are the features
test_X1 = test_data1_[:,1:]
test_y1 = test_data1_[:,0]

test_X2 = test_data2_[:,1:]
test_y2 = test_data2_[:,0]

# Descritize labels: < or >= $20,000
for Y in [test_y1, test_y2]:
  for i, y in enumerate(Y):
    if y >= 20000:
      Y[i] = 1
    else:
      Y[i] = 0

batchsize = 32
gamma = 0.7
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

testloader1 = list(zip(np.array_split(test_X1, test_X1.shape[0]//batchsize),
                        np.array_split(test_y1, test_y1.shape[0]//batchsize)))
testloader2 = list(zip(np.array_split(test_X2, test_X2.shape[0]//batchsize),
                        np.array_split(test_y2, test_y2.shape[0]//batchsize)))

In [None]:
print(test(model1_augmented_, testloader1))
print(test(model2_augmented_, testloader2))

tensor(0.6868, dtype=torch.float64, grad_fn=<DivBackward0>)
tensor(0.8764, dtype=torch.float64, grad_fn=<DivBackward0>)


## 4.3 Save Models

In [None]:
# File names for the augmented models' weights (torch state dicts, despite the
# '.h5' suffix); this rebinds model1_name / model2_name from section 2.5.
model1_name = 'model1_augmented.h5'
model2_name = 'model2_augmented.h5'

torch.save(model1_augmented_.state_dict(), folder_path + model1_name)
torch.save(model2_augmented_.state_dict(), folder_path + model2_name)

# 5. Conclusion

The experiment demonstrates that augmenting the training sets via counterfactuals is able to improve the performance of both networks. Not only is the higher-performance model able to transfer its knowledge to improve the performance of the lower-performance model, but the lower-performance model is also able to improve the higher-performance model.

# 6. Experiment Two: Transferring Knowledge Between Different Algorithms

## 6.1 Prepare Data

In [None]:
def test_DTree(model, data, labels):
  predictions = model.predict(data)
  incorrect = np.sum(np.absolute(predictions-np.asarray(labels)))
  total = len(data)
  return (total-incorrect)/(total)

In [None]:
# Start again from the original (non-augmented) datasets
data1_ = torch.from_numpy(deepcopy(data1))
data2_ = torch.from_numpy(deepcopy(data2))
# Column 0 is the label, the remaining columns are the features
X1 = data1_[:,1:]
y1 = data1_[:,0]

X2 = data2_[:,1:]
y2 = data2_[:,0]

# Descritize labels: < or >= $20,000
for Y in [y1, y2]:
  for i, y in enumerate(Y):
    if y >= 20000:
      Y[i] = 1
    else:
      Y[i] = 0

X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.33,
                                                        random_state=0)
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.33,
                                                        random_state=0)

batchsize = 32
lr = 1.0
gamma = 0.7
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Only dataset 2 gets batch loaders here; dataset 1 is passed to the decision
# tree as whole arrays in the next section.
trainloader2 = list(zip(np.array_split(X2_train, X2_train.shape[0]//batchsize),
                        np.array_split(y2_train, X2_train.shape[0]//batchsize)))
testloader2 = list(zip(np.array_split(X2_test, X2_test.shape[0]//batchsize),
                       np.array_split(y2_test, X2_test.shape[0]//batchsize)))

## 6.2 Train models

### 6.2.1 Train Decision Tree

In [None]:
# Rebinds `model1` (used for a network earlier in this notebook) to a regression
# tree fitted on the 0/1 labels of dataset 1.
model1 = DecisionTreeRegressor(min_samples_leaf=20).fit(X1_train, y1_train)
test_DTree(model1, X1_test, y1_test)

0.7188279000897236

### 6.2.2 Train Network

In [None]:
# The network is not retrained here: its weights are loaded from `model2_fp`
# (a path defined elsewhere in this notebook).
model2 = Net2()
model2.load_state_dict(torch.load(model2_fp))

<All keys matched successfully>

## 6.3 Test Models

In [None]:
# Rebuild the held-out test data (same steps as in section 2.4.2)
test_data1_ = torch.from_numpy(deepcopy(model1_input))
test_data2_ = torch.from_numpy(deepcopy(model2_input))
# Scale column 1 by each training set's maximum mileage / odometer value
test_data1_[:,1] = test_data1_[:,1]/(dataset1.mileage.max())
test_data2_[:,1] = test_data2_[:,1]/(dataset2.odometer.max())

# Column 0 is the label, the remaining columns are the features
test_X1 = test_data1_[:,1:]
test_y1 = test_data1_[:,0]

test_X2 = test_data2_[:,1:]
test_y2 = test_data2_[:,0]

# Descritize labels: < or >= $20,000
for Y in [test_y1, test_y2]:
  for i, y in enumerate(Y):
    if y >= 20000:
      Y[i] = 1
    else:
      Y[i] = 0

batchsize = 32
gamma = 0.7
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Only the network needs a batch loader; the tree is tested on test_X1 / test_y1 directly
testloader2 = list(zip(np.array_split(test_X2, test_X2.shape[0]//batchsize),
                        np.array_split(test_y2, test_y2.shape[0]//batchsize)))

In [None]:
print(test_DTree(model1, test_X1, test_y1))

0.6340663344686485


## 6.4 Generate Counterfactuals

### 6.4.1 Counterfactual code

In [None]:
def generate_counterfactual(model, x, y_prime, epsilon=0.01, lambda_=0.0001,
                            max_iters = 500):
  """Gradient search for an input near `x` whose model output is `y_prime`.

  Minimises MSE(model(x'), y_prime) + lambda_ * ||x_original - x'||_1 with
  Adam, stopping early once |model(x') - y_prime| <= epsilon.

  Args:
    model: differentiable callable returning a single-element tensor.
    x: starting instance (array-like or tensor); it is copied, not modified.
    y_prime: target output (array-like or tensor); copied and detached.
    epsilon: tolerance on the output for early stopping.
    lambda_: weight of the L1 distance penalty.
    max_iters: maximum number of optimisation steps.

  Returns:
    The optimised tensor (requires_grad=True), or None when the instance was
    not changed at all.
  """
  # clone().detach() avoids the copy-construct warning that torch.tensor()
  # raises when it is handed an existing tensor.
  x = torch.as_tensor(x).clone().detach().requires_grad_(True)
  y_prime = torch.as_tensor(y_prime).clone().detach()
  x_original = x.clone().detach()
  optimizer = optim.Adam([x], lr=0.00001)
  criterion = nn.MSELoss()
  for _ in range(max_iters):
    optimizer.zero_grad()
    out = model(x)
    if torch.abs(out - y_prime) <= epsilon:
      break
    # Prediction loss plus the L1 proximity penalty. A second assignment used
    # to overwrite this with the bare prediction loss, which made `lambda_`
    # a dead parameter.
    loss = criterion(out, y_prime) + lambda_ * torch.sum(torch.abs((x_original - x)))
    loss.backward()
    optimizer.step()
  if torch.all(x.eq(x_original)):
    return None
  return x

def generate_DTree_counterfactual(model, x, y_prime, epsilon=0.01, lambda_=0.0001,
                                  max_iters = 500,
                                  options = {'c1': 0.5, 'c2': 0.3, 'w':0.9},
                                  n_particles=100):
  """Search for a counterfactual of `x` with particle swarm optimisation.

  Runs pyswarms' GlobalBestPSO over the box [0, 1]^d (d = x.shape[0]) with
  `counterfactual_cost` as objective and returns the best position found.

  Args:
    model: estimator forwarded to `counterfactual_cost`.
    x: 1-D instance; a deep copy is forwarded as `original_x`.
    y_prime: target prediction forwarded to the cost function.
    epsilon: not used by this function.
    lambda_: penalty weight forwarded to the cost function.
    max_iters: not used by this function — see the note below.
    options: PSO hyper-parameters passed to GlobalBestPSO.
    n_particles: swarm size.

  Returns:
    The best position returned by the optimiser. Unlike
    `generate_counterfactual`, this function never returns None.
  """
  optimizer = ps.single.GlobalBestPSO(n_particles=n_particles,
                                      dimensions=x.shape[0], options=options,
                                      bounds = [np.zeros(x.shape[0]),
                                                np.ones(x.shape[0])])
  kwargs = {'original_x':deepcopy(x), 'model':model, 'y_prime':y_prime,
            'lambda_':lambda_}
  # NOTE(review): the iteration count is hard-coded to 1000; `max_iters` is ignored.
  cost, pos = optimizer.optimize(counterfactual_cost, iters=1000, verbose=False,**kwargs)
  return pos

def counterfactual_cost(x, original_x, model, y_prime, lambda_):
  """PSO objective: prediction error plus an L1 proximity penalty, per particle.

  Args:
    x: candidate positions, shape (n_particles, n_features).
    original_x: the instance being explained, shape (n_features,).
    model: estimator with a `predict` method returning one value per row.
    y_prime: target prediction.
    lambda_: weight of the proximity penalty.

  Returns:
    One cost per particle.
  """
  y = model.predict(x)
  # Sum over the feature axis only. Summing over the whole swarm added the same
  # constant to every particle, so the penalty could not influence the search.
  distance = np.sum(np.absolute(x - original_x), axis=-1)
  loss = abs(y - y_prime) + lambda_ * distance
  return loss

### 6.4.2 Find M Matrix

In [None]:
def convert_input(order, frame1, frame2):
  """Re-arrange the columns of `frame2` into the column layout of `frame1`.

  NOTE(review): a function with this name is also defined earlier in the
  notebook; consider keeping a single definition.

  Args:
    order: dict {source column index in frame2: target column index}.
    frame1: object whose `.shape[1]` gives the number of output columns.
    frame2: 2-D array of rows to convert.

  Returns:
    float ndarray of shape (frame2.shape[0], frame1.shape[1]); target columns
    that no source column maps to stay 0.
  """
  converted_dataset = np.zeros((frame2.shape[0], frame1.shape[1]))
  if frame2.shape[0] == 0:
    # Nothing to copy (also covers a 1-D empty array built from an empty list).
    return converted_dataset
  # Copy whole columns at once instead of one cell at a time in Python.
  for source_col, target_col in order.items():
    converted_dataset[:, target_col] = frame2[:, source_col]
  return converted_dataset

# In direction o1 -> o2
def make_order_conversion(o1, o2):
  """Map positions in `o1` to the positions of equal items in `o2`.

  NOTE(review): a function with this name is also defined earlier in the
  notebook; consider keeping a single definition.

  Args:
    o1: source sequence of (hashable) column labels.
    o2: target sequence of (hashable) column labels.

  Returns:
    dict {index in o1: index in o2} for every item of `o1` that also occurs
    in `o2`. If an item occurs several times in `o2`, the last position wins.
  """
  # One pass over o2 replaces the nested scan (O(len(o1) + len(o2)) overall);
  # later duplicates overwrite earlier ones, as in the nested-loop version.
  last_position = {}
  for index2, item2 in enumerate(o2):
    last_position[item2] = index2
  return_dict = {}
  for index1, item1 in enumerate(o1):
    index2 = last_position.get(item1)
    # The explicit == check keeps plain equality semantics (a NaN label never
    # matches, even though a dict can find it by identity).
    if index2 is not None and o2[index2] == item1:
      return_dict[index1] = index2
  return return_dict

In [None]:
# Inspect the column orders of the two training frames.
# NOTE(review): unlike section 2.4.2, position 1 of order2 is not renamed to
# 'mileage' here — confirm whether that is intended.
order1 = list(df1.columns)
order2 = list(df2.columns)

print(order1)
print(order2)

['price', 'mileage', 'nj', 'tn', 'ga', 'va', 'fl', 'tx', 'ca', 'nc', 'oh', 'ny', 'pa', 'sc', 'mi', 'wa', 'az', 'ky', 'ma', 'ne', 'on', 'mo', 'mn', 'ct', 'ar', 'co', 'il', 'ms', 'md', 'ut', 'wi', 'ok', 'or', 'in', 'wv', 'nv', 'ks', 'ri', 'la', 'al', 'nm', 'id', 'nh', 'mt', 'vt', ' usa', ' canada', 'black', 'silver', 'blue', 'red', 'white', 'gray', 'orange', 'brown', 'no_color', 'gold', 'charcoal', 'turquoise', 'beige', 'green', 'dark blue', 'phantom black', 'yellow', 'color:', 'toreador red', 'bright white clearcoat', 'billet silver metallic clearcoat', 'black clearcoat', 'jazz blue pearlcoat', 'purple', 'ruby red metallic tinted clearcoat', 'triple yellow tri-coat', 'competition orange', 'off-white', 'shadow black', 'magnetic metallic', 'ingot silver metallic', 'ruby red', 'royal crimson metallic tinted clearcoat', 'kona blue metallic', 'oxford white', 'lightning blue', 'ingot silver', 'white platinum tri-coat metallic', 'guard', 'tuxedo black metallic', 'tan', 'burgundy', 'super black

In [None]:
order1 = list(df1.columns)
order2 = list(df2.columns)
# Same rename as in section 2.4.2: without it the mileage column of df1 has no
# name match in df2 and is left at 0 in every cross-converted row.
order2[1] = 'mileage'

two_to_one = make_order_conversion(order2, order1)
one_to_two = make_order_conversion(order1, order2)

# df1 rows in df2's layout, and df2 rows in df1's layout
two_converted = convert_input(one_to_two, df2, df1.values)
one_converted = convert_input(two_to_one, df1, df2.values)

data1_ = torch.from_numpy(deepcopy(data1))
data2_ = torch.from_numpy(deepcopy(data2))

# Dataset-1 rows come first in both stacks, so row i of one corresponds to
# row i of the other.
concatenated_data1 = np.concatenate((data1_, one_converted), axis=0)
concatenated_data2 = np.concatenate((two_converted, data2_), axis=0)

data1_ = torch.from_numpy(deepcopy(concatenated_data1))
data2_ = torch.from_numpy(deepcopy(concatenated_data2))
# Column 0 is the label, the remaining columns are the features
X1 = data1_[:,1:]
y1 = data1_[:,0]

X2 = data2_[:,1:]
y2 = data2_[:,0]

# Descritize labels: < or >= $20,000
for Y in [y1, y2]:
  for i, y in enumerate(Y):
    if y >= 20000:
      Y[i] = 1
    else:
      Y[i] = 0

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The tree consumes the whole arrays at once; the network is fed in batches
mloader1 = [X1, y1]
mloader2 = list(zip(np.array_split(X2, X2.shape[0]//batchsize),
                        np.array_split(y2, y2.shape[0]//batchsize)))

In [None]:
def find_M(model1, model2, loader1, loader2, X2):
  """Collect the instances on which one model beats the other.

  Both models are evaluated row by row and their absolute label errors are
  compared at the same row index.

  Args:
    model1: object exposing ``predict(inputs)``; its result is compared
      against ``loader1[1]`` with numpy arithmetic.
    model2: callable applied to each batch of ``loader2``; its output is
      rounded before being compared with the batch labels.
    loader1: ``[inputs, labels]`` pair evaluated in a single call.
    loader2: sequence of ``(inputs, labels)`` batches.
    X2: indexable collection of model2 inputs; row ``j`` must correspond to
      the ``j``-th row produced by iterating ``loader2``.

  Returns:
    ``[M0, M1]`` where ``M0`` holds ``X2[j]`` (as numpy arrays) for rows where
    model1's error is larger than model2's, and ``M1`` holds ``loader1[0][j]``
    for rows where model2's error is larger. Rows with equal error are
    skipped. Both lists are empty when ``loader2`` has no batches.
  """
  M = [[],[]]
  inputs1 = loader1[0]
  outputs1 = model1.predict(inputs1)
  incorrect1 = np.absolute(outputs1-np.asarray(loader1[1]))

  # Gather per-batch errors and concatenate once at the end instead of
  # re-copying the growing array on every batch.
  batch_errors = []
  with torch.no_grad():  # evaluation only: no autograd graph is needed
    for inputs2, labels2 in loader2:
      labels2 = labels2.reshape(-1,1)
      outputs2 = torch.round(model2(inputs2)).reshape(-1,1)
      batch_errors.append(
          torch.abs(labels2 - outputs2).cpu().detach().numpy().reshape(-1,1))

  if not batch_errors:
    # Nothing to compare against; previously this crashed on list(None).
    return M
  incorrect2 = np.concatenate(batch_errors)

  for j, el_ in enumerate(incorrect2):
    # Positive: model1 was further from the label; negative: model2 was.
    el = incorrect1[j] - el_
    if el > 0:
      M[0].append(X2[j].cpu().detach().numpy())
    if el < 0:
      M[1].append(inputs1[j].cpu().detach().numpy())
  return M

# Build M: the per-model lists of instances on which the other model did better.
M = find_M(
  model1=model1,
  model2=model2,
  loader1=mloader1,
  loader2=mloader2,
  X2=X2,
)

### 6.4.3 Save M Matrix

In [None]:
# Persist both halves of M in one .npz archive; being positional arguments,
# M[0] is stored under key 'arr_0' and M[1] under 'arr_1'.
np.savez(folder_path+'M2', M[0], M[1])

### 6.4.4 Generate Counterfactuals

In [None]:
# Counterfactual generators, index-aligned with the models iterated below:
# entry 0 is paired with model2 (called directly on a tensor), entry 1 with
# model1 (queried through .predict).
cf_algo = [generate_counterfactual, generate_DTree_counterfactual]
augmented_instances = [[],[]]
successful_cfs = 0
unsuccessful_cfs = 0

for i, (model, algo) in enumerate(zip([model2, model1], cf_algo)):
  for instance in M[i]:
    if i == 0:
      output = model(torch.tensor(instance))
      class_ = torch.round(output)
    else:
      output = model.predict(instance.reshape(1, -1))
      class_ = np.round(output)
    # Target lies halfway between the raw model output and its rounded class.
    y_prime = output + (class_ - output)/2
    cf = algo(model, instance, y_prime)
    if cf is None:
      unsuccessful_cfs += 1
    else:
      augmented_instances[i].append(cf)
      successful_cfs += 1
  # The counters are never reset, so the ratio is cumulative over the models
  # processed so far. Guard against the case where nothing was attempted yet
  # (e.g. M[0] is empty), which used to raise ZeroDivisionError.
  attempted_cfs = successful_cfs + unsuccessful_cfs
  success_ratio = successful_cfs/attempted_cfs if attempted_cfs else float('nan')
  print('Counterfactual success ratio: ', success_ratio)

  after removing the cwd from sys.path.


Counterfactual success ratio:  0.02758179157520644
Counterfactual success ratio:  0.05913808578688545


In [None]:
# Persist the generated counterfactuals in one .npz archive; being positional
# arguments, augmented_instances[0] is stored under 'arr_0' and
# augmented_instances[1] under 'arr_1'.
np.savez(folder_path+'Augmented_instances2_', augmented_instances[0],
         augmented_instances[1])

## 6.5 Train With Counterfactuals

In [None]:
# Hard-coded column order for the first dataset; fed to make_order_conversion
# below. NOTE(review): presumably a snapshot of list(df1.columns) — verify it
# still matches df1 before re-running.
order1 = ['price', 'mileage', 'nj', 'tn', 'ga', 'va', 'fl', 'tx', 'ca', 'nc', 'oh', 'ny', 'pa', 'sc', 'mi', 'wa', 'az', 'ky', 'ma', 'ne', 'on', 'mo', 'mn', 'ct', 'ar', 'co', 'il', 'ms', 'md', 'ut', 'wi', 'ok', 'or', 'in', 'wv', 'nv', 'ks', 'ri', 'la', 'al', 'nm', 'id', 'nh', 'mt', 'vt', ' usa', ' canada', 'black', 'silver', 'blue', 'red', 'white', 'gray', 'orange', 'brown', 'no_color', 'gold', 'charcoal', 'turquoise', 'beige', 'green', 'dark blue', 'phantom black', 'yellow', 'color:', 'toreador red', 'bright white clearcoat', 'billet silver metallic clearcoat', 'black clearcoat', 'jazz blue pearlcoat', 'purple', 'ruby red metallic tinted clearcoat', 'triple yellow tri-coat', 'competition orange', 'off-white', 'shadow black', 'magnetic metallic', 'ingot silver metallic', 'ruby red', 'royal crimson metallic tinted clearcoat', 'kona blue metallic', 'oxford white', 'lightning blue', 'ingot silver', 'white platinum tri-coat metallic', 'guard', 'tuxedo black metallic', 'tan', 'burgundy', 'super black', 'cayenne red', 'morningsky blue', 'pearl white', 'glacier white', 'clean vehicle', 'salvage insurance', 2008, 2011, 2018, 2014, 2010, 2017, 2009, 2013, 2015, 2020, 2016, 1973, 2003, 2019, 2002, 2000, 2001, 2005, 2012, 2006, 2007, 2004, 1994, 1998, 1999, 1984, 'toyota', 'ford', 'dodge', 'chevrolet', 'gmc', 'chrysler', 'kia', 'buick', 'infiniti', 'mercedes-benz', 'jeep', 'bmw', 'cadillac', 'hyundai', 'heartland', 'jaguar', 'acura', 'honda', 'harley-davidson', 'audi', 'lincoln', 'lexus', 'nissan', 'land', 'maserati', 'peterbilt', 'ram', 'mazda', 'cruiser', 'se', 'mpv', 'door', '1500', 'pk', 'malibu', 'coupe', 'wagon', 'forte', 'encore', 'sorento', 'doors', 'chassis', 'q70', 'camaro', 'convertible', 'vans', 'srw', 'compass', 'enclave', '300', 'cherokee', 'pacifica', 'x3', 'equinox', 'challenger', 'm', 'colorado', 'focus', 'durango', 'escape', 'charger', 'explorer', 'f-150', '3500', 'caravan', 'van', 'dart', '2500', 'esv', 'cutaway', 'el', 'edge', 'series', 'flex', 'srx', 
'cab', 'pickup', 'vehicl', 'trax', 'tahoe', 'suburban', 'cargo', 'drw', 'fiesta', 'impala', 'soul', 'elantra', 'pioneer', 'trail', 'traverse', 'country', 'road/street', 'nautilus', 'gx', 'q5', 'gle', 'sportage', '5', 'sport', 'discovery', 'acadia', 'ghibli', 'glc', 'e-class', 'truck', 'utility', 'limited', 'sl-class', 'cx-3', '2500hd', 'sonic', 'corvette', 'mdx', 'xt5', 'fusion', 'mustang', 'passenger', 'volt', 'spark', 'cruze', 'ld', 'journey', 'transit', 'ranger', 'taurus', 'max', 'energi', 'expedition', 'bus', 'ecosport', 'f-750', 'd', 'dr', 'hybrid', 'suv', 'connect', 'f-650', 'sentra', 'altima', 'frontier', 'rogue', 'maxima', 'versa', 'note', 'armada', 'pathfinder', 'titan', 'sedan', 'juke', 'murano', 'xterra', 'kicks', 'xd', 'nvp']
# Hard-coded column order for the second dataset; fed to make_order_conversion
# below. NOTE(review): presumably a snapshot of list(df2.columns) — verify.
order2 = ['price', 'mileage', 'al', 'ak', 'az', 'ar', 'ca', 'co', 'ct', 'dc', 'de', 'fl', 'ga', 'hi', 'id', 'il', 'in', 'ia', 'ks', 'ky', 'la', 'me', 'md', 'ma', 'mi', 'mn', 'ms', 'mo', 'mt', 'nc', 'ne', 'nv', 'nj', 'nm', 'ny', 'nh', 'nd', 'oh', 'ok', 'or', 'pa', 'ri', 'sc', 'sd', 'tn', 'tx', 'ut', 'vt', 'va', 'wa', 'wv', 'wi', 'wy', 'black', 'silver', 'blue', 'red', 'white', 'gray', 'orange', 'brown', 'no_color', 'gold', 'charcoal', 'turquoise', 'beige', 'green', 'dark blue', 'phantom black', 'yellow', 'color:', 'toreador red', 'bright white clearcoat', 'billet silver metallic clearcoat', 'black clearcoat', 'jazz blue pearlcoat', 'purple', 'ruby red metallic tinted clearcoat', 'triple yellow tri-coat', 'competition orange', 'off-white', 'shadow black', 'magnetic metallic', 'ingot silver metallic', 'ruby red', 'royal crimson metallic tinted clearcoat', 'kona blue metallic', 'oxford white', 'lightning blue', 'ingot silver', 'white platinum tri-coat metallic', 'guard', 'tuxedo black metallic', 'tan', 'burgundy', 'super black', 'cayenne red', 'morningsky blue', 'pearl white', 'glacier white', 'grey', 'custom', 'automatic', 'manual', 'other', 'clean vehicle', 'salvage insurance', 'clean', 'rebuilt', 'lien', 'salvage', 'missing', 'parts only', 2008, 2011, 2018, 2014, 2010, 2017, 2009, 2013, 2015, 2020, 2016, 1973, 2003, 2019, 2002, 2000, 2001, 2005, 2012, 2006, 2007, 2004, 1994, 1998, 1999, 1984, 1996.0, 1976.0, 1988.0, 1991.0, 1968.0, 1986.0, 2021.0, 1990.0, 1965.0, 1937.0, 1997.0, 1982.0, 1948.0, 1939.0, 1992.0, 1985.0, 1995.0, 1955.0, 1947.0, 1979.0, 1966.0, 1936.0, 1964.0, 1983.0, 1956.0, 1962.0, 1958.0, 1989.0, 1971.0, 1981.0, 1978.0, 1987.0, 1993.0, 1977.0, 1951.0, 1980.0, 1933.0, 1970.0, 1934.0, 1969.0, 1963.0, 1946.0, 1960.0, 1957.0, 1932.0, 1967.0, 1928.0, 1972.0, 1961.0, 1954.0, 1940.0, 1949.0, 1975.0, 1923.0, 1929.0, 1974.0, 1959.0, 1953.0, 1930.0, 1941.0, 1931.0, 1938.0, 1950.0, 1952.0, 1927.0, 1926.0, 1919.0, 1921.0, 1916.0, 1924.0, 2022.0, 1942.0, 1925.0, 
1945.0, 'toyota', 'ford', 'dodge', 'chevrolet', 'gmc', 'chrysler', 'kia', 'buick', 'infiniti', 'mercedes-benz', 'jeep', 'bmw', 'cadillac', 'hyundai', 'heartland', 'jaguar', 'acura', 'honda', 'harley-davidson', 'audi', 'lincoln', 'lexus', 'nissan', 'land', 'maserati', 'peterbilt', 'ram', 'mazda', 'subaru', 'alfa-romeo', 'mitsubishi', 'volkswagen', 'saturn', 'mini', 'tesla', 'fiat', 'volvo', 'pontiac', 'mercury', 'porsche', 'rover', 'datsun', 'ferrari', 'land rover', 'aston-martin', 'morgan', 'town & country', 'frontier', 'grand cherokee laredo', 'civic', 'outback', 'liberty', 'tahoe', 'camry', 'maxima', 'silverado', 'enclave', 'f-150', 'explorer xlt', 'cts', '1500', 'silverado 1500', 'optima', 'accord', 'super duty f-550 drw', '2500', 'super duty f-450 drw', 'econoline cargo van', 'super duty f-350 drw', 'super duty f-250 srw', 'transit', 'sierra 1500', 'soul', 'corolla le', 'camry le', 'odyssey', 'escape', 'patriot', 'corolla', 'prius', 'mustang', 'tacoma', 'highlander', 'cruze', 'mdx', 'f250 super duty', 'taurus', 'sierra', 'sierra 2500hd', 'cr-v', 'camaro', 'tundra', 'express cargo van', 'rav4', 'tl', 'focus se', '4runner', 'altima', 'passat', 'murano', 'edge', 'equinox', 'suburban', 'grand cherokee', 'fusion se', 'sonata', 'ranger', 'f150', 'wrangler', 'corvette', 'grand caravan', 'a4', 'altima 2.5 s', 'yukon', 'accord ex', 'beetle', 'impala', 'rx 350', 'sentra', 'f350', 'civic lx', 'expedition', '3500', 'focus', 'santa fe', 'silverado 2500hd', 'mkz', 'durango', 'elantra', 'pathfinder', 'avalanche', '3', 'equinox lt', 'fusion', '328i', 'impala lt', 'sienna', 'jetta', '200', 'escape xlt', 'acadia', 'pt cruiser', 'cherokee', 'charger', 'trailblazer', 'traverse', 'forester', 'malibu', 'x5', 'dakota', 'malibu lt', 'accord ex-l', 'f-250', 'f-350', 'sienna le', 'rogue', 'f250', 'f350 super duty', 'fit', 'cruze lt', 'deville', 'colorado', 'impreza', 'accord lx', 'f-250 super duty', 'escalade', 'pilot', 'explorer', 'challenger', 'versa', 'wrangler unlimited', 'sorento', 
'legacy', 'civic ex', 'gas', 'diesel', 'hybrid', 'electric', 'mid-size', 'full-size', 'compact', 'sub-compact', 'mini-van', 'offroad', 'truck', 'SUV', 'coupe', 'sedan', 'wagon', 'convertible', 'hatchback', 'pickup', 'van', 'bus', 'excellent', 'good', 'like new', 'fair', 'new', '6 cylinders', '4 cylinders', '8 cylinders', '10 cylinders', '3 cylinders', '5 cylinders', '12 cylinders', 'birmingham', 'anchorage / mat-su', 'phoenix', 'tucson', 'fayetteville', 'little rock', 'bakersfield', 'fresno / madera', 'inland empire', 'los angeles', 'modesto', 'orange county', 'redding', 'reno / tahoe', 'sacramento', 'san diego', 'san luis obispo', 'SF bay area', 'stockton', 'ventura county', 'colorado springs', 'denver', 'fort collins / north CO', 'western slope', 'hartford', 'new haven', 'washington, DC', 'delaware', 'daytona beach', 'ft myers / SW florida', 'jacksonville', 'lakeland', 'ocala', 'orlando', 'sarasota-bradenton', 'south florida', 'space coast', 'tallahassee', 'tampa bay area', 'treasure coast', 'albany', 'atlanta', 'columbus', 'hawaii', 'boise', 'east idaho', "spokane / coeur d'alene", 'chicago', 'rockford', 'springfield', 'st louis, MO', 'indianapolis', 'richmond', 'south bend / michiana', 'des moines', 'omaha / council bluffs', 'kansas city, MO', 'wichita', 'lexington', 'louisville', 'new orleans', 'maine', 'baltimore', 'boston', 'western massachusetts', 'worcester / central MA', 'detroit metro', 'grand rapids', 'northern michigan', 'duluth / superior', 'fargo / moorhead', 'minneapolis / st paul', 'rochester', 'st cloud', 'billings', 'asheville', 'charlotte', 'greensboro', 'raleigh / durham / CH', 'las vegas', 'central NJ', 'jersey shore', 'north jersey', 'south jersey', 'albuquerque', 'buffalo', 'hudson valley', 'long island', 'new york city', 'syracuse', 'new hampshire', 'akron / canton', 'cincinnati', 'cleveland', 'dayton / springfield', 'oklahoma city', 'tulsa', 'bend', 'eugene', 'medford-ashland', 'portland', 'salem', 'lehigh valley', 'philadelphia', 
'pittsburgh', 'scranton / wilkes-barre', 'rhode island', 'charleston', 'greenville / upstate', 'knoxville', 'memphis', 'nashville', 'austin', 'dallas / fort worth', 'el paso', 'houston', 'mcallen / edinburg', 'san antonio', 'vermont', 'fredericksburg', 'norfolk / hampton roads', 'roanoke', 'seattle-tacoma', 'appleton-oshkosh-FDL', 'eau claire', 'green bay', 'madison', 'milwaukee', 'fwd', '4wd', 'rwd']

two_to_one = make_order_conversion(order2, order1)
one_to_two = make_order_conversion(order1, order2)

# Pass each model's counterfactuals through the opposite mapping
# (presumably into the other dataset's column layout — confirm against convert_input).
two_converted_cfs = convert_input(one_to_two, df2, np.asarray(augmented_instances[1]))
one_converted_cfs = convert_input(two_to_one, df1, np.asarray(augmented_instances[0]))

# Work on copies so the converted counterfactuals stay available unmodified.
one_converted_cfs_ = deepcopy(one_converted_cfs)
two_converted_cfs_ = deepcopy(two_converted_cfs)

# Rescale column 1 by the ratio of the two datasets' maximum mileage/odometer.
one_converted_cfs_[:,1] = one_converted_cfs_[:,1] * dataset2.odometer.max() / dataset1.mileage.max()
two_converted_cfs_[:,1] = two_converted_cfs_[:,1] * dataset1.mileage.max() / dataset2.odometer.max()

# Min-max scale every non-label column of the second set, using one global
# min/max taken over all of those columns together.
two_converted_cfs_[:,1:] = (two_converted_cfs_[:,1:] - np.amin(two_converted_cfs_[:,1:])) / (np.amax(two_converted_cfs_[:,1:]) - np.amin(two_converted_cfs_[:,1:]))

data1_ = torch.from_numpy(np.concatenate((deepcopy(data1), one_converted_cfs_)))
data2_ = torch.from_numpy(np.concatenate((deepcopy(data2), two_converted_cfs_)))

# Shuffle rows with an index permutation. random.shuffle must not be used on
# a 2-D tensor: it swaps rows via `x[i], x[j] = x[j], x[i]`, and because
# tensor rows are views the swap overwrites one row with the other, silently
# duplicating some rows and dropping others.
data1_ = data1_[torch.randperm(data1_.shape[0])]
data2_ = data2_[torch.randperm(data2_.shape[0])]

# Column 0 holds the label; the remaining columns are the features.
X1 = data1_[:,1:]
y1 = data1_[:,0]

X2 = data2_[:,1:]
y2 = data2_[:,0]

# Discretize labels: < or >= $20,000
# A value that is already exactly 1 is kept as 1; everything else is
# thresholded. Vectorised and in place, so y1/y2 stay views into
# data1_/data2_.
for Y in [y1, y2]:
  Y[:] = ((Y == 1) | (Y >= 20000)).to(Y.dtype)

X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.33,
                                                        random_state=0)
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.33,
                                                        random_state=0)

batchsize = 32
lr = 1.0
gamma = 0.7
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Features and labels are split into the same number of chunks so that the
# zipped batches stay aligned.
trainloader2 = list(zip(np.array_split(X2_train, X2_train.shape[0]//batchsize),
                        np.array_split(y2_train, X2_train.shape[0]//batchsize)))
testloader2 = list(zip(np.array_split(X2_test, X2_test.shape[0]//batchsize),
                       np.array_split(y2_test, X2_test.shape[0]//batchsize)))

In [None]:
# Fit a fresh decision tree on the counterfactual-augmented training split,
# then score it on the matching held-out split.
model1_augmented = DecisionTreeClassifier()
model1_augmented.fit(X1_train, y1_train)
test_DTree(model1_augmented, X1_test, y1_test)

0.9504587155963303

In [None]:
# Binary cross-entropy loss for the two-class (0/1) labels.
criterion2 = nn.BCELoss()

# Fresh network trained on the counterfactual-augmented batches.
model2_augmented_ = Net2()
optimizer2 = optim.Adam(params=model2_augmented_.parameters(), lr=1e-3)

train(
  model2_augmented_,
  device,
  trainloader2,
  testloader2,
  optimizer2,
  1000,  # presumably the number of epochs — confirm against train()
  criterion2,
)

Validation set accuracy on epoch  0 :  tensor(0.9331, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  1 :  tensor(0.9471, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  2 :  tensor(0.9574, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  3 :  tensor(0.9618, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  4 :  tensor(0.9662, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  5 :  tensor(0.9696, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  6 :  tensor(0.9729, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  7 :  tensor(0.9740, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  8 :  tensor(0.9732, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation set accuracy on epoch  9 :  tensor(0.9746, dtype=torch.float64, grad_fn=<DivBackward0>)
Validation

## 6.6 Test New Models

In [None]:
# Copy the saved model inputs so the in-place edits below leave them untouched.
test_data1_ = torch.from_numpy(deepcopy(model1_input))
test_data2_ = torch.from_numpy(deepcopy(model2_input))
# Scale column 1 by each dataset's maximum mileage/odometer reading.
test_data1_[:,1] = test_data1_[:,1]/(dataset1.mileage.max())
test_data2_[:,1] = test_data2_[:,1]/(dataset2.odometer.max())

# Column 0 holds the label; the remaining columns are the features.
test_X1 = test_data1_[:,1:]
test_y1 = test_data1_[:,0]

test_X2 = test_data2_[:,1:]
test_y2 = test_data2_[:,0]

# Discretize labels: < or >= $20,000
# Vectorised and in place, so test_y1/test_y2 stay views into
# test_data1_/test_data2_.
for Y in [test_y1, test_y2]:
  Y[:] = (Y >= 20000).to(Y.dtype)

batchsize = 32
gamma = 0.7
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Replaces the earlier validation loader with batches of the held-out test set.
testloader2 = list(zip(np.array_split(test_X2, test_X2.shape[0]//batchsize),
                        np.array_split(test_y2, test_y2.shape[0]//batchsize)))

In [None]:
# Score both augmented models on the held-out test data, tree first.
dtree_test_score = test_DTree(model1_augmented, test_X1, test_y1)
print(dtree_test_score)

net_test_score = test(model2_augmented_, testloader2)
print(net_test_score)

0.6757624398073836
tensor(0.8732, dtype=torch.float64, grad_fn=<DivBackward0>)


## 6.7 Save Augmented Models

In [None]:
# Decision tree: serialised with dump() into the Drive folder.
model1_name = 'model1_augmented2.joblib'
dump(model1_augmented, f"{folder_path}{model1_name}")

# Network: only the state_dict is stored; despite the .h5 suffix the file is
# written by torch.save.
model2_name = 'model2_augmented2.h5'
torch.save(model2_augmented_.state_dict(), f"{folder_path}{model2_name}")

# 7. Conclusion

The second experiment also demonstrates the ability of models built with different algorithms to distill knowledge between each other.