In [55]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from collections import Counter
from sklearn.decomposition import PCA

%matplotlib inline

In [30]:
# Load the comma-separated dataset from a path relative to the working
# directory, then preview the first rows.
data = pd.read_csv('data2.csv', delimiter=',')
data.head()

Unnamed: 0,RefId,IsBadBuy,PurchDate,Auction,VehYear,VehicleAge,Make,Model,Trim,SubModel,...,MMRCurrentRetailAveragePrice,MMRCurrentRetailCleanPrice,PRIMEUNIT,AUCGUART,BYRNO,VNZIP1,VNST,VehBCost,IsOnlineSale,WarrantyCost
0,1,0,12/7/2009,ADESA,2006,3,MAZDA,MAZDA3,i,4D SEDAN I,...,11597.0,12409.0,,,21973,33619,FL,7100.0,0,1113
1,2,0,12/7/2009,ADESA,2004,5,DODGE,1500 RAM PICKUP 2WD,ST,QUAD CAB 4.7L SLT,...,11374.0,12791.0,,,19638,33619,FL,7600.0,0,1053
2,3,0,12/7/2009,ADESA,2005,4,DODGE,STRATUS V6,SXT,4D SEDAN SXT FFV,...,7146.0,8702.0,,,19638,33619,FL,4900.0,0,1389
3,4,0,12/7/2009,ADESA,2004,5,DODGE,NEON,SXT,4D SEDAN,...,4375.0,5518.0,,,19638,33619,FL,4100.0,0,630
4,5,0,12/7/2009,ADESA,2005,4,FORD,FOCUS,ZX3,2D COUPE ZX3,...,6739.0,7911.0,,,19638,33619,FL,4000.0,0,1020


In [3]:
# Summarise column dtypes and non-null counts of the freshly loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72983 entries, 0 to 72982
Data columns (total 34 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   RefId                              72983 non-null  int64  
 1   IsBadBuy                           72983 non-null  int64  
 2   PurchDate                          72983 non-null  object 
 3   Auction                            72983 non-null  object 
 4   VehYear                            72983 non-null  int64  
 5   VehicleAge                         72983 non-null  int64  
 6   Make                               72983 non-null  object 
 7   Model                              72983 non-null  object 
 8   Trim                               70623 non-null  object 
 9   SubModel                           72975 non-null  object 
 10  Color                              72975 non-null  object 
 11  Transmission                       72974 non-null  obj

In [23]:
# Frequency table of the averaged price column; dropna=False keeps NaN as its
# own bucket so missing values are visible.
# NOTE(review): 'AuctionAve' is only created further down (the cell that
# averages the MMR price columns). On a fresh Restart & Run All this cell would
# raise KeyError here - consider moving it below that cell.
data['AuctionAve'].value_counts(dropna=False)

0.000        338
NaN          315
6459.250     270
6569.750     181
7405.750     138
            ... 
5944.750       1
13396.625      1
5014.125       1
7220.375       1
11248.375      1
Name: AuctionAve, Length: 43144, dtype: int64

In [31]:
# Columns removed before modelling.
# NOTE(review): presumably identifiers, free-text / high-cardinality fields and
# sparsely populated columns - confirm the selection rationale.
columns_to_drop = [
    'RefId', 'VehYear', 'Model', 'Trim', 'SubModel', 'Color',
    'WheelType', 'PRIMEUNIT', 'AUCGUART', 'VNZIP1', 'PurchDate',
]
# Rebind instead of mutating in place, matching how the price columns are
# dropped later in the notebook (`data = data.drop(...)`).
data = data.drop(columns=columns_to_drop)

In [32]:
# Preview the frame after the column drop.
data.head()

Unnamed: 0,IsBadBuy,Auction,VehicleAge,Make,Transmission,WheelTypeID,VehOdo,Nationality,Size,TopThreeAmericanName,...,MMRAcquisitonRetailCleanPrice,MMRCurrentAuctionAveragePrice,MMRCurrentAuctionCleanPrice,MMRCurrentRetailAveragePrice,MMRCurrentRetailCleanPrice,BYRNO,VNST,VehBCost,IsOnlineSale,WarrantyCost
0,0,ADESA,3,MAZDA,AUTO,1.0,89046,OTHER ASIAN,MEDIUM,OTHER,...,13600.0,7451.0,8552.0,11597.0,12409.0,21973,FL,7100.0,0,1113
1,0,ADESA,5,DODGE,AUTO,1.0,93593,AMERICAN,LARGE TRUCK,CHRYSLER,...,12572.0,7456.0,9222.0,11374.0,12791.0,19638,FL,7600.0,0,1053
2,0,ADESA,4,DODGE,AUTO,2.0,73807,AMERICAN,MEDIUM,CHRYSLER,...,8457.0,4035.0,5557.0,7146.0,8702.0,19638,FL,4900.0,0,1389
3,0,ADESA,5,DODGE,AUTO,1.0,65617,AMERICAN,COMPACT,CHRYSLER,...,5690.0,1844.0,2646.0,4375.0,5518.0,19638,FL,4100.0,0,630
4,0,ADESA,4,FORD,MANUAL,2.0,69367,AMERICAN,COMPACT,FORD,...,8707.0,3247.0,4384.0,6739.0,7911.0,19638,FL,4000.0,0,1020


In [33]:
# The eight MMR price columns that are collapsed into a single feature.
# ('MMRAcquisitonRetailCleanPrice' is spelled exactly as in the dataset header.)
auction_averages = [
    'MMRAcquisitionAuctionAveragePrice',
    'MMRAcquisitionAuctionCleanPrice',
    'MMRAcquisitionRetailAveragePrice',
    'MMRAcquisitonRetailCleanPrice',
    'MMRCurrentAuctionAveragePrice',
    'MMRCurrentAuctionCleanPrice',
    'MMRCurrentRetailAveragePrice',
    'MMRCurrentRetailCleanPrice',
]

# Row-wise arithmetic mean of the price columns. Plain `+` is used on purpose:
# a NaN in any one of the columns makes the whole row's average NaN.
price_total = 0
for price_col in auction_averages:
    price_total = price_total + data[price_col]
data['AuctionAve'] = price_total / len(auction_averages)

# The individual price columns are no longer needed once averaged.
data = data.drop(columns=auction_averages)

In [34]:
# Preview the frame now that the price columns are replaced by 'AuctionAve'.
data.head()

Unnamed: 0,IsBadBuy,Auction,VehicleAge,Make,Transmission,WheelTypeID,VehOdo,Nationality,Size,TopThreeAmericanName,BYRNO,VNST,VehBCost,IsOnlineSale,WarrantyCost,AuctionAve
0,0,ADESA,3,MAZDA,AUTO,1.0,89046,OTHER ASIAN,MEDIUM,OTHER,21973,FL,7100.0,0,1113,10403.625
1,0,ADESA,5,DODGE,AUTO,1.0,93593,AMERICAN,LARGE TRUCK,CHRYSLER,19638,FL,7600.0,0,1053,9943.625
2,0,ADESA,4,DODGE,AUTO,2.0,73807,AMERICAN,MEDIUM,CHRYSLER,19638,FL,4900.0,0,1389,6100.25
3,0,ADESA,5,DODGE,AUTO,1.0,65617,AMERICAN,COMPACT,CHRYSLER,19638,FL,4100.0,0,630,3662.375
4,0,ADESA,4,FORD,MANUAL,2.0,69367,AMERICAN,COMPACT,FORD,19638,FL,4000.0,0,1020,5959.75


In [35]:
# Features treated as continuous (imputed with the median further down).
num_cols = [
    'VehicleAge',
    'VehOdo',
    'VehBCost',
    'WarrantyCost',
    'AuctionAve',
]

# Features treated as categorical (imputed with the mode, then one-hot encoded).
nominal_cols = [
    'Auction',
    'Make',
    'Transmission',
    'WheelTypeID',
    'Nationality',
    'Size',
    'TopThreeAmericanName',
    'BYRNO',
    'VNST',
    'IsOnlineSale',
]

In [36]:
# Numeric features: fill gaps with the column median. The medians are computed
# up front; the columns are independent, so the result is the same as computing
# each one just before filling.
column_medians = {name: data[name].median() for name in num_cols}
for name, median_value in column_medians.items():
    data[name] = data[name].fillna(median_value)

# Categorical features: fill gaps with the most frequent value. `mode()` can
# return several values on a tie; the first one is used.
for name in nominal_cols:
    data[name] = data[name].fillna(data[name].mode()[0])

In [37]:
# Check that no column has missing values left after imputation.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72983 entries, 0 to 72982
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   IsBadBuy              72983 non-null  int64  
 1   Auction               72983 non-null  object 
 2   VehicleAge            72983 non-null  int64  
 3   Make                  72983 non-null  object 
 4   Transmission          72983 non-null  object 
 5   WheelTypeID           72983 non-null  float64
 6   VehOdo                72983 non-null  int64  
 7   Nationality           72983 non-null  object 
 8   Size                  72983 non-null  object 
 9   TopThreeAmericanName  72983 non-null  object 
 10  BYRNO                 72983 non-null  int64  
 11  VNST                  72983 non-null  object 
 12  VehBCost              72983 non-null  float64
 13  IsOnlineSale          72983 non-null  int64  
 14  WarrantyCost          72983 non-null  int64  
 15  AuctionAve         

In [38]:
# One-hot encode every column listed in nominal_cols; each of those columns is
# replaced by one indicator column per distinct value.
data = pd.get_dummies(data, columns = nominal_cols)

In [39]:
# Inspect the width and memory footprint of the encoded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72983 entries, 0 to 72982
Columns: 182 entries, IsBadBuy to IsOnlineSale_1
dtypes: float64(2), int64(4), uint8(176)
memory usage: 15.6 MB


In [40]:
# Target vector: the IsBadBuy column (a pandas Series that keeps the row index).
y = data['IsBadBuy']
# Feature matrix: every remaining column. `drop` returns a new frame, so `data`
# itself is left untouched.
X = data.drop(columns='IsBadBuy')

In [41]:
# Confirm the target column is gone from the feature matrix.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72983 entries, 0 to 72982
Columns: 181 entries, VehicleAge to IsOnlineSale_1
dtypes: float64(2), int64(3), uint8(176)
memory usage: 15.0 MB


In [44]:
# Fit a 7-component PCA on the full feature matrix and project onto it.
# NOTE(review): no scaling step is visible before this cell, so large-magnitude
# columns likely dominate the components - confirm this is intended.
# NOTE(review): X is rebound to the projected data, so re-running this cell on
# its own would refit on already-reduced data.
pca = PCA(n_components=7).fit(X)
X = pca.transform(X)

In [45]:
def eucl_distance(x, y):
    """Return the Euclidean (L2) distance between two points.

    Both arguments are converted with ``np.array``, so lists, tuples and
    arrays are accepted. The squared differences are summed over every
    element before the square root is taken.
    """
    delta = np.array(x) - np.array(y)
    squared = delta ** 2
    return np.sqrt(squared.sum())

In [53]:
def get_neighbors(train_data, train_labels, inst, k):
    """Return the ``k`` training points closest to ``inst``.

    Parameters
    ----------
    train_data : array-like
        One entry (row) per training instance.
    train_labels : sequence
        Class labels aligned *by position* with ``train_data``. A pandas
        Series is read positionally, so a shuffled index (e.g. the output of
        ``train_test_split``) is handled correctly.
    inst : array-like
        The query point, with the same number of features as a training row.
    k : int
        Number of neighbours to return.

    Returns
    -------
    list of tuple
        ``(row_position, distance, label)`` tuples sorted by ascending
        Euclidean distance. Equidistant points keep their original row order.

    Raises
    ------
    ValueError
        If ``train_data`` and ``train_labels`` differ in length.
    """
    # Labels come from the argument, by position - never from a global.
    labels = np.asarray(train_labels)
    n_train = len(labels)
    if n_train == 0:
        return []

    points = np.asarray(train_data)
    if len(points) != n_train:
        raise ValueError(
            "train_data and train_labels must have the same length "
            f"(got {len(points)} and {n_train})"
        )

    # Flatten each instance so scalar and multi-dimensional rows both work,
    # then compute every distance in one vectorised pass.
    points = points.reshape(n_train, -1)
    query = np.asarray(inst).reshape(1, -1)
    distances = np.sqrt(((points - query) ** 2).sum(axis=1))

    # A stable sort keeps ties in original row order.
    nearest = np.argsort(distances, kind="stable")[:k]
    return [(int(pos), distances[pos], labels[pos]) for pos in nearest]

In [47]:
def get_majority(neighbours):
    """Return the most frequent class label among ``neighbours``.

    Each neighbour is an indexable record whose element at position 2 is its
    class label. When several labels share the highest count, the one that
    appears first in ``neighbours`` is returned.
    """
    tally = Counter(entry[2] for entry in neighbours)
    ranked = tally.most_common()
    top_label, _ = ranked[0]
    return top_label

In [48]:
def vote_distance_weights(neighbors):
    """Pick a class by distance-weighted voting.

    Each neighbour is an indexable record holding its distance at position 1
    and its class label at position 2. A neighbour contributes
    ``1 / (distance**2 + 1)`` votes to its label.

    Returns
    -------
    tuple
        ``(winning_label, share)``, where ``share`` is the winner's fraction
        of all votes cast.
    """
    tally = Counter()
    for entry in neighbors:
        dist, label = entry[1], entry[2]
        tally[label] += 1 / (dist**2 + 1)

    # Ranked from most to fewest votes; the vote column gives the total.
    ranked = tally.most_common()
    _, votes = zip(*ranked)
    winner, winner_votes = ranked[0]
    return winner, winner_votes / sum(votes)

In [49]:
# Hold out 20% of the rows for evaluation; random_state pins the shuffle so the
# split is reproducible.
# NOTE: y is a pandas Series, so y_train / y_test keep the original (now
# shuffled) row index. Downstream code must read them by position, not by label.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [56]:
# Classify every held-out point with a majority vote over its 2 nearest
# training neighbours.
# NOTE(review): with an even k, a 1-1 split is decided by the order in which
# get_majority encounters the labels - confirm k=2 is intended.
K_NEIGHBORS = 2

y_pred = []
for row_idx in range(len(y_test)):
    query_point = X_test[row_idx]
    nearest = get_neighbors(X_train, y_train, query_point, K_NEIGHBORS)
    y_pred.append(get_majority(nearest))

KeyboardInterrupt: 

In [None]:
# Number of misclassified test points: positions where prediction and truth
# differ. Both sides are compared by position.
mismatches = np.asarray(y_pred) != np.asarray(y_test)
count = np.count_nonzero(mismatches)

# Per-class precision / recall / F1 on the held-out set.
report = classification_report(y_test, y_pred)
print(report)