## From Scratch

In [1]:
%load_ext autoreload
%autoreload 2
from pathlib import Path
home = str(Path.home())

import pandas as pd
import numpy as np
from scipy.special import logsumexp
df = pd.read_csv(
    f"{home}/Project/UserCarData.csv",index_col=0
)

In [2]:
def bin_column(series):
    return pd.qcut(series, [0, .125, .25, .375, .5, .625, .75, .875, 1.0])

def compute_priors(y):
    priors = dict(y.value_counts(normalize=True))
    new_priors = {}
    for i in priors:
        new_priors[y.name + '=' + str(i)] = priors[i]
    return dict(sorted(new_priors.items()))

def specific_class_conditional(x,xv,y,yv):
    df = pd.concat([x, y], axis=1)
    try:
        prob = len(df.loc[(x == xv) & (y == yv)])/len(df[y==yv])
    except(ZeroDivisionError):
        prob = .5
    return prob

def class_conditional(X,y):
    probs = {}
    for col in X.columns:
        for val in np.sort(X[col].unique()):
            key = str(col) + "=" + str(val) + "|transmission=Manual"
            entry = specific_class_conditional(X[col], val, y, "Manual")
            probs[key] = entry
        
            key = str(col) + "=" + str(val) + "|transmission=Automatic"
            entry = specific_class_conditional(X[col], val, y, "Automatic")
            probs[key] = entry
    return probs

def posteriors(probs,priors,x):
    post_probs = {}
    try:
        a = probs["name=" + str(x["name"]) + "|transmission=Manual"]
        b = probs["year=" + str(x["year"]) + "|transmission=Manual"]
        c = probs["km_driven=" + str(x["km_driven"]) + "|transmission=Manual"]
        d = probs["fuel=" + str(x["fuel"]) + "|transmission=Manual"]
        e = probs["selling_price=" + str(x["selling_price"]) + "|transmission=Manual"]
        f = probs["seller_type=" + str(x["seller_type"]) + "|transmission=Manual"]
        g = probs["owner=" + str(x["owner"]) + "|transmission=Manual"]
        h = probs["mileage=" + str(x["mileage"]) + "|transmission=Manual"]
        i = probs["engine=" + str(x["engine"]) + "|transmission=Manual"]
        j = probs["max_power=" + str(x["max_power"]) + "|transmission=Manual"]

        k = probs["name=" + str(x["name"]) + "|transmission=Automatic"]
        l = probs["year=" + str(x["year"]) + "|transmission=Automatic"]
        m = probs["km_driven=" + str(x["km_driven"]) + "|transmission=Automatic"]
        n = probs["fuel=" + str(x["fuel"]) + "|transmission=Automatic"]
        o = probs["selling_price=" + str(x["selling_price"]) + "|transmission=Automatic"]
        p = probs["seller_type=" + str(x["seller_type"]) + "|transmission=Automatic"]
        q = probs["owner=" + str(x["owner"]) + "|transmission=Automatic"]
        r = probs["mileage=" + str(x["mileage"]) + "|transmission=Automatic"]
        s = probs["engine=" + str(x["engine"]) + "|transmission=Automatic"]
        t = probs["max_power=" + str(x["max_power"]) + "|transmission=Automatic"]

        numerator = a * b * c * d * e * f * g * h * i * j * priors["transmission=Manual"]

        denominator = numerator + k * l * m * n * o * p * q * r * s * t * priors["transmission=Automatic"]

        post_probs['transmission=Manual|name=' + 
                   str(x['name']) + 
                   ',year=' + str(x['year']) + 
                   ',km_driven=' + str(x['km_driven']) +
                   ',fuel=' + str(x['fuel']) +
                   ',selling_price=' + str(x['selling_price']) +
                   ',seller_type=' + str(x['seller_type']) +
                   ',owner=' + str(x['owner']) +
                   ',mileage=' + str(x['mileage']) +
                   ',engine=' + str(x['engine']) +
                   ',max_power=' + str(x['max_power'])] = numerator/denominator
        post_probs['transmission=Automatic|name=' + 
                   str(x['name']) + 
                   ',year=' + str(x['year']) + 
                   ',km_driven=' + str(x['km_driven']) +
                   ',fuel=' + str(x['fuel']) +
                   ',selling_price=' + str(x['selling_price']) +
                   ',seller_type=' + str(x['seller_type']) +
                   ',owner=' + str(x['owner']) +
                   ',mileage=' + str(x['mileage']) +
                   ',engine=' + str(x['engine']) +
                   ',max_power=' + str(x['max_power'])] = 1 - numerator/denominator
        return post_probs
    
    except(KeyError):
        post_probs['transmission=Manual|name=' + 
                   str(x['name']) + 
                   ',year=' + str(x['year']) + 
                   ',km_driven=' + str(x['km_driven']) +
                   ',fuel=' + str(x['fuel']) +
                   ',selling_price=' + str(x['selling_price']) +
                   ',seller_type=' + str(x['seller_type']) +
                   ',owner=' + str(x['owner']) +
                   ',mileage=' + str(x['mileage']) +
                   ',engine=' + str(x['engine']) +
                   ',max_power=' + str(x['max_power'])] = .5
        post_probs['transmission=Automatic|name=' + 
                   str(x['name']) + 
                   ',year=' + str(x['year']) + 
                   ',km_driven=' + str(x['km_driven']) +
                   ',fuel=' + str(x['fuel']) +
                   ',selling_price=' + str(x['selling_price']) +
                   ',seller_type=' + str(x['seller_type']) +
                   ',owner=' + str(x['owner']) +
                   ',mileage=' + str(x['mileage']) +
                   ',engine=' + str(x['engine']) +
                   ',max_power=' + str(x['max_power'])] = .5  
        return post_probs

def train_test_split(X,y,test_frac=0.5):
    inxs = list(range(len(y)))
    np.random.shuffle(inxs)
    X = X.iloc[inxs,:]
    y = y.iloc[inxs]
    Xtrain = X.iloc[:round(len(X)/2)]
    Xtest = X.iloc[round(len(X)/2):]
    ytrain = y.iloc[:round(len(y)/2)]
    ytest = y.iloc[round(len(y)/2):]
    return Xtrain,ytrain,Xtest,ytest

def exercise_6(Xtrain,ytrain,Xtest,ytest):
    probs = class_conditional(Xtrain,ytrain)
    priors = compute_priors(ytrain)
    posts = {}
    for i in Xtest.index:
        x = Xtest.loc[i]
        post = posteriors(probs,priors,x)
        posts.update(post)

    correct = 0
    for i in Xtest.index:
        x = Xtest.loc[i]
        manual = posts['transmission=Manual|name=' + str(x['name']) + 
                   ',year=' + str(x['year']) + 
                   ',km_driven=' + str(x['km_driven']) +
                   ',fuel=' + str(x['fuel']) +
                   ',selling_price=' + str(x['selling_price']) +
                   ',seller_type=' + str(x['seller_type']) +
                   ',owner=' + str(x['owner']) +
                   ',mileage=' + str(x['mileage']) +
                   ',engine=' + str(x['engine']) +
                   ',max_power=' + str(x['max_power'])]
        automatic = posts['transmission=Automatic|name=' + str(x['name']) + 
                   ',year=' + str(x['year']) + 
                   ',km_driven=' + str(x['km_driven']) +
                   ',fuel=' + str(x['fuel']) +
                   ',selling_price=' + str(x['selling_price']) +
                   ',seller_type=' + str(x['seller_type']) +
                   ',owner=' + str(x['owner']) +
                   ',mileage=' + str(x['mileage']) +
                   ',engine=' + str(x['engine']) +
                   ',max_power=' + str(x['max_power'])]
        if (manual >= automatic) and ytest.loc[i] == "Manual":
            correct += 1
        elif (manual < automatic) and ytest.loc[i] == "Automatic":
            correct += 1
    accuracy = correct/len(Xtest)
    return accuracy

def exercise_7(Xtrain,ytrain,Xtest,ytest, npermutations = 10):
    # initialize what we are going to return
    importances = {}
    for col in Xtrain.columns:
        importances[col] = 0
    # find the original accuracy
    orig_accuracy = exercise_6(Xtrain,ytrain,Xtest,ytest)
    # now carray out the feature importance work
    for col in Xtrain.columns:
        for perm in range(npermutations):
            Xtest2 = Xtest.copy()
            Xtest2[col] = Xtest[col].sample(frac=1, replace=False).values
            new_accuracy = exercise_6(Xtrain,ytrain,Xtest2,ytest)
            importances[col] = importances[col] + (orig_accuracy - new_accuracy)
        importances[col] = importances[col]/npermutations
    return importances

def exercise_8(Xtrain,ytrain,Xtest,ytest, npermutations = 20):
    # initialize what we are going to return
    importances = {}
    for col in Xtrain.columns:
        importances[col] = 0
    # find the original accuracy
    orig_accuracy = exercise_6(Xtrain,ytrain,Xtest,ytest)
    # now carray out the feature importance work
    for col in Xtrain.columns:
        for perm in range(npermutations):
            Xtrain2 = Xtrain.copy()
            Xtrain2[col] = Xtrain[col].sample(frac=1, replace=False).values
            new_accuracy = exercise_6(Xtrain2,ytrain,Xtest,ytest)
            importances[col] = importances[col] + (orig_accuracy - new_accuracy)
        importances[col] = importances[col]/npermutations
    return importances

In [3]:
import pandas as pd
import numpy as np
df = pd.read_csv(
    f"{home}/Project/UserCarData.csv",index_col=0
)

features = ['name','year','km_driven','fuel','selling_price', 'seller_type','owner','mileage','engine','max_power']
X = df[features]
X = X.reset_index().drop("Sales_ID", axis=1)
X["selling_price"].loc[:] = round(X["selling_price"] * .013, 0)
t = df['transmission']
t = t.reset_index().drop("Sales_ID", axis=1)["transmission"]
X.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["selling_price"].loc[:] = round(X["selling_price"] * .013, 0)


Unnamed: 0,name,year,km_driven,fuel,selling_price,seller_type,owner,mileage,engine,max_power
0,Maruti,2014,145500,Diesel,5850,Individual,First_Owner,23.4,1248,74.0
1,Skoda,2014,120000,Diesel,4810,Individual,Second_Owner,21.14,1498,103.52
2,Honda,2006,140000,Petrol,2054,Individual,Third_Owner,17.7,1497,78.0
3,Hyundai,2010,127000,Diesel,2925,Individual,First_Owner,23.0,1396,90.0
4,Maruti,2007,120000,Petrol,1690,Individual,First_Owner,16.1,1298,88.2


In [4]:
for col in ["km_driven", "selling_price", "mileage", "engine", "max_power"]:
    X[col] = bin_column(X[col])
X.head()

Unnamed: 0,name,year,km_driven,fuel,selling_price,seller_type,owner,mileage,engine,max_power
0,Maruti,2014,"(120000.0, 2360457.0]",Diesel,"(4550.0, 5850.0]",Individual,First_Owner,"(22.32, 24.04]","(1199.0, 1248.0]","(68.05, 74.0]"
1,Skoda,2014,"(95425.0, 120000.0]",Diesel,"(4550.0, 5850.0]",Individual,Second_Owner,"(20.706, 22.32]","(1461.0, 1582.0]","(102.0, 130.0]"
2,Honda,2006,"(120000.0, 2360457.0]",Petrol,"(389.999, 2340.0]",Individual,Third_Owner,"(16.78, 18.2]","(1461.0, 1582.0]","(74.0, 82.0]"
3,Hyundai,2010,"(120000.0, 2360457.0]",Diesel,"(2340.0, 3510.0]",Individual,First_Owner,"(22.32, 24.04]","(1248.0, 1461.0]","(88.73, 102.0]"
4,Maruti,2007,"(95425.0, 120000.0]",Petrol,"(389.999, 2340.0]",Individual,First_Owner,"(14.4, 16.78]","(1248.0, 1461.0]","(82.0, 88.73]"


In [5]:
# specific_class_conditional(X["year"], 2014, t, "Manual")

# priors = compute_priors(t)
# probs = class_conditional(X,t)
# x = X.loc[16]
# posteriors(probs, priors, x)

Xtrain, ytrain, Xtest, ytest = train_test_split(X, t)
exercise_6(Xtrain, ytrain, Xtest, ytest)

0.9172780166961801

In [16]:
#Model is only slightly more accurate than if we were to guess manual for every transmission
print("Manual for every prediction accuracy:", sum(t == "Manual")/len(t))

Manual for every prediction accuracy: 0.8683278522641033


In [7]:
exercise_7(Xtrain, ytrain, Xtest, ytest)

  ',max_power=' + str(x['max_power'])] = numerator/denominator
  ',max_power=' + str(x['max_power'])] = 1 - numerator/denominator
  ',max_power=' + str(x['max_power'])] = numerator/denominator
  ',max_power=' + str(x['max_power'])] = 1 - numerator/denominator
  ',max_power=' + str(x['max_power'])] = numerator/denominator
  ',max_power=' + str(x['max_power'])] = 1 - numerator/denominator
  ',max_power=' + str(x['max_power'])] = numerator/denominator
  ',max_power=' + str(x['max_power'])] = 1 - numerator/denominator
  ',max_power=' + str(x['max_power'])] = numerator/denominator
  ',max_power=' + str(x['max_power'])] = 1 - numerator/denominator
  ',max_power=' + str(x['max_power'])] = numerator/denominator
  ',max_power=' + str(x['max_power'])] = 1 - numerator/denominator
  ',max_power=' + str(x['max_power'])] = numerator/denominator
  ',max_power=' + str(x['max_power'])] = 1 - numerator/denominator
  ',max_power=' + str(x['max_power'])] = numerator/denominator
  ',max_power=' + str(x['ma

{'name': 0.042221097900328894,
 'year': 0.002504427017455124,
 'km_driven': 0.002934480141664586,
 'fuel': 0.0009359979762206261,
 'selling_price': -0.0015431317986339232,
 'seller_type': 0.007285605868960298,
 'owner': -0.0019984821654439712,
 'mileage': -0.0008095117632177962,
 'engine': 0.0012395648874273025,
 'max_power': 0.005388312673918561}

In [8]:
exercise_8(Xtrain, ytrain, Xtest, ytest)

  ',max_power=' + str(x['max_power'])] = numerator/denominator
  ',max_power=' + str(x['max_power'])] = 1 - numerator/denominator
  ',max_power=' + str(x['max_power'])] = numerator/denominator
  ',max_power=' + str(x['max_power'])] = 1 - numerator/denominator
  ',max_power=' + str(x['max_power'])] = numerator/denominator
  ',max_power=' + str(x['max_power'])] = 1 - numerator/denominator
  ',max_power=' + str(x['max_power'])] = numerator/denominator
  ',max_power=' + str(x['max_power'])] = 1 - numerator/denominator
  ',max_power=' + str(x['max_power'])] = numerator/denominator
  ',max_power=' + str(x['max_power'])] = 1 - numerator/denominator
  ',max_power=' + str(x['max_power'])] = numerator/denominator
  ',max_power=' + str(x['max_power'])] = 1 - numerator/denominator
  ',max_power=' + str(x['max_power'])] = numerator/denominator
  ',max_power=' + str(x['max_power'])] = 1 - numerator/denominator
  ',max_power=' + str(x['max_power'])] = numerator/denominator
  ',max_power=' + str(x['ma

{'name': 0.012458891980774123,
 'year': 0.0026056159878573458,
 'km_driven': -0.002504427017455085,
 'fuel': -0.00032886415380720677,
 'selling_price': -0.005236529218315156,
 'seller_type': 0.006716417910447775,
 'owner': -0.003807235011383736,
 'mileage': -0.0019099418163419923,
 'engine': 0.0012016190235264678,
 'max_power': 0.007677713129268943}

## SKLearn

In [9]:
#Create dummy variables
X2 = pd.get_dummies(X, columns=['name', 'km_driven', 'fuel', 'selling_price', 'seller_type', 'owner', 'mileage', 'engine', 'max_power'])

In [11]:
#Training
X_train, y_train, X_test, y_test = train_test_split(X2, t)

import numpy as np
from sklearn.naive_bayes import CategoricalNB
clf = CategoricalNB()
clf.fit(X_train, y_train)

CategoricalNB()

In [15]:
#Accuracy
count = 0
for i in X_test.index:
    try:
        if clf.predict(X2[i:i+1]) == t[i]:
            count += 1
    except(IndexError):
        count+=0
        
print("Sklearn accuracy:", count/len(y_train))

Sklearn accuracy: 0.9094358714900076


## Comparison

In [13]:
#Scratch

tracker = []
for _ in range(10):
    Xtrain, ytrain, Xtest, ytest = train_test_split(X, t)
    tracker += [exercise_6(Xtrain, ytrain, Xtest, ytest)]

  ',max_power=' + str(x['max_power'])] = numerator/denominator
  ',max_power=' + str(x['max_power'])] = 1 - numerator/denominator


In [17]:
#Accuracy
count = 0
for i in X_test.index:
    try:
        if clf.predict(X2[i:i+1]) == t[i]:
            count += 1
    except(IndexError):
        count+=0
        
print("Sklearn accuracy:", count/len(y_train))
print("scratch accuracy:", sum(tracker)/len(tracker))
print("Manual for every prediction accuracy:", sum(t == "Manual")/len(t))

Sklearn accuracy: 0.9094358714900076
scratch accuracy: 0.9067543637743487
Manual for every prediction accuracy: 0.8683278522641033
