Here I'm going to basically try to predict the missing values in the "Market Category" column

In [1]:
import numpy as np
import pandas as pd
from numpy import mean, std
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import RepeatedKFold
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv('data.csv')

In [3]:
df['Engine Fuel Type'].fillna(method='ffill', inplace=True)
df['Engine HP'].fillna(df['Engine HP'].median(), inplace = True)
df['Engine Cylinders'].fillna(df['Engine Cylinders'].mean(), inplace = True)
df['Number of Doors'].fillna(method='ffill', inplace = True)
df.drop('Model', axis=1, inplace=True)

In [4]:
#we need to drop the model of the car
#to OHE categorical columns

categorical_columns = ['Make', 'Engine Fuel Type', 'Transmission Type', 'Driven_Wheels', 'Vehicle Size', 'Vehicle Style']

#encoding of categorical columns
for column in categorical_columns:
    tempdf = pd.get_dummies(df[column],  prefix=column)
    df = pd.concat([df, tempdf], axis=1)

    df.drop(column, axis=1, inplace=True)

In [5]:
#Market Category
#this is a categorical column
lst = df['Market Category'].unique()

category_list = []
#loop extracts unique label for the market category column 
for i in lst:
    if isinstance(i, str):
        if ',' in i:
            new_list = i.split(',')
            for j in new_list:
                if j in category_list:
                    print(j, 'already found')
                    continue
                else:
                    category_list.append(j)
                    print(j, 'added')
        else:
            if i in category_list:
                print(i, 'already found')
                continue
            else:
                category_list.append(i)
                print(i, 'added')


Factory Tuner added
Luxury added
High-Performance added
Luxury already found
Performance added
Luxury already found
High-Performance already found
Luxury already found
Performance already found
Flex Fuel added
Flex Fuel already found
Performance already found
Hatchback added
Hatchback already found
Luxury already found
Performance already found
Hatchback already found
Luxury already found
Luxury already found
High-Performance already found
Hybrid added
Diesel added
Luxury already found
Hatchback already found
Performance already found
Hatchback already found
Factory Tuner already found
Performance already found
High-Performance already found
Factory Tuner already found
High-Performance already found
Exotic added
High-Performance already found
Exotic already found
Factory Tuner already found
High-Performance already found
Factory Tuner already found
Performance already found
Crossover added
Exotic already found
Luxury already found
Exotic already found
Luxury already found
High-Performa

In [6]:
#create new df containg rows with missing market category (test_df)
prediction_df = df.loc[df['Market Category'].isna()]
#drop rows with missing market category
df.drop(df.index[df['Market Category'].isna()], inplace=True)
#drop market category column
prediction_df.drop('Market Category', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prediction_df.drop('Market Category', axis=1, inplace=True)


In [7]:
#output column ('y')
df_y = df.pop('Market Category')
#changing format of column rows into one that is easier to encode
df_y = df_y.apply(lambda x: [x])

In [8]:
df = df.reset_index()
#df_y = df_y.reset_index()

In [9]:
#custom OHE function for y_column

def ohe_list(df_col, categories):
    #create a dictionary map
    label_to_int = dict((c, i) for i, c in enumerate(categories))
    int_to_label = dict((i, c) for i, c in enumerate(categories))

    #encode to integer
    label_encoded = [[[label_to_int[label] for label in cell.split(',') ] for cell in row] for row in df_col]

    #create one hot list
    oh_list = list()

    for row in label_encoded:
        for cell in row:
            cell_enc = [0 for _ in range(len(categories))]
            for label in cell:
                cell_enc[label] = 1
            oh_list.append(cell_enc)

    return oh_list 


In [10]:
df_y_enc = ohe_list(df_y, category_list)

In [11]:
df_y_enc = pd.DataFrame(df_y_enc)
df_y_enc

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1,1,1,0,0,0,0,0,0,0
1,0,1,0,1,0,0,0,0,0,0
2,0,1,1,0,0,0,0,0,0,0
3,0,1,0,1,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
8167,0,1,0,0,0,1,0,0,0,1
8168,0,1,0,0,0,1,0,0,0,1
8169,0,1,0,0,0,1,0,0,0,1
8170,0,1,0,0,0,1,0,0,0,1


In [12]:
def get_model(n_inputs, n_outputs):
	model = Sequential()
	model.add(Dense(20, input_dim=n_inputs, kernel_initializer='he_uniform', activation='relu'))
	model.add(Dense(n_outputs, activation='sigmoid'))
	model.compile(loss='binary_crossentropy', optimizer='adam')
	return model

In [15]:
def evaluate_model(X, y):
    results = list()
    n_inputs, n_outputs = X.shape[1], y.shape[1]

    #defining evaluation precedure
    cross_val = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
    for train, test in cross_val.split(X):
        X_train, X_test = X.loc[train, X.columns], X.loc[test, X.columns]
        y_train, y_test = y.loc[train, y.columns], y.loc[test, y.columns]

        model = get_model(n_inputs, n_outputs)
        model_history = model.fit(X_train, y_train, epochs=10)

        y_hat = model.predict(X_test)
        y_hat = y_hat.round()

        acc = accuracy_score(y_test, y_hat)
        print('\n >%.3f \n\n' %acc)
        results.append(acc)
    return results, model_history
        


In [None]:
results, model_hist = evaluate_model(df, df_y_enc)
print('Accuracy: %.3f (%.3f)' % (mean(results), std(results)))

#avg accuracy is 30% (really poor)