In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
#load the full used-car listing table and show it
csv_path = 'completeData.csv'
data = pd.read_csv(csv_path)
data

Unnamed: 0,make,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,audi,A1,2017,12500,Manual,15735,Petrol,150,55.4,1.4
1,audi,A6,2016,16500,Automatic,36203,Diesel,20,64.2,2.0
2,audi,A1,2016,11000,Manual,29946,Petrol,30,55.4,1.4
3,audi,A4,2017,16800,Automatic,25952,Diesel,145,67.3,2.0
4,audi,A3,2019,17300,Manual,1998,Petrol,145,49.6,1.0
...,...,...,...,...,...,...,...,...,...,...
97438,vw,Eos,2012,5990,Manual,74000,Diesel,125,58.9,2.0
97439,vw,Fox,2008,1799,Manual,88102,Petrol,145,46.3,1.2
97440,vw,Fox,2009,1590,Manual,70000,Petrol,200,42.0,1.4
97441,vw,Fox,2006,1250,Manual,82704,Petrol,150,46.3,1.2


In [3]:
#list each column's dtype to see which columns are numeric and which are object (string) typed
data.dtypes

make             object
model            object
year              int64
price             int64
transmission     object
mileage           int64
fuelType         object
tax               int64
mpg             float64
engineSize      float64
dtype: object

In [4]:
#mean and standard deviation of the raw mileage column, shown as a (mean, std) tuple
mileage = data['mileage']
mileage.mean(), mileage.std()

(23224.549459684124, 21069.244114358004)

In [5]:
#report how many distinct values each non-numeric column holds
non_numeric = data.select_dtypes(exclude='number')
for column_name in non_numeric.columns:
    distinct_count = len(non_numeric[column_name].unique())
    print(column_name)
    print(f'unique values: {distinct_count}\n')

make
unique values: 9

model
unique values: 194

transmission
unique values: 4

fuelType
unique values: 5



In [6]:
#make dummies for make, transmission, fuelType
#model will require a different strategy
categorical_cols = ['make', 'transmission', 'fuelType']
#dtype is pinned explicitly: pandas >= 2.0 defaults get_dummies to bool, which would
#export True/False instead of the 1/0 indicator values; uint8 was the older default
dummies = pd.get_dummies(data[categorical_cols], dtype='uint8')
#swap the original categorical columns for their indicator columns
data = data.drop(columns=categorical_cols).join(dummies)
data

Unnamed: 0,model,year,price,mileage,tax,mpg,engineSize,make_audi,make_bmw,make_ford,...,make_vw,transmission_Automatic,transmission_Manual,transmission_Other,transmission_Semi-Auto,fuelType_Diesel,fuelType_Electric,fuelType_Hybrid,fuelType_Other,fuelType_Petrol
0,A1,2017,12500,15735,150,55.4,1.4,1,0,0,...,0,0,1,0,0,0,0,0,0,1
1,A6,2016,16500,36203,20,64.2,2.0,1,0,0,...,0,1,0,0,0,1,0,0,0,0
2,A1,2016,11000,29946,30,55.4,1.4,1,0,0,...,0,0,1,0,0,0,0,0,0,1
3,A4,2017,16800,25952,145,67.3,2.0,1,0,0,...,0,1,0,0,0,1,0,0,0,0
4,A3,2019,17300,1998,145,49.6,1.0,1,0,0,...,0,0,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97438,Eos,2012,5990,74000,125,58.9,2.0,0,0,0,...,1,0,1,0,0,1,0,0,0,0
97439,Fox,2008,1799,88102,145,46.3,1.2,0,0,0,...,1,0,1,0,0,0,0,0,0,1
97440,Fox,2009,1590,70000,200,42.0,1.4,0,0,0,...,1,0,1,0,0,0,0,0,0,1
97441,Fox,2006,1250,82704,150,46.3,1.2,0,0,0,...,1,0,1,0,0,0,0,0,0,1


In [7]:
#create column for average model price
#set equal to average model price not counting current row (leave-one-out mean),
#so a row's own price never feeds its own feature
#NOTE(review): the averages still use prices from every row, including rows that the
#later train/test split assigns to the test set
model_prices = data.groupby('model')['price']
model_price_sum = model_prices.transform('sum')
model_row_count = model_prices.transform('count')

#number of *other* rows sharing the model; NaN where there are none so the division
#below yields NaN instead of 0/0
peer_count = (model_row_count - 1).where(model_row_count > 1)

#fallback for rows with no same-model peers (a model that appears only once):
#the average price of all other rows, so no NaN ends up in the exported features
overall_loo_mean = (data['price'].sum() - data['price']) / max(len(data) - 1, 1)

data['avgModelPrice'] = ((model_price_sum - data['price']) / peer_count).fillna(overall_loo_mean)

#model has been encoded numerically; the raw string column is no longer needed
data = data.drop(columns='model')
data.head()

Unnamed: 0,year,price,mileage,tax,mpg,engineSize,make_audi,make_bmw,make_ford,make_hyundai,...,transmission_Automatic,transmission_Manual,transmission_Other,transmission_Semi-Auto,fuelType_Diesel,fuelType_Electric,fuelType_Hybrid,fuelType_Other,fuelType_Petrol,avgModelPrice
0,2017,12500,15735,150,55.4,1.4,1,0,0,0,...,0,1,0,0,0,0,0,0,1,14272.146067
1,2016,16500,36203,20,64.2,2.0,1,0,0,0,...,1,0,0,0,1,0,0,0,0,22703.678715
2,2016,11000,29946,30,55.4,1.4,1,0,0,0,...,0,1,0,0,0,0,0,0,1,14273.269663
3,2017,16800,25952,145,67.3,2.0,1,0,0,0,...,1,0,0,0,1,0,0,0,0,20248.330663
4,2019,17300,1998,145,49.6,1.0,1,0,0,0,...,0,1,0,0,0,0,0,0,1,17419.136197


In [8]:
#NOTE: fitting the scaler before the train/test split lets test-set statistics leak
#into the training features. The directions ask for this order, so it is kept here;
#TODO confirm whether that is intended.

#standardize mileage, tax, mpg, engineSize
#(price, year, avgModelPrice and the dummy columns are left unscaled)
scale_cols = ['mileage', 'tax', 'mpg', 'engineSize']
sclr = StandardScaler()
#StandardScaler standardizes every column independently, so a single fit over all four
#columns gives the same values as fitting one column at a time, and leaves sclr fitted
#on all of them. Whole-column assignment (data[cols] = ...) replaces the int64
#mileage/tax columns with float64 ones rather than writing floats into int64 storage
#through .loc, which newer pandas deprecates.
data[scale_cols] = sclr.fit_transform(data[scale_cols].to_numpy(dtype='float64'))

In [9]:
#splitting at this point feels odd, given that the processing happens in a different notebook
target_col = 'price'
y = data[target_col]
X = data.drop(columns=target_col)
#hold out a third of the rows; fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=191,
)

In [10]:
#write each split to its own CSV file, leaving the row index out
exports = {
    'X_train.csv': X_train,
    'X_test.csv': X_test,
    'y_train.csv': y_train,
    'y_test.csv': y_test,
}
for filename, split in exports.items():
    split.to_csv(filename, index=False)