In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.preprocessing import LabelEncoder

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

sns.set_palette("GnBu_d")
sns.set_style('whitegrid')

In [29]:
df = pd.read_csv("data/autos.csv", header=0, sep=',', encoding='Latin1',)

In [30]:
print(df.seller.value_counts())
df[df.seller !='gewerblish']
df=df.drop('seller',1)
print(df.offerType.value_counts())
df[df.offerType != 'Gesuch']
df=df.drop('offerType',1)

privat        371525
gewerblich         3
Name: seller, dtype: int64
Angebot    371516
Gesuch         12
Name: offerType, dtype: int64


In [31]:
print(df.shape)
df=df[(df.powerPS > 50) & (df.powerPS < 900)]
print(df.shape)
df = df[(df.yearOfRegistration >=1950) & (df.yearOfRegistration < 2017)]
print(df.shape)

(371528, 18)
(319709, 18)
(309171, 18)


In [32]:
df.drop(['name','abtest', 'dateCrawled', 'nrOfPictures','lastSeen','postalCode','dateCreated'], axis='columns', inplace=True)

In [33]:
new_df = df.copy()
new_df = new_df.drop_duplicates(['price', 'vehicleType', 'yearOfRegistration', 'gearbox','powerPS', 'model', 'kilometer','monthOfRegistration','fuelType','notRepairedDamage'])


In [34]:
new_df.gearbox.replace(('manuell','automatik'), ('manual','automatic'), inplace=True)
new_df.fuelType.replace(('benzin','andere','elektro'),('petrol','others','electric'),inplace=True)
new_df.vehicleType.replace(('kleinwagen','cabrio','kombi','andere'),('small car','convertible','combination','others'),inplace=True)
new_df.notRepairedDamage.replace(('ja','nein'),('Yes','No'), inplace=True)

In [35]:
new_df = new_df[(new_df.price >=100) & (new_df.price<= 150000)]


In [36]:
new_df['notRepairedDamage'].fillna(value='not-declared', inplace=True)
new_df['fuelType'].fillna(value='not-declared', inplace=True)
new_df['gearbox'].fillna(value='not-declared', inplace=True)
new_df['vehicleType'].fillna(value='not-declared', inplace=True)
new_df['model'].fillna(value='not-declared', inplace=True)

In [37]:
new_df.to_csv("autos_preprocessed.csv")

In [38]:
labels = ['gearbox', 'notRepairedDamage', 'model', 'brand', 'fuelType', 'vehicleType']

In [39]:
mapper = {}
for i in labels:
    mapper[i] = LabelEncoder()
    mapper[i].fit(new_df[i])
    tr = mapper[i].transform(new_df[i])
    np.save(str('classes'+i+'.npy'), mapper[i].classes_)
    print(i,":",mapper[i])
    new_df.loc[:,i + '_labels'] = pd.Series(tr, index=new_df.index)
    

gearbox : LabelEncoder()
notRepairedDamage : LabelEncoder()
model : LabelEncoder()
brand : LabelEncoder()
fuelType : LabelEncoder()
vehicleType : LabelEncoder()


In [40]:
labeled = new_df[['price'
                 ,'yearOfRegistration'
                 ,'powerPS'
                 ,'kilometer'
                 ,'monthOfRegistration'
                 ]
                +[x+"_labels" for x in labels]]

In [41]:
print(labeled.columns)

Index(['price', 'yearOfRegistration', 'powerPS', 'kilometer',
       'monthOfRegistration', 'gearbox_labels', 'notRepairedDamage_labels',
       'model_labels', 'brand_labels', 'fuelType_labels',
       'vehicleType_labels'],
      dtype='object')
