Columns description

    Id (str)
    city
    price
    year: Year of manufacturing
    manufacturer: Manufacturer of vehicle
    make: Model of vehicle
    condition: Vehicle condition
    cylinders: Number of cylinders
    fuel: Type of fuel required
    odometer: Miles traveled
    title_status: Title status (e.g. clean, missing, etc.)
    transmission: Type of transmission
    drive: Drive of vehicle
    size: Size of vehicle
    type: Type of vehicle
    paint_color: Color of vehicle
    lat: Latitude of listing
    long: Longitude of listing
    county_fips: Federal Information Processing Standards code
    county_name: County of listing
    state_fips: Federal Information Processing Standards code
    state_code: letter state code
    state_name: State name
    weather: Historical average


In [195]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [196]:
cars_prices=pd.read_csv("..\data\cars_train.csv")
cars_prices_test=pd.read_csv("..\data\cars_test.csv")

In [197]:
for column in cars_prices.columns:
    if cars_prices[column].dtype=='object':
        print(column)
        print(cars_prices[column].value_counts())

city
lasvegas         2270
sfbay            2259
nashville        2251
miami            2245
boise            2243
indianapolis     2241
cosprings        2238
anchorage        2238
sacramento       2232
orlando          2227
grandrapids      2219
memphis          2218
madison          2207
inlandempire     2203
kansascity       2198
bakersfield      2197
charlotte        2189
columbus         2189
nh               2182
minneapolis      2179
oklahomacity     2177
milwaukee        2176
hartford         2172
tampa            2170
akroncanton      2167
chicago          2163
omaha            2162
losangeles       2159
desmoines        2157
orangecounty     2157
                 ... 
reddeer            32
regina             32
londonon           31
cornwall           30
juneau             29
kingston           28
chihuahua          25
peterborough       24
soo                23
hermosillo         21
juarez             21
peace              21
newbrunswick       20
lethbridge         19
brant

In [198]:
def remove_columns(data,treshold):
    for column in data.columns:
        n_rows=len(data[column])
        n_nulls=data[column].isna().sum()
        percentage_nulls=(n_nulls/n_rows)*100
        if percentage_nulls>treshold:
            data.drop(labels=column,axis=1,inplace=True)
    return data
cars_prices=remove_columns(cars_prices,60)
cars_prices.columns

Index(['Id', 'city', 'year', 'manufacturer', 'make', 'condition', 'cylinders',
       'fuel', 'odometer', 'title_status', 'transmission', 'drive', 'type',
       'paint_color', 'lat', 'long', 'county_fips', 'county_name',
       'state_fips', 'state_code', 'state_name', 'weather', 'price'],
      dtype='object')

In [199]:
#Quitamos Outliers precio
q_25=cars_prices.price.quantile(0.05)
q_75=cars_prices.price.quantile(0.95)
cars_prices=cars_prices[(cars_prices.price>=q_25) & (cars_prices.price<=q_75) ]

In [200]:
#Rellenamos los valores de kilometraje que faltan aplicando un modelo sencillo.
def predict_odometer(cars_prices):
    df_odo=cars_prices.dropna()
    X=df_odo[['year']]
    y=df_odo['odometer']
    from sklearn.model_selection import train_test_split

    X_train, X_test, Y_train,Y_test=train_test_split(X,y,test_size=0.33,random_state=1)

    from sklearn.linear_model import LinearRegression
    reg_odometer = LinearRegression(normalize=True).fit(X_train, Y_train)
    
    reg_odometer.fit(X, y)
    y_real=reg_odometer.predict(np.array(cars_prices['year'].fillna(method='ffill')).reshape(-1,1))
    
    return pd.Series(y_real)

cars_prices['odometer'].fillna(predict_odometer(cars_prices),inplace=True)
cars_prices_test['odometer'].fillna(predict_odometer(cars_prices_test),inplace=True)

In [201]:
cars_prices['odometer'].isna().value_counts()

False    409729
True      13305
Name: odometer, dtype: int64

In [202]:
cars_prices.condition.replace({'like new':'excellent','fair':'good'},inplace=True)
cars_prices_test.condition.replace({'like new':'excellent','fair':'good'},inplace=True)
cars_prices.condition.value_counts()

excellent    138526
good         114359
new            1316
salvage         831
Name: condition, dtype: int64

In [203]:
pd.Series(cars_prices.condition.unique()).dropna()

0    excellent
2         good
3      salvage
4          new
dtype: object

In [204]:
def predict_condition(cars_prices):
    from sklearn import preprocessing
    df_odo=cars_prices.dropna()

    le = preprocessing.LabelEncoder()
    le.fit(df_odo.condition.unique())
    df_odo.condition=le.transform(df_odo.condition) 

    #Entrenamos el modelo

    X=df_odo[['year','odometer']]
    y=df_odo['condition']

    from sklearn.model_selection import train_test_split

    X_train, X_test, Y_train,Y_test=train_test_split(X,y,test_size=0.33,random_state=1)
    from sklearn.linear_model import LinearRegression
    reg_condition = LinearRegression(normalize=True).fit(X_train, Y_train)

    #Labelizamos

    from sklearn import preprocessing
    cars_prices.condition.fillna('desconocido',inplace=True)
    le = preprocessing.LabelEncoder()
    le.fit(cars_prices.condition.unique())
    cars_prices.condition=le.transform(cars_prices.condition) 

    #Aplicamos modelo

    condition=[]
    for row in cars_prices[['condition','year','odometer']].values:
        test_X = np.array(row[1:]).reshape(-1,1)
        try:
            predict=reg.predict(test_X)[0]
        except:
            predict=row[0]
        condition.append(predict)
    return condition
    
cars_prices['condition']=predict_condition(cars_prices)
cars_prices_test['condition']=predict_condition(cars_prices_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [205]:
def process_title_status(cars_prices):
    cars_prices['title_status'].fillna('other',inplace=True)
    cars_prices['title_status'].replace({'clean':'good','rebuilt':'regular','lien':'bad','missing':'bad','parts only':'bad','salvage':'bad'},inplace=True)
    from sklearn import preprocessing

    le = preprocessing.LabelEncoder()
    le.fit(cars_prices['title_status'].unique())
    cars_prices['title_status']=le.transform(cars_prices['title_status'])
    return cars_prices

cars_prices=process_title_status(cars_prices)
cars_prices_test=process_title_status(cars_prices_test)

In [206]:
cars_country=pd.read_csv("..\data\cars_manufacturers.csv")

def add_manufacturer_country(cars_country,cars_prices):
    
    cars_prices=pd.merge(cars_prices, cars_country, on='manufacturer', how='left')
    cars_prices=pd.get_dummies(cars_prices, prefix=['country'], columns=['country'])
    return cars_prices

cars_prices=add_manufacturer_country(cars_country,cars_prices)
cars_prices_test=add_manufacturer_country(cars_country,cars_prices_test)

In [207]:

cars_prices_test['title_status'].isna().value_counts()

False    253073
Name: title_status, dtype: int64

In [208]:
cars_prices_test.head()

Unnamed: 0,Id,city,year,manufacturer,make,condition,cylinders,fuel,odometer,title_status,...,state_code,state_name,weather,country_germany,country_italy,country_japan,country_south korea,country_sweden,country_united kingdom,country_united states
0,974298,duluth,2006.0,ford,f-250 super duty,2.0,8 cylinders,gas,154400.0,1,...,MN,Minnesota,43.0,0,0,0,0,0,0,1
1,1051884,kansascity,1987.0,chevrolet,,0.0,,gas,169914.138187,1,...,MO,Missouri,52.0,0,0,0,0,0,0,1
2,684464,palmsprings,2010.0,jeep,liberty sport,0.0,6 cylinders,gas,127722.0,1,...,CA,California,59.0,0,0,0,0,0,0,1
3,1255387,sanmarcos,2003.0,chevrolet,tahoe,2.0,8 cylinders,gas,131962.455018,1,...,TX,Texas,67.0,0,0,0,0,0,0,1
4,1195520,tampa,2006.0,lexus,gs 300,0.0,,gas,124846.514424,1,...,FL,Florida,65.0,0,0,1,0,0,0,0


In [209]:
#Eliminamos los nulos
cars_prices.dropna(inplace=True)

In [210]:
def get_n_cylinders(cylinders):
    cylinders=cylinders.replace('other','0 cylinders')
    cylinders.fillna('0 cylinders',inplace=True)
    n_cylinders=[int(x[0]) for x in cylinders.str.split(" ")]
    return n_cylinders

cars_prices.cylinders=get_n_cylinders(cars_prices.cylinders)
cars_prices_test.cylinders=get_n_cylinders(cars_prices_test.cylinders)

In [213]:

from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(pd.Series(cars_prices.condition.unique()).dropna())
cars_prices.condition=le.transform(cars_prices.condition)



In [214]:
cars_prices.corr()

Unnamed: 0,Id,year,condition,cylinders,odometer,title_status,lat,long,county_fips,state_fips,weather,price,country_germany,country_italy,country_japan,country_south korea,country_sweden,country_united kingdom,country_united states
Id,1.0,0.020121,0.008713,-0.019492,0.00372,-0.001975,-0.01127,0.134658,0.231966,0.231536,-0.026383,-0.014233,-0.00471,-0.001784,-0.00015,0.012228,0.001318,-0.002222,0.002878
year,0.020121,1.0,-0.313709,-0.257482,-0.131695,0.050838,-0.006162,-0.007659,-0.001921,-0.002172,0.000483,0.355028,0.020476,0.007026,0.111391,0.105848,-0.005659,0.000603,-0.113877
condition,0.008713,-0.313709,1.0,0.108637,0.160179,-0.026568,0.014879,0.091682,0.010425,0.010871,-0.017474,-0.398429,-0.025948,-0.008634,-0.035666,-0.044821,0.01238,-0.002499,0.044671
cylinders,-0.019492,-0.257482,0.108637,1.0,0.090223,-0.039036,0.003068,-0.001878,0.009352,0.009272,-0.019359,0.094673,-0.084435,-0.038469,-0.318082,-0.153633,-0.031369,-0.010403,0.309918
odometer,0.00372,-0.131695,0.160179,0.090223,1.0,-0.027847,-0.002017,0.024212,0.002087,0.002127,-0.002164,-0.22906,-0.026226,-0.014669,-0.003553,-0.04384,0.01758,-0.014194,0.0221
title_status,-0.001975,0.050838,-0.026568,-0.039036,-0.027847,1.0,-0.009676,0.024444,0.02932,0.029273,0.002665,-0.020556,0.002213,0.001249,0.03615,0.00902,-0.001269,-0.006452,-0.027314
lat,-0.01127,-0.006162,0.014879,0.003068,-0.002017,-0.009676,1.0,-0.183161,0.196001,0.197683,-0.867291,0.003362,-0.012944,-0.009713,-0.03439,-0.008698,0.003361,-0.012779,0.036095
long,0.134658,-0.007659,0.091682,-0.001878,0.024212,0.024444,-0.183161,1.0,0.253866,0.25374,0.056018,-0.111909,-0.018088,-0.002171,-0.011268,0.009135,0.016405,-0.005616,0.019198
county_fips,0.231966,-0.001921,0.010425,0.009352,0.002087,0.02932,0.196001,0.253866,1.0,0.99998,-0.147591,-0.016779,-0.024969,0.000146,-0.015444,0.009841,0.001096,-0.00558,0.027448
state_fips,0.231536,-0.002172,0.010871,0.009272,0.002127,0.029273,0.197683,0.25374,0.99998,1.0,-0.149109,-0.017017,-0.024913,0.000123,-0.015387,0.009812,0.001127,-0.005564,0.027378


In [215]:
PARAMETROS=['year','odometer','title_status','long','cylinders']
X=cars_prices[PARAMETROS]
y=cars_prices['price']

In [216]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train,Y_test=train_test_split(X,y,test_size=0.33,random_state=1)

In [217]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression


In [218]:
#reg = LinearRegression(normalize=True).fit(X_train, Y_train)
reg = RandomForestRegressor(max_depth=3, random_state=5,n_estimators=100).fit(X_train, Y_train)
#reg.fit(X_train, Y_train)

In [219]:
y_pre=reg.predict(X_test)

In [220]:
from sklearn.metrics import mean_squared_error
mean_squared_error(y_pre,Y_test )

25502104.748694878

In [221]:
cars_prices.corr()

Unnamed: 0,Id,year,condition,cylinders,odometer,title_status,lat,long,county_fips,state_fips,weather,price,country_germany,country_italy,country_japan,country_south korea,country_sweden,country_united kingdom,country_united states
Id,1.0,0.020121,0.008713,-0.019492,0.00372,-0.001975,-0.01127,0.134658,0.231966,0.231536,-0.026383,-0.014233,-0.00471,-0.001784,-0.00015,0.012228,0.001318,-0.002222,0.002878
year,0.020121,1.0,-0.313709,-0.257482,-0.131695,0.050838,-0.006162,-0.007659,-0.001921,-0.002172,0.000483,0.355028,0.020476,0.007026,0.111391,0.105848,-0.005659,0.000603,-0.113877
condition,0.008713,-0.313709,1.0,0.108637,0.160179,-0.026568,0.014879,0.091682,0.010425,0.010871,-0.017474,-0.398429,-0.025948,-0.008634,-0.035666,-0.044821,0.01238,-0.002499,0.044671
cylinders,-0.019492,-0.257482,0.108637,1.0,0.090223,-0.039036,0.003068,-0.001878,0.009352,0.009272,-0.019359,0.094673,-0.084435,-0.038469,-0.318082,-0.153633,-0.031369,-0.010403,0.309918
odometer,0.00372,-0.131695,0.160179,0.090223,1.0,-0.027847,-0.002017,0.024212,0.002087,0.002127,-0.002164,-0.22906,-0.026226,-0.014669,-0.003553,-0.04384,0.01758,-0.014194,0.0221
title_status,-0.001975,0.050838,-0.026568,-0.039036,-0.027847,1.0,-0.009676,0.024444,0.02932,0.029273,0.002665,-0.020556,0.002213,0.001249,0.03615,0.00902,-0.001269,-0.006452,-0.027314
lat,-0.01127,-0.006162,0.014879,0.003068,-0.002017,-0.009676,1.0,-0.183161,0.196001,0.197683,-0.867291,0.003362,-0.012944,-0.009713,-0.03439,-0.008698,0.003361,-0.012779,0.036095
long,0.134658,-0.007659,0.091682,-0.001878,0.024212,0.024444,-0.183161,1.0,0.253866,0.25374,0.056018,-0.111909,-0.018088,-0.002171,-0.011268,0.009135,0.016405,-0.005616,0.019198
county_fips,0.231966,-0.001921,0.010425,0.009352,0.002087,0.02932,0.196001,0.253866,1.0,0.99998,-0.147591,-0.016779,-0.024969,0.000146,-0.015444,0.009841,0.001096,-0.00558,0.027448
state_fips,0.231536,-0.002172,0.010871,0.009272,0.002127,0.029273,0.197683,0.25374,0.99998,1.0,-0.149109,-0.017017,-0.024913,0.000123,-0.015387,0.009812,0.001127,-0.005564,0.027378


In [222]:
#X_submision=np.array(cars_prices_test[['year','odometer']].dropna()).reshape(-1, 1)
X_submision=cars_prices_test[PARAMETROS]

In [223]:
X_submision.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 253073 entries, 0 to 253072
Data columns (total 5 columns):
year            252162 non-null float64
odometer        253073 non-null float64
title_status    253073 non-null int32
long            253073 non-null float64
cylinders       253073 non-null int64
dtypes: float64(3), int32(1), int64(1)
memory usage: 10.6 MB


In [224]:
ids=cars_prices_test['Id']
X_submission=cars_prices_test[PARAMETROS]

In [225]:
X_submission.fillna(method='ffill',inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


In [226]:
predict=pd.Series(reg.predict(X_submission))

submision=pd.concat([ids, predict], axis=1)

In [227]:
submision.columns=['id','price']

In [228]:
submision.to_csv(path_or_buf='../output/submision.csv',header=True,index=False)