In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [7]:
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation and data-related warnings from
# pandas/sklearn — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from collections import Counter
from functools import partial
from tqdm.cli import tqdm
from pathlib import Path
from sklearn.model_selection import train_test_split

from src.utils import fprint
from src.utils import percent_of
from src.utils import mape
from src.utils import create_submit_df

In [3]:
# Input CSVs live under `data/`; `submissions/` is kept as a separate folder path.
data_folder = Path("data")
submissions_folder = Path("submissions")

# Each file is read with its first column used as the index.
train_df = pd.read_csv(data_folder / "train.csv", index_col=0)
test_df = pd.read_csv(data_folder / "test_no_target.csv", index_col=0)
zipcodes_df = pd.read_csv(data_folder / "zipcodes.csv", index_col=0)

# Left-join the zipcode lookup onto both frames. The lookup is reduced to one
# row per zipcode first so the join cannot multiply rows, and the original
# index is moved into a regular `index` column before merging.
train_df = train_df.reset_index().merge(
    zipcodes_df.drop_duplicates("zipcode"),
    on="zipcode",
    how="left",
)
test_df = test_df.reset_index().merge(
    zipcodes_df.drop_duplicates("zipcode"),
    on="zipcode",
    how="left",
)

In [4]:
# Names of the columns that are handled as categorical features.
cat_features = [
    "type",
    "gearbox",
    "model",
    "fuel",
    "brand",
    "city",
]

In [5]:
# Pass the missing-value mask of `insurance_price` to the project helper
# `percent_of` (presumably it returns the share of True values — confirm in src.utils).
percent_of(train_df["insurance_price"].isna())

0.14662

In [6]:
# Largest registration year present in the training data.
train_df["registration_year"].max()

2016

In [10]:
# Hold out a validation part of the training data (sklearn's default split
# sizes are used). A fixed seed makes the split reproducible across kernel
# restarts; without it every run produces a different partition.
SPLIT_SEED = 42

# NOTE: this cell rebinds `train_df` to the training part only, so running it
# more than once splits the already reduced frame again. Re-run the data
# loading cell before re-running this one.
train_df, val_df = train_test_split(train_df, random_state=SPLIT_SEED)

In [13]:
# Display the merged test frame; the recorded output is truncated to the
# first and last rows.
test_df

Unnamed: 0,index,engine_capacity,type,registration_year,gearbox,power,model,mileage,fuel,brand,damage,zipcode,insurance_price,city,latitude,longitude
0,60314,1.6,small car,2013,manual,136,swift,40000,gasoline,suzuki,0.0,30449,490.0,Hannover,52.384470,9.726930
1,12566,,coupé,2004,auto,333,6er,150000,gasoline,bmw,0.0,45307,670.0,"Essen, Ruhr",51.462488,7.008645
2,17760,,station wagon,2006,auto,170,e_klasse,150000,diesel,mercedes_benz,0.0,59494,460.0,"Soest, Westfalen",51.566980,8.110620
3,8876,,limousine,99,manual,101,astra,150000,gasoline,opel,,25524,,Heiligenstedtenerkamp,53.899874,9.468966
4,80392,,limousine,1975,manual,54,andere,150000,diesel,mercedes_benz,0.0,70794,1110.0,Filderstadt,48.666700,9.216670
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,93878,1.4,limousine,1999,manual,86,corolla,150000,gasoline,toyota,0.0,44339,110.0,Dortmund,51.504225,7.483654
49996,99783,,station wagon,2002,auto,184,3er,150000,diesel,bmw,0.0,47574,240.0,Goch,51.679880,6.156480
49997,57399,,small car,2005,manual,52,fox,100000,gasoline,volkswagen,0.0,50389,60.0,"Wesseling, Rheinland",50.833300,6.983330
49998,97106,,bus,2001,manual,151,transporter,150000,diesel,volkswagen,0.0,12209,930.0,Berlin Steglitz-Zehlendorf,52.443640,13.229080


In [15]:
# (rows, columns) of the validation and test frames.
val_df.shape, test_df.shape

((9375, 17), (50000, 16))

In [24]:

    
# Pass the validation and test frames through `zip_dataframes` and back
# through `unzip_dataframes`, unpacking the result into two frames.
# NOTE(review): neither helper is imported or defined in the cells visible
# here — presumably they come from a removed cell or from src.utils. Confirm,
# otherwise this cell raises NameError on a fresh kernel.
unzip_val_df, unzip_test_df = unzip_dataframes(zip_dataframes(val_df, test_df))

In [27]:
# (rows, columns) of the two frames returned by the zip/unzip round trip.
unzip_val_df.shape, unzip_test_df.shape

((9375, 17), (50000, 17))

In [28]:
# First five rows of the validation frame.
# NOTE(review): the recorded output contains a `df_order` column, which
# suggests an earlier cell modified `val_df` in place — confirm whether that
# side effect is intended.
val_df.head()

Unnamed: 0,index,engine_capacity,type,registration_year,gearbox,power,model,mileage,fuel,brand,damage,zipcode,insurance_price,price,city,latitude,longitude,df_order
9794,48812,2.0,bus,2005,auto,140,touran,150000,diesel,volkswagen,1.0,89155,420.0,3458,Erbach (Donau),48.3333,9.88333,0
25077,25402,,limousine,2003,manual,143,c_klasse,150000,gasoline,mercedes_benz,0.0,65589,390.0,4823,"Hadamar, Westerwald",50.45,8.05,0
49606,21276,,,2000,manual,101,zafira,150000,,opel,0.0,47137,60.0,818,Duisburg,51.43511,6.76301,0
4715,17797,2.5,limousine,2002,auto,179,a8,150000,diesel,audi,0.0,48268,120.0,2548,"Greven, Westfalen",52.096678,7.616927,0
26701,65073,,limousine,2001,manual,192,3er,150000,liquefied petroleum gas,bmw,0.0,35713,430.0,3822,Eschenburg,50.8083,8.35833,0


In [29]:
# First five rows of the validation frame recovered from the round trip,
# for comparison with the original `val_df`.
unzip_val_df.head()

Unnamed: 0,index,engine_capacity,type,registration_year,gearbox,power,model,mileage,fuel,brand,damage,zipcode,insurance_price,price,city,latitude,longitude
9794,48812,2.0,bus,2005,auto,140,touran,150000,diesel,volkswagen,1.0,89155,420.0,3458.0,Erbach (Donau),48.3333,9.88333
25077,25402,,limousine,2003,manual,143,c_klasse,150000,gasoline,mercedes_benz,0.0,65589,390.0,4823.0,"Hadamar, Westerwald",50.45,8.05
49606,21276,,,2000,manual,101,zafira,150000,,opel,0.0,47137,60.0,818.0,Duisburg,51.43511,6.76301
4715,17797,2.5,limousine,2002,auto,179,a8,150000,diesel,audi,0.0,48268,120.0,2548.0,"Greven, Westfalen",52.096678,7.616927
26701,65073,,limousine,2001,manual,192,3er,150000,liquefied petroleum gas,bmw,0.0,35713,430.0,3822.0,Eschenburg,50.8083,8.35833
