In [1]:
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

## Read data

In [2]:
# Load the train / test feature tables and the macro table from ./data.
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
macro = pd.read_csv('./data/macro.csv')

# Stack the train features (without the target) on top of the test features.
# The target is dropped by name rather than by position, so this does not
# silently discard a real feature if 'price_doc' is not the last train column.
# NOTE: the index is not reset, so row labels of the train and test parts overlap.
data = pd.concat([train.drop('price_doc', axis=1), test])

# Target on a log scale: log(1 + price). log1p is the dedicated, numerically
# stable form of np.log(x + 1).
train_y = np.log1p(train['price_doc'].values)

print(train.shape, test.shape, data.shape, macro.shape)

(30471, 292) (7662, 291) (38133, 291) (2484, 100)


In [3]:
# Preview the first five rows of the combined frame.
data.head(n=5)

Unnamed: 0,id,timestamp,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,...,cafe_count_5000_price_1500,cafe_count_5000_price_2500,cafe_count_5000_price_4000,cafe_count_5000_price_high,big_church_count_5000,church_count_5000,mosque_count_5000,leisure_count_5000,sport_count_5000,market_count_5000
0,1,2011-08-20,43.0,27.0,4.0,,,,,,...,40,9,4,0,13,22,1,0,52,4
1,2,2011-08-23,34.0,19.0,3.0,,,,,,...,36,15,3,0,15,29,1,10,66,14
2,3,2011-08-27,43.0,29.0,2.0,,,,,,...,25,10,3,0,11,27,0,4,67,10
3,4,2011-09-01,89.0,50.0,9.0,,,,,,...,15,11,2,1,4,4,0,0,26,3
4,5,2011-09-05,77.0,77.0,4.0,,,,,,...,552,319,108,17,135,236,2,91,195,14


In [4]:
# Build a per-column summary table: null count, null ratio, dtype and cardinality.
null_counts = data.isnull().sum(axis=0)
missing = null_counts.reset_index()
missing.columns = ['feature', 'num']
missing['ratio'] = missing['num'] / len(data)
missing['dtype'] = data.dtypes.values

# Number of distinct values per column; unique() keeps NaN, so a column with
# missing entries counts NaN as one of its distinct values.
unique_values = [len(data[col].unique()) for col in data.columns]
missing['unique'] = unique_values

missing

Unnamed: 0,feature,num,ratio,dtype,unique
0,id,0,0.000000,int64,38133
1,timestamp,0,0.000000,object,1435
2,full_sq,0,0.000000,float64,1404
3,life_sq,7559,0.198227,float64,976
4,floor,167,0.004379,float64,43
5,max_floor,9572,0.251016,float64,51
6,material,9572,0.251016,float64,7
7,build_year,14654,0.384287,float64,128
8,num_room,9572,0.251016,float64,14
9,kitch_sq,9572,0.251016,float64,282


In [5]:
# Drop every feature whose share of missing values exceeds the threshold.
MISSING_RATIO_THRESHOLD = 0.05  # features with more than 5% missing rows are discarded
drops = missing.loc[missing['ratio'] > MISSING_RATIO_THRESHOLD, 'feature'].values

# errors='ignore' keeps this cell re-runnable: `data` is overwritten here, so on a
# second run the columns listed in `drops` are already gone and a plain drop
# would raise KeyError.
data = data.drop(drops, axis=1, errors='ignore')
data.head()

Unnamed: 0,id,timestamp,full_sq,floor,product_type,sub_area,area_m,raion_popul,green_zone_part,indust_part,...,cafe_count_5000_price_1500,cafe_count_5000_price_2500,cafe_count_5000_price_4000,cafe_count_5000_price_high,big_church_count_5000,church_count_5000,mosque_count_5000,leisure_count_5000,sport_count_5000,market_count_5000
0,1,2011-08-20,43.0,4.0,Investment,Bibirevo,6407578.0,155572,0.189727,7e-05,...,40,9,4,0,13,22,1,0,52,4
1,2,2011-08-23,34.0,3.0,Investment,Nagatinskij Zaton,9589337.0,115352,0.372602,0.049637,...,36,15,3,0,15,29,1,10,66,14
2,3,2011-08-27,43.0,2.0,Investment,Tekstil'shhiki,4808270.0,101708,0.11256,0.118537,...,25,10,3,0,11,27,0,4,67,10
3,4,2011-09-01,89.0,9.0,Investment,Mitino,12583540.0,178473,0.194703,0.069753,...,15,11,2,1,4,4,0,0,26,3
4,5,2011-09-05,77.0,4.0,Investment,Basmannoe,8398461.0,108171,0.015234,0.037316,...,552,319,108,17,135,236,2,91,195,14


In [7]:
# Count the nulls still left in each column after the sparse features were dropped.
data.isna().sum(axis=0).reset_index()

Unnamed: 0,index,0
0,id,0
1,timestamp,0
2,full_sq,0
3,floor,167
4,product_type,33
5,sub_area,0
6,area_m,0
7,raion_popul,0
8,green_zone_part,0
9,indust_part,0
