In [1]:
import numpy as np
import pandas as pd

#### Задание

Ваша задача этого соревнования - предсказать цены на квартиры в датасете test.csv. Вам будут даны два датасета: train.csv (только признаки и цены на квартиры) и test.csv (только признаки).

#### Описание файлов

- **train.csv** - тренировочный набоp  
- **test.csv** - тестовый набор  
- **sample_submission.csv** - файл с отправкой образца в правильном формате

#### Поля данных

- **Id** - идентификационный номер квартиры  
- **DistrictId** - идентификационный номер района  
- **Rooms** - количество комнат
- **Square** - площадь  
- **LifeSquare** - жилая площадь  
- **KitchenSquare** - площадь кухни  
- **Floor** - этаж  
- **HouseFloor** - количество этажей в доме  
- **HouseYear** - год постройки дома  
- **Ecology_1, Ecology_2, Ecology_3** - экологические показатели местности  
- **Social_1, Social_2, Social_3** - социальные показатели местности  
- **Healthcare_1, Helthcare_2** - показатели местности, связанные с охраной здоровьем  
- **Shops_1, Shops_2** - показатели, связанные с наличием магазинов, торговых центров  
- **Price** - цена квартиры

In [2]:
TRAIN_DATASET_PATH = 'realestatepriceprediction/train.csv'
TEST_DATASET_PATH = 'realestatepriceprediction/test.csv'
PREPARED_TRAIN_DATASET_PATH = 'realestatepriceprediction/prepared_train.csv.csv'

In [3]:
df_train = pd.read_csv(TRAIN_DATASET_PATH)

In [4]:
df_train.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
0,14038,35,2.0,47.981561,29.442751,6.0,7,9.0,1969,0.08904,B,B,33,7976,5,,0,11,B,184966.93073
1,15053,41,3.0,65.68364,40.049543,8.0,7,9.0,1978,7e-05,B,B,46,10309,1,240.0,1,16,B,300009.450063
2,4765,53,2.0,44.947953,29.197612,0.0,8,12.0,1968,0.049637,B,B,34,7759,0,229.0,1,3,B,220925.908524
3,5809,58,2.0,53.352981,52.731512,9.0,8,17.0,1977,0.437885,B,B,23,5735,3,1084.0,0,5,B,175616.227217
4,10783,99,1.0,39.649192,23.776169,7.0,11,12.0,1976,0.012339,B,B,35,5776,1,2078.0,2,4,B,150226.531644


In [5]:
df_train.shape

(10000, 20)

#### Приведение типов данных

In [6]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 20 columns):
Id               10000 non-null int64
DistrictId       10000 non-null int64
Rooms            10000 non-null float64
Square           10000 non-null float64
LifeSquare       7887 non-null float64
KitchenSquare    10000 non-null float64
Floor            10000 non-null int64
HouseFloor       10000 non-null float64
HouseYear        10000 non-null int64
Ecology_1        10000 non-null float64
Ecology_2        10000 non-null object
Ecology_3        10000 non-null object
Social_1         10000 non-null int64
Social_2         10000 non-null int64
Social_3         10000 non-null int64
Healthcare_1     5202 non-null float64
Helthcare_2      10000 non-null int64
Shops_1          10000 non-null int64
Shops_2          10000 non-null object
Price            10000 non-null float64
dtypes: float64(8), int64(9), object(3)
memory usage: 1.5+ MB


In [7]:
df_train['Id'] = df_train['Id'].astype(str)

In [8]:
df_train['Id'].dtypes

dtype('O')

In [9]:
df_num_features = df_train.select_dtypes(include=['float64', 'int64'])
df_num_features.head()

Unnamed: 0,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Price
0,35,2.0,47.981561,29.442751,6.0,7,9.0,1969,0.08904,33,7976,5,,0,11,184966.93073
1,41,3.0,65.68364,40.049543,8.0,7,9.0,1978,7e-05,46,10309,1,240.0,1,16,300009.450063
2,53,2.0,44.947953,29.197612,0.0,8,12.0,1968,0.049637,34,7759,0,229.0,1,3,220925.908524
3,58,2.0,53.352981,52.731512,9.0,8,17.0,1977,0.437885,23,5735,3,1084.0,0,5,175616.227217
4,99,1.0,39.649192,23.776169,7.0,11,12.0,1976,0.012339,35,5776,1,2078.0,2,4,150226.531644


In [10]:
df_obj_features = df_train.select_dtypes(include='object')
df_obj_features.head()

Unnamed: 0,Id,Ecology_2,Ecology_3,Shops_2
0,14038,B,B,B
1,15053,B,B,B
2,4765,B,B,B
3,5809,B,B,B
4,10783,B,B,B


In [11]:
df_train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
DistrictId,10000.0,50.4008,43.587592,0.0,20.0,36.0,75.0,209.0
Rooms,10000.0,1.8905,0.839512,0.0,1.0,2.0,2.0,19.0
Square,10000.0,56.315775,21.058732,1.136859,41.774881,52.51331,65.900625,641.0652
LifeSquare,7887.0,37.199645,86.241209,0.370619,22.769832,32.78126,45.128803,7480.592
KitchenSquare,10000.0,6.2733,28.560917,0.0,1.0,6.0,9.0,2014.0
Floor,10000.0,8.5267,5.241148,1.0,4.0,7.0,12.0,42.0
HouseFloor,10000.0,12.6094,6.775974,0.0,9.0,13.0,17.0,117.0
HouseYear,10000.0,3990.1663,200500.261427,1910.0,1974.0,1977.0,2001.0,20052010.0
Ecology_1,10000.0,0.118858,0.119025,0.0,0.017647,0.075424,0.195781,0.5218671
Social_1,10000.0,24.687,17.532614,0.0,6.0,25.0,36.0,74.0


In [12]:
df_obj_features['Ecology_2'].value_counts()

B    9903
A      97
Name: Ecology_2, dtype: int64

In [13]:
df_obj_features['Ecology_3'].value_counts()

B    9725
A     275
Name: Ecology_3, dtype: int64

In [14]:
df_obj_features['Shops_2'].value_counts()

B    9175
A     825
Name: Shops_2, dtype: int64

In [15]:
df_train.isnull().sum()

Id                  0
DistrictId          0
Rooms               0
Square              0
LifeSquare       2113
KitchenSquare       0
Floor               0
HouseFloor          0
HouseYear           0
Ecology_1           0
Ecology_2           0
Ecology_3           0
Social_1            0
Social_2            0
Social_3            0
Healthcare_1     4798
Helthcare_2         0
Shops_1             0
Shops_2             0
Price               0
dtype: int64

#### Обработка пропусков

In [16]:
# Заменим null в признаке LifeSquare на значение Square - 20. 
# Приблизительная разница между этими параметрами. Видно по таблице.
df_train.loc[df_train['LifeSquare'].isnull(), 'LifeSquare'] = df_train['Square'] - 20

In [17]:
df_train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
DistrictId,10000.0,50.4008,43.587592,0.0,20.0,36.0,75.0,209.0
Rooms,10000.0,1.8905,0.839512,0.0,1.0,2.0,2.0,19.0
Square,10000.0,56.315775,21.058732,1.136859,41.774881,52.51331,65.900625,641.0652
LifeSquare,10000.0,37.865874,77.315863,-2.585819,22.931603,33.3019,45.835528,7480.592
KitchenSquare,10000.0,6.2733,28.560917,0.0,1.0,6.0,9.0,2014.0
Floor,10000.0,8.5267,5.241148,1.0,4.0,7.0,12.0,42.0
HouseFloor,10000.0,12.6094,6.775974,0.0,9.0,13.0,17.0,117.0
HouseYear,10000.0,3990.1663,200500.261427,1910.0,1974.0,1977.0,2001.0,20052010.0
Ecology_1,10000.0,0.118858,0.119025,0.0,0.017647,0.075424,0.195781,0.5218671
Social_1,10000.0,24.687,17.532614,0.0,6.0,25.0,36.0,74.0


In [18]:
df_train['Healthcare_1'].value_counts()

540.0     511
30.0      348
1046.0    245
750.0     163
229.0     148
         ... 
370.0      14
32.0       12
1815.0     10
35.0        2
0.0         1
Name: Healthcare_1, Length: 79, dtype: int64

In [19]:
# Заменим null в признаке Healthcare_1 на медиану
df_train.loc[df_train['Healthcare_1'].isnull(), 'Healthcare_1'] = df_train['Healthcare_1'].median()

In [20]:
df_train['Healthcare_1'].value_counts()

900.0     4869
540.0      511
30.0       348
1046.0     245
750.0      163
          ... 
370.0       14
32.0        12
1815.0      10
35.0         2
0.0          1
Name: Healthcare_1, Length: 79, dtype: int64

In [21]:
df_train.isnull().sum()

Id               0
DistrictId       0
Rooms            0
Square           0
LifeSquare       0
KitchenSquare    0
Floor            0
HouseFloor       0
HouseYear        0
Ecology_1        0
Ecology_2        0
Ecology_3        0
Social_1         0
Social_2         0
Social_3         0
Healthcare_1     0
Helthcare_2      0
Shops_1          0
Shops_2          0
Price            0
dtype: int64

#### Обработка выбросов

In [22]:
df_train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
DistrictId,10000.0,50.4008,43.587592,0.0,20.0,36.0,75.0,209.0
Rooms,10000.0,1.8905,0.839512,0.0,1.0,2.0,2.0,19.0
Square,10000.0,56.315775,21.058732,1.136859,41.774881,52.51331,65.900625,641.0652
LifeSquare,10000.0,37.865874,77.315863,-2.585819,22.931603,33.3019,45.835528,7480.592
KitchenSquare,10000.0,6.2733,28.560917,0.0,1.0,6.0,9.0,2014.0
Floor,10000.0,8.5267,5.241148,1.0,4.0,7.0,12.0,42.0
HouseFloor,10000.0,12.6094,6.775974,0.0,9.0,13.0,17.0,117.0
HouseYear,10000.0,3990.1663,200500.261427,1910.0,1974.0,1977.0,2001.0,20052010.0
Ecology_1,10000.0,0.118858,0.119025,0.0,0.017647,0.075424,0.195781,0.5218671
Social_1,10000.0,24.687,17.532614,0.0,6.0,25.0,36.0,74.0


In [23]:
# Проверим количество комнат
df_train['Rooms'].value_counts()

2.0     3880
1.0     3705
3.0     2235
4.0      150
5.0       18
0.0        8
10.0       2
19.0       1
6.0        1
Name: Rooms, dtype: int64

Есть несколько домов с количеством комнат, отличающимся от остальных. Проверим все дома с количеством комнат больше 4.

In [24]:
df_train.loc[df_train['Rooms'] > 4]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
377,5927,57,10.0,59.056975,36.223072,10.0,22,22.0,2002,0.090799,B,B,74,19083,2,900.0,5,15,B,317265.323792
1422,4282,195,5.0,95.617533,62.377052,7.0,7,7.0,1940,0.015017,B,B,17,3079,59,2890.0,0,8,B,479525.147323
1454,8491,1,19.0,42.006046,21.779288,7.0,17,17.0,2014,0.007122,B,B,1,264,0,900.0,0,1,B,78364.616704
1503,1243,86,5.0,108.057398,82.723825,8.0,3,5.0,1942,0.161976,B,A,31,7010,5,4508.0,3,7,B,310305.074733
1982,5548,86,5.0,275.645284,233.949309,26.0,12,37.0,2011,0.161976,B,A,31,7010,5,4508.0,3,7,B,455264.882666
2170,14003,99,6.0,59.414334,38.702244,6.0,7,9.0,1969,0.033494,B,B,66,10573,1,1322.0,3,8,B,229661.964416
3268,8901,34,5.0,121.577423,114.913843,1.0,25,1.0,1977,0.069753,B,B,53,13670,4,900.0,1,11,B,455446.393758
3271,6358,27,5.0,42.003293,22.003293,0.0,5,12.0,1977,0.011654,B,B,4,915,0,900.0,0,0,B,112693.641769
4431,7594,53,5.0,39.906082,39.394024,1.0,5,1.0,1977,0.049637,B,B,34,7759,0,229.0,1,3,B,206148.845716
4487,13434,13,5.0,183.382273,100.985505,15.0,2,6.0,1997,0.090799,B,B,74,19083,2,900.0,5,15,B,622888.825328


Дома с количеством комнат 6 10 и 19 явно не соответствуют их площадям.  
Вычислим среднюю площадь одной комнаты и заменим количество комнат в этих домах.

In [25]:
Square_median = df_train['Square'].mean() / df_train['Rooms'].median()
Square_median

28.15788759896778

In [26]:
df_train.loc[df_train['Rooms'] > 5, 'Rooms'] = df_train['Square'] / Square_median

In [27]:
df_train['Rooms'].value_counts()

2.000000    3880
1.000000    3705
3.000000    2235
4.000000     150
5.000000      18
0.000000       8
2.161784       1
1.491804       1
2.110042       1
2.097351       1
Name: Rooms, dtype: int64

8 квартир имеют количество комнат 0. Посмотрим, что это за квартиры.

In [28]:
df_train.loc[df_train['Rooms'] == 0]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
1397,12638,27,0.0,138.427694,136.215499,0.0,4,3.0,2016,0.075424,B,B,11,3097,0,900.0,0,0,B,268394.744389
1981,7917,27,0.0,212.932361,211.231125,0.0,2,3.0,2008,0.211401,B,B,9,1892,0,900.0,0,1,B,302211.260887
2269,7317,27,0.0,41.790881,21.790881,0.0,13,0.0,1977,0.211401,B,B,9,1892,0,900.0,0,1,B,98129.976788
3911,770,28,0.0,49.483501,29.483501,0.0,16,0.0,2015,0.118537,B,B,30,6207,1,1183.0,1,0,B,217009.338463
4366,456,6,0.0,81.491446,61.491446,0.0,4,0.0,1977,0.243205,B,B,5,1564,0,540.0,0,0,B,212864.799112
4853,3224,27,0.0,2.377248,0.873147,0.0,1,0.0,1977,0.017647,B,B,2,469,0,900.0,0,0,B,126596.941798
6149,3159,88,0.0,38.697117,19.345131,9.0,9,16.0,1982,0.127376,B,B,43,8429,3,900.0,3,9,B,158998.110646
8834,9443,27,0.0,87.762616,85.125471,0.0,5,15.0,1977,0.211401,B,B,9,1892,0,900.0,0,1,B,219281.918007


Одина из квартир имеет площадь 2.377248, что не реально. Для остальных квартир Рассчитаем количество комнат исходя из площади и медианы.

In [29]:
df_train.loc[(df_train['Rooms'] == 0) & (df_train['Square'] > Square_median), 'Rooms'] = df_train['Square'] / Square_median

In [30]:
df_train['Rooms'].value_counts()

2.000000    3880
1.000000    3705
3.000000    2235
4.000000     150
5.000000      18
2.097351       1
0.000000       1
1.484163       1
7.562086       1
2.161784       1
3.116804       1
2.110042       1
1.757358       1
1.374290       1
4.916125       1
1.491804       1
2.894089       1
Name: Rooms, dtype: int64

Поскольку комнаты не могут быть дробными - округлим их.

In [31]:
df_train['Rooms'] = np.around(df_train['Rooms'])
df_train['Rooms'].value_counts()

2.0    3884
1.0    3708
3.0    2237
4.0     150
5.0      19
0.0       1
8.0       1
Name: Rooms, dtype: int64

Заменим значение 8 комнат на 5, поскольку квартиры с похожей площадью имеют 5 комнат.

In [32]:
df_train.loc[df_train['Rooms'] == 8, 'Rooms'] = 5
df_train['Rooms'].value_counts()

2.0    3884
1.0    3708
3.0    2237
4.0     150
5.0      20
0.0       1
Name: Rooms, dtype: int64

Осталась одна квартира с маленькой площадью и 0 комнат. Заменим 0 на 1 и проверим есть ли еще квартиры с маленькой площадью.

In [33]:
df_train.loc[df_train['Rooms'] == 0, 'Rooms'] = 1
df_train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
DistrictId,10000.0,50.4008,43.587592,0.0,20.0,36.0,75.0,209.0
Rooms,10000.0,1.8888,0.812589,1.0,1.0,2.0,2.0,5.0
Square,10000.0,56.315775,21.058732,1.136859,41.774881,52.51331,65.900625,641.0652
LifeSquare,10000.0,37.865874,77.315863,-2.585819,22.931603,33.3019,45.835528,7480.592
KitchenSquare,10000.0,6.2733,28.560917,0.0,1.0,6.0,9.0,2014.0
Floor,10000.0,8.5267,5.241148,1.0,4.0,7.0,12.0,42.0
HouseFloor,10000.0,12.6094,6.775974,0.0,9.0,13.0,17.0,117.0
HouseYear,10000.0,3990.1663,200500.261427,1910.0,1974.0,1977.0,2001.0,20052010.0
Ecology_1,10000.0,0.118858,0.119025,0.0,0.017647,0.075424,0.195781,0.5218671
Social_1,10000.0,24.687,17.532614,0.0,6.0,25.0,36.0,74.0


In [34]:
# Найдем многокомнатные квартиры с площадью меньше медианы
df_train.loc[(df_train['Square'] < Square_median) & (df_train['Rooms'] > 1)]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
212,1748,88,2.0,5.497061,67.628717,1.0,24,22.0,1977,0.127376,B,B,43,8429,3,900.0,3,9,B,412511.088764
4739,12676,81,3.0,13.784865,15.988889,7.0,4,5.0,1960,0.319809,B,B,25,4756,16,2857.0,5,8,B,78388.806186
4900,4504,27,3.0,4.390331,5.610772,1.0,8,19.0,2016,0.211401,B,B,9,1892,0,900.0,0,1,B,161379.067034
5617,15877,57,3.0,16.547242,16.698571,8.0,5,5.0,1962,0.133215,B,B,49,11395,3,1406.0,3,4,A,77835.185808
6945,12927,62,2.0,17.414181,-2.585819,1.0,4,20.0,1977,0.072158,B,B,2,629,1,900.0,0,0,A,180750.471749
8030,13265,1,3.0,4.823679,79.767964,0.0,6,17.0,1977,0.007122,B,B,1,264,0,900.0,0,1,B,237716.681261


In [35]:
# Заменим количество комнат на 1
df_train.loc[(df_train['Square'] < Square_median) & (df_train['Rooms'] > 1), 'Rooms'] = 1

In [36]:
# Сделаем предположение, что где площадь квартиры меньше 10 - ошиблись в порядке и умножим эти данные на 10
df_train.loc[df_train['Square'] < 10, 'Square'] = df_train['Square'] * 10

In [37]:
df_train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
DistrictId,10000.0,50.4008,43.587592,0.0,20.0,36.0,75.0,209.0
Rooms,10000.0,1.8878,0.812574,1.0,1.0,2.0,2.0,5.0
Square,10000.0,56.355745,20.995512,11.368588,41.784623,52.519197,65.900625,641.0652
LifeSquare,10000.0,37.865874,77.315863,-2.585819,22.931603,33.3019,45.835528,7480.592
KitchenSquare,10000.0,6.2733,28.560917,0.0,1.0,6.0,9.0,2014.0
Floor,10000.0,8.5267,5.241148,1.0,4.0,7.0,12.0,42.0
HouseFloor,10000.0,12.6094,6.775974,0.0,9.0,13.0,17.0,117.0
HouseYear,10000.0,3990.1663,200500.261427,1910.0,1974.0,1977.0,2001.0,20052010.0
Ecology_1,10000.0,0.118858,0.119025,0.0,0.017647,0.075424,0.195781,0.5218671
Social_1,10000.0,24.687,17.532614,0.0,6.0,25.0,36.0,74.0


In [38]:
# Максимальная площадь 641, что выбивается из общей картины. Посмотрим квартиры с площадью больше 200
df_train.loc[df_train['Square'] > 200]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
1981,7917,27,5.0,212.932361,211.231125,0.0,2,3.0,2008,0.211401,B,B,9,1892,0,900.0,0,1,B,302211.260887
1982,5548,86,5.0,275.645284,233.949309,26.0,12,37.0,2011,0.161976,B,A,31,7010,5,4508.0,3,7,B,455264.882666
4262,28,9,2.0,604.705972,584.705972,1.0,17,18.0,1977,0.161532,B,B,25,5648,1,30.0,2,4,B,187717.242538
4690,2307,102,1.0,409.425181,410.639749,10.0,4,4.0,2016,0.238617,B,B,26,3889,6,705.0,3,6,B,90470.43083
6977,11602,30,2.0,641.065193,638.163193,10.0,20,19.0,2019,7.8e-05,B,B,22,6398,141,1046.0,3,23,B,133529.681562
9910,16568,27,4.0,200.334539,201.627361,25.0,1,2.0,2013,0.041116,B,B,53,14892,4,900.0,1,4,B,528560.506016


In [39]:
# В квартирах с площадью больше 400 заменим количество комнат на 6
df_train.loc[df_train['Square'] > 400, 'Rooms'] = 6

In [41]:
# Проверим, что замена произошла
df_train['Rooms'].value_counts()

2.0    3880
1.0    3714
3.0    2233
4.0     150
5.0      20
6.0       3
Name: Rooms, dtype: int64

In [44]:
# Минимальная LifeSquare отрицательная. Посмотрим квартиры у которых LifeSquare меньше 10, а Square больше 20
df_train.loc[(df_train['LifeSquare'] < 10) & (df_train['Square'] > 100)]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
26,4378,27,3.0,106.958871,0.641822,0.0,17,0.0,2018,0.072158,B,B,2,629,1,900.0,0,0,A,337299.867936
751,11989,62,3.0,104.446306,1.451067,1.0,12,15.0,2015,0.072158,B,B,2,629,1,900.0,0,0,A,223692.67248
1918,13603,63,1.0,127.651237,2.609712,1.0,8,17.0,2016,0.238617,B,B,26,3889,6,705.0,3,6,B,594565.29977
3547,14398,54,3.0,113.329819,5.949812,1.0,10,1.0,2014,0.006076,B,B,30,5285,0,645.0,6,6,B,585031.267181
4514,4680,54,3.0,107.558419,4.110708,1.0,13,1.0,2016,0.006076,B,B,30,5285,0,645.0,6,6,B,577122.841727
9813,2040,62,1.0,122.438148,5.751401,1.0,2,3.0,1977,0.072158,B,B,2,629,1,900.0,0,0,A,224900.897679
9828,13630,24,3.0,127.141143,4.811405,1.0,3,1.0,2017,0.111627,B,B,50,12238,8,1970.0,2,3,B,611250.912034


In [45]:
# удалим столбец id
df_train = df_train[df_train.columns[:-1]]

In [46]:
df_train.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
0,14038,35,2.0,47.981561,29.442751,6.0,7,9.0,1969,0.08904,B,B,33,7976,5,900.0,0,11,B
1,15053,41,3.0,65.68364,40.049543,8.0,7,9.0,1978,7e-05,B,B,46,10309,1,240.0,1,16,B
2,4765,53,2.0,44.947953,29.197612,0.0,8,12.0,1968,0.049637,B,B,34,7759,0,229.0,1,3,B
3,5809,58,2.0,53.352981,52.731512,9.0,8,17.0,1977,0.437885,B,B,23,5735,3,1084.0,0,5,B
4,10783,99,1.0,39.649192,23.776169,7.0,11,12.0,1976,0.012339,B,B,35,5776,1,2078.0,2,4,B
