In [1]:
import pandas as pd

In [2]:
TARGET_COLUMN = 'per_square_meter_price'

raw_df = pd.read_csv('./data/raw/train.csv')
print(raw_df.shape)
raw_df.head(10)

(206474, 76)


Unnamed: 0,city,floor,id,lat,lng,osm_amenity_points_in_0.001,osm_amenity_points_in_0.005,osm_amenity_points_in_0.0075,osm_amenity_points_in_0.01,osm_building_points_in_0.001,...,reform_house_population_500,reform_mean_floor_count_1000,reform_mean_floor_count_500,reform_mean_year_building_1000,reform_mean_year_building_500,region,total_square,street,date,realty_type
0,Ставрополь,,COL_65683,45.011356,41.919273,7,71,144,225,0,...,969.0,8.439655,9.25,1990.232759,1993.5,Ставропольский край,31.0,S1842,2020-02-23,10
1,Омск,,COL_159011,54.984833,73.387344,4,40,109,206,0,...,305.0,4.75641,4.333333,1960.346154,1956.291667,Омская область,150.0,S14290,2020-05-24,10
2,Белгород,,COL_205850,50.576805,36.563267,0,9,22,44,0,...,533.0,6.435897,6.4,1980.452991,1989.633333,Белгородская область,73.0,S25733,2020-06-28,10
3,Большой Исток,,COL_229491,56.714413,60.792454,0,0,0,1,0,...,,,,,,Свердловская область,438.0,S6985,2020-07-19,110
4,Новосибирск,1.0,COL_77283,55.0552,82.89581,0,25,50,101,0,...,1188.0,6.830065,6.672414,1974.045752,1974.637931,Новосибирская область,62.7,S4778,2020-03-01,10
5,Красноярск,1.0,COL_237766,56.054585,92.92368,3,15,39,66,0,...,488.0,11.113924,11.0,2004.949367,2010.0625,Красноярский край,84.0,S2682,2020-07-26,100
6,Москва,2.0,COL_188715,55.861176,37.549427,1,39,96,133,0,...,16.0,14.909091,16.0,1994.272727,1969.0,Москва,623.0,S7194,2020-06-14,110
7,Волгоград,,COL_77130,48.67651,44.472553,16,84,90,97,0,...,1063.0,8.827586,9.148936,1983.086207,1981.659574,Волгоградская область,70.0,S26181,2020-03-01,10
8,Белгород,1.0,COL_216454,50.55753,36.56569,1,27,65,85,0,...,1750.0,9.971429,10.66,1999.809524,1999.36,Белгородская область,68.0,S4639,2020-07-05,110
9,Иркутск,,COL_262955,52.27025,104.310104,0,26,66,97,0,...,1073.0,6.338843,6.073529,1971.798354,1977.470588,Иркутская область,608.0,S19079,2020-08-16,10


In [3]:
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206474 entries, 0 to 206473
Data columns (total 76 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   city                                 206474 non-null  object 
 1   floor                                76209 non-null   float64
 2   id                                   206474 non-null  object 
 3   lat                                  206474 non-null  float64
 4   lng                                  206474 non-null  float64
 5   osm_amenity_points_in_0.001          206474 non-null  int64  
 6   osm_amenity_points_in_0.005          206474 non-null  int64  
 7   osm_amenity_points_in_0.0075         206474 non-null  int64  
 8   osm_amenity_points_in_0.01           206474 non-null  int64  
 9   osm_building_points_in_0.001         206474 non-null  int64  
 10  osm_building_points_in_0.005         206474 non-null  int64  
 11  osm_building_

In [4]:
import numpy as np


def preprocess(dataset):
    dataset = dataset.drop(columns=[
        "floor",  # too many NaNs
        "street",  # too complex to analyze
        "id",  # the same
    ])
    dataset.osm_city_nearest_population = dataset.osm_city_nearest_population.fillna(
        value=4.438900e+04  # if we don't know - city is small
    )
    for feature in [
        'reform_house_population_1000',
        'reform_house_population_500',
        'reform_mean_floor_count_1000',
        'reform_mean_floor_count_500',
        'reform_mean_year_building_1000',
        'reform_mean_year_building_500',
    ]:
        dataset[feature].fillna(dataset[feature].dropna().mean(), inplace=True)

    population = dataset.osm_city_nearest_population
    dataset = dataset.assign(
        is_near_to_moscow=(dataset.osm_city_nearest_name == 'Москва').astype(int),
        is_near_to_spb=(dataset.osm_city_nearest_name == "Санкт-Петербург").astype(int),

        is_moscow=(dataset.city == 'Москва').astype(int),
        is_spb=(dataset.city == 'Санкт-Петербург').astype(int),
        is_krr=(dataset.city == 'Краснодар').astype(int),
        is_ekb=(dataset.city == 'Екатеринбург').astype(int),

        is_large_city=(1.468833e+06 <= population).astype(int),
        is_medium_city=((1.013468e+06 <= population) & (population < 1.468833e+06)).astype(int),
        is_small_city=((2.622500e+05 <= population) & (population < 1.013468e+06)).astype(int),
        is_extra_small_city=(population <= 2.622500e+05).astype(int),

        reform_mean_age_building_500=(2020 - dataset.reform_mean_year_building_500),
        reform_mean_age_building_1000=(2020 - dataset.reform_mean_year_building_1000),
    )

    dataset = dataset.drop(columns=["city", "region", 'osm_city_nearest_name'])  # str

    dataset.date = (pd.to_datetime(dataset.date) - pd.to_datetime('2020-01-01')) / np.timedelta64(1, 'D')

    return dataset


df = preprocess(raw_df)

In [5]:
import sklearn.preprocessing as preprocessing

scaler = preprocessing.MinMaxScaler()
features = df.drop(columns=[TARGET_COLUMN]).columns
scaler.fit(df[features])
df = pd.DataFrame(scaler.transform(df[features]), index=df.index, columns=features)
del features
df[TARGET_COLUMN] = raw_df[TARGET_COLUMN]
df.head(10)

Unnamed: 0,lat,lng,osm_amenity_points_in_0.001,osm_amenity_points_in_0.005,osm_amenity_points_in_0.0075,osm_amenity_points_in_0.01,osm_building_points_in_0.001,osm_building_points_in_0.005,osm_building_points_in_0.0075,osm_building_points_in_0.01,...,is_spb,is_krr,is_ekb,is_large_city,is_medium_city,is_small_city,is_extra_small_city,reform_mean_age_building_500,reform_mean_age_building_1000,per_square_meter_price
0,0.087908,0.167018,0.152174,0.156388,0.169412,0.161638,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.013125,0.014255,17645.16129
1,0.459494,0.40562,0.086957,0.088106,0.128235,0.147989,0.0,0.001706,0.001054,0.000864,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.031554,0.029065,33166.666667
2,0.295262,0.126406,0.0,0.019824,0.025882,0.031609,0.0,0.001706,0.001054,0.003457,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.01504,0.019102,22493.150685
3,0.523934,0.310121,0.0,0.0,0.0,0.000718,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.025747,0.025477,79509.13242
4,0.462116,0.477717,0.0,0.055066,0.058824,0.072557,0.0,0.0,0.0,0.000864,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.022468,0.022277,126937.799043
5,0.49935,0.553752,0.065217,0.03304,0.045882,0.047414,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.004922,0.006963,53297.619048
6,0.492145,0.133884,0.021739,0.085903,0.112941,0.095546,0.0,0.001706,0.002107,0.001729,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.02526,0.012253,144773.675762
7,0.224462,0.186378,0.347826,0.185022,0.105882,0.069684,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.01899,0.017797,73914.285714
8,0.294544,0.126425,0.021739,0.059471,0.076471,0.061063,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.010223,0.00951,36573.529412
9,0.358356,0.640088,0.0,0.057269,0.077647,0.069684,0.0,0.0,0.002107,0.002593,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.021065,0.02339,29457.236842


In [6]:
rows_list = []
for column in df.columns:
    try:
        corr = df[TARGET_COLUMN].corr(df[column])
        var = df[column].var()
        rows_list.append({
            'feature': column,
            'corr': corr,
        })
    except TypeError:
        continue
pd.DataFrame(rows_list).sort_values('corr')

Unnamed: 0,feature,corr
49,osm_subway_closest_dist,-0.272886
1,lng,-0.223485
77,is_small_city,-0.202750
76,is_medium_city,-0.172409
14,osm_city_closest_dist,-0.137331
...,...,...
11,osm_catering_points_in_0.005,0.496335
71,is_moscow,0.542487
69,is_near_to_moscow,0.546118
15,osm_city_nearest_population,0.551361


In [7]:
def get_well_correlated_features(minimum):
    return [info['feature'] for info in rows_list if abs(info['corr']) >= minimum and info['feature'] != TARGET_COLUMN]

In [8]:
test_x = preprocess(pd.read_csv('./data/raw/test_x.csv'))
test_x = pd.DataFrame(scaler.transform(test_x), index=test_x.index, columns=test_x.columns)

In [9]:
def post_process(y, divisor: float):
    y[y < 0] = 0
    y = y / divisor
    return y

In [10]:
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression

def try_fit_with(features_list, divisor):
    selected_features = df[features_list]
    target = df[TARGET_COLUMN]

    total_errors = {'MAPE': 0}
    for train, test in KFold(n_splits=5, shuffle=True).split(selected_features, target):
        model = LinearRegression()
        model.fit(selected_features.loc[train], target.loc[train])
        predicted = post_process(model.predict(selected_features.loc[test]), divisor)
        total_errors['MAPE'] += mean_absolute_percentage_error(target.loc[test], predicted)
    return total_errors


def try_fit_several_times(features_list, n: int = 3, divisor: float = 8.5):
    total_errors = {'MAPE': 0}
    for i in range(n):
        current_result = try_fit_with(features_list, divisor)
        total_errors['MAPE'] += current_result['MAPE'] / n
    model = LinearRegression().fit(df[features_list], df[TARGET_COLUMN])
    predicted = post_process(model.predict(test_x[features_list]), divisor)
    result = pd.DataFrame(predicted, columns=[TARGET_COLUMN])
    result.index.name = 'id'
    return total_errors, result

In [11]:
errors, prediction = try_fit_several_times(get_well_correlated_features(0.0))
print(errors)
prediction.to_csv('data/submission.v1.csv')
prediction.describe()

{'MAPE': 4.530448655403449}


Unnamed: 0,per_square_meter_price
count,68825.0
mean,13090.840269
std,13812.523752
min,0.0
25%,4480.872725
50%,7278.901286
75%,17242.867033
max,97194.360879


In [12]:
errors, prediction = try_fit_several_times(get_well_correlated_features(0.1))
print(errors)
prediction.to_csv('data/submission.v2.csv')

{'MAPE': 4.559169144592245}


In [13]:
errors, prediction = try_fit_several_times(get_well_correlated_features(0.2))
print(errors)
prediction.to_csv('data/submission.v3.csv')

{'MAPE': 4.5502913240732195}


In [14]:
errors, prediction = try_fit_several_times(get_well_correlated_features(0.3))
print(errors)
prediction.to_csv('data/submission.v4.csv')

{'MAPE': 4.542648259371863}


In [15]:
errors, prediction = try_fit_several_times(get_well_correlated_features(0.35))
print(errors)
prediction.to_csv('data/submission.v5.csv')

{'MAPE': 4.538590085550156}


In [16]:
errors, prediction = try_fit_several_times(get_well_correlated_features(0.4))
print(errors)
prediction.to_csv('data/submission.v6.csv')

{'MAPE': 4.5352134995323805}


In [17]:
errors, prediction = try_fit_several_times(get_well_correlated_features(0.45))
print(errors)
prediction.to_csv('data/submission.v7.csv')

{'MAPE': 4.544508311053708}


In [18]:
errors, prediction = try_fit_several_times(get_well_correlated_features(0.5))
print(errors)
prediction.to_csv('data/submission.v8.csv')

{'MAPE': 4.55174904746275}


In [19]:
%%timeit

from skfeature.function.statistical_based import CFS

x = df.drop(columns=[TARGET_COLUMN])
y = df[TARGET_COLUMN]
cfs_based_choice =  CFS.cfs(x.values, y.values)
print(cfs_based_choice)

[66 64 14 30 26 31 79 80 16 24 39  2  5 51 18 11  8 10 53 29 43 28 44 25
  3 67 46 52 20 34 78 45 35  4 57 72  1 42 59 13 71 56 54 68 15 75 37 58
 27 62 74 40  6 22 63 69 77 17  7 41 48 47 55 49 65  0 60 73 70 23 61 33
 32 21 38 50 76 36  9 12 19]
[66 64 14 30 26 31 79 80 16  6 65 41 43 47  2 45  3 76 67 61  7 12 48  5
 28 29 60 73 69 68 22 75  0 23 77 37 62 71 34 51 70  8 58 74 46 27 57 55
 78 36  4 53 25 18 35 44 59 52 49 15 24 56 40 32 20 39 63 21 13 42  1 54
 33 72  9 50 10 19 17 38 11]
[66 64 14 30 26 31 79 80 16 57  7 73 74 24 62 11 77 19 67 60 55 21 15  6
 68 69 52 42 39 76 59 71 45 50 13 53 54 41 38 28 70 32  9 40 22  0 33 20
 58 75 18 48 65  3 47 61 51  5 37  8 44 29 46 25 35 63 43 49  2 23 36 27
 12 72 34  4 56 17 78  1 10]
[66 64 14 30 26 31 79 80 16 41 20 18 68 54  4 56 71 11 32 23 47 21 27 36
 67 55 46 34 62 76 24 58  2 53  5 43 74 40 69 48  0  8 38 61 10  6 22 63
 52 70 72 13 59 45 15 77 39 73  1 35 25  9 12 28 37 49 29 17 19 57  3 50
  7 33 75 60 42 78 65 51 44]


KeyboardInterrupt: 

In [22]:
print(len([66, 64, 14, 30, 26, 31, 79, 80, 16, 24, 39,  2,  5, 51, 18, 11,  8, 10, 53, 29, 43, 28, 44, 25,
           3, 67, 46, 52, 20, 34, 78, 45, 35,  4, 57, 72,  1, 42, 59, 13, 71, 56, 54, 68, 15, 75, 37, 58,
           27, 62, 74, 40,  6, 22, 63, 69, 77, 17,  7, 41, 48, 47, 55, 49, 65,  0, 60, 73, 70, 23, 61, 33,
           32, 21, 38, 50, 76, 36,  9, 12, 19]))
print(len(x.columns))

81


NameError: name 'x' is not defined