In [1]:
import datetime
import gc

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd 
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_val_score

from lightgbm import LGBMRegressor

%matplotlib inline

In [2]:
def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        if (col_type != object) and (str(col_type) != 'category'):
            c_min = df[col].min()
            c_max = df[col].max()
            
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        elif col_type == object:
            df[col] = df[col].astype('category')
    
    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [3]:
class FeatureImputer:
    """Заполнение пропусков и облработка выбросов"""
    
    def __init__(self):
        self.medians=None
        
    def fit(self, X):
        self.medians = X.median()
    
    def transform(self, X):
        
        # Rooms
        X['Rooms_outlier'] = 0
        X.loc[(X['Rooms'] == 0) | (X['Rooms'] >= 6), 'Rooms_outlier'] = 1
        
        X.loc[X['Rooms'] == 0, 'Rooms'] = 1
        X.loc[X['Rooms'] >= 6, 'Rooms'] = self.medians['Rooms']
        
        # KitchenSquare
        X.loc[X['KitchenSquare'] < 5, 'KitchenSquare'] = 0
        X.loc[X['KitchenSquare'] > 300, 'KitchenSquare'] = self.medians['KitchenSquare']
        
        # Square
        X.loc[(X['Square'] < 12) & (X['LifeSquare'] > 12), 'Square'] =\
            (X.loc[(X['Square'] < 12) & (X['LifeSquare'] > 12), 'LifeSquare'] +\
            X.loc[(X['Square'] < 12) & (X['LifeSquare'] > 12), 'KitchenSquare'])
        
        # HouseFloor, Floor
        X['HouseFloor_outlier'] = 0
        X.loc[(X['HouseFloor'] > 50) | (X['HouseYear'] < 1900) | (X['HouseYear'] > 2020), 'HouseYear_outlier'] = 1
        X.loc[(X['HouseFloor'] > 50) | ((X['HouseFloor'] <= 1) & (X['Floor'] > 1)), 'HouseFloor'] = self.medians['HouseFloor']
        X.loc[(X['HouseFloor'] <= 1) & (X['Floor'] <= 1), 'HouseFloor'] = 1
        X.loc[(X['HouseFloor'] < X['Floor']), 'HouseFloor'] = X.loc[(X['HouseFloor'] < X['Floor']), 'HouseFloor'] +\
            X.loc[(X['HouseFloor'] < X['Floor']), 'Floor']
        
        # HouseYear
        X['HouseYear_outlier'] = 0
        X.loc[(X['HouseYear'].isna()) | (X['HouseYear'] < 1900) | (X['HouseYear'] > 2020), 'HouseYear_outlier'] = 1
        X.loc[X['HouseYear'].isna() | (X['HouseYear'] < 1900) | (X['HouseYear'] > 2020), 'HouseYear'] = self.medians['HouseYear']
        
        # Healthcare_1
        if 'Healthcare_1' in X.columns:
            X.drop('Healthcare_1', axis=1, inplace=True)
            
        # LifeSquare
        X['LifeSquare_nan'] = X['LifeSquare'].isna() * 1
        
        condition = (X['LifeSquare'].isna()) &\
                      (~X['Square'].isna()) & \
                      (~X['KitchenSquare'].isna())
        
        X.loc[condition, 'LifeSquare'] = X.loc[condition, 'Square'] - X.loc[condition, 'KitchenSquare']
        
        X.loc[X['LifeSquare'] > X['Square'], 'LifeSquare'] =\
            (X.loc[X['LifeSquare'] > X['Square'], 'Square'] - X.loc[X['LifeSquare'] > X['Square'], 'KitchenSquare'])
        
        return X

In [2]:
class FeatureGenetator():
    
    def __init__(self):
        self.DistrictId_counts = None
        self.binary_to_numbers = None
        self.med_price_by_district = None
        self.average_price_per_square_meter = None
        
    def fit(self, X, y=None):
        
        X = X.copy()
        
        # DistrictID
        district = X['DistrictId'].value_counts()
        
        district = district[district > 50] 
        self.DistrictId_counts = district.to_dict()
        
        # Binary features
        self.binary_to_numbers = {'A': 0, 'B': 1}
        
        # Target encoding
        ## District
        df = X.copy()     
        
        if y is not None:
            df['Price'] = y.values
            
            df['DistrictId_popular'] = df['DistrictId'].copy()
            df.loc[~df['DistrictId_popular'].isin(district.keys().tolist())] = np.nan
            
            self.med_price_by_district = df.groupby(['DistrictId_popular', 'Rooms'], as_index=False).agg({'Price':'median'}).\
                                            rename(columns={'Price':'MedPriceByDistrict'})
            
            df['PricePerSquareMeter'] = df['Price'] / df['Square']
            self.average_price_per_square_meter = df.groupby('DistrictId_popular', as_index=False).\
                                                agg({'PricePerSquareMeter':'median'}).\
                                                rename(columns={'PricePerSquareMeter':'AveragePricePerSquareMeter',
                                                           'DistrictId_popular': 'DistrictId'})
            
        ## floor, year
        if y is not None:
            df['Price'] = y.values
            df = self.floor_to_cat(df)
            df = self.year_to_cat(df)
            self.med_price_by_floor_year = df.groupby(['year_cat', 'floor_cat'], as_index=False).agg({'Price':'median'}).\
                                            rename(columns={'Price':'MedPriceByFloorYear'})
        

        
    def transform(self, X):
        
        # DistrictId
        X['DistrictId_count'] = X['DistrictId'].map(self.DistrictId_counts)
        
        X['new_district'] = 0
        X.loc[X['DistrictId_count'].isna(), 'new_district'] = 1
        
        X['DistrictId_count'].fillna(5, inplace=True)
        
        # Binary features
        X['Ecology_2'] = X['Ecology_2'].map(self.binary_to_numbers)
        X['Ecology_3'] = X['Ecology_3'].map(self.binary_to_numbers)
        X['Shops_2'] = X['Shops_2'].map(self.binary_to_numbers)
        
        # More categorical features
        X = self.floor_to_cat(X)
        X = self.year_to_cat(X) 
        
        # Target encoding
        if self.med_price_by_district is not None:
            X = X.merge(self.med_price_by_district, on=['DistrictId', 'Rooms'], how='left')
        if self.med_price_by_floor_year is not None:
            X = X.merge(self.med_price_by_floor_year, on=['year_cat', 'floor_cat'], how='left')
        if self.average_price_per_square_meter is not None:
            X = X.merge(self.average_price_per_square_meter, on='DistrictId', how='left')
            
        return X
    
    @staticmethod
    def floor_to_cat(X):
        
        X['floor_cat'] = np.nan
        
        X.loc[X['Floor'] < 3, 'floor_cat'] = 1  
        X.loc[(X['Floor'] >= 3) & (X['Floor'] <= 5), 'floor_cat'] = 2
        X.loc[(X['Floor'] > 5) & (X['Floor'] <= 9), 'floor_cat'] = 3
        X.loc[(X['Floor'] > 9) & (X['Floor'] <= 15), 'floor_cat'] = 4
        X.loc[(X['Floor'] > 15) & (X['Floor'] <= 24), 'floor_cat'] = 5
        X.loc[X['Floor'] > 24, 'floor_cat'] = 6  
        return X
     
    @staticmethod
    def year_to_cat(X):
        
        X['year_cat'] = np.nan
        
        X.loc[X['HouseYear'] < 1941, 'year_cat'] = 1
        X.loc[(X['HouseYear'] >= 1941) & (X['HouseYear'] <= 1945), 'year_cat'] = 2
        X.loc[(X['HouseYear'] > 1945) & (X['HouseYear'] <= 1980), 'year_cat'] = 3
        X.loc[(X['HouseYear'] > 1980) & (X['HouseYear'] <= 2000), 'year_cat'] = 4
        X.loc[(X['HouseYear'] > 2000) & (X['HouseYear'] <= 2010), 'year_cat'] = 5
        X.loc[(X['HouseYear'] > 2010), 'year_cat'] = 6
            
        return X

In [5]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [6]:
train_data = reduce_mem_usage(train_data)
test_data = reduce_mem_usage(test_data)

Memory usage of dataframe is 1.53 MB
Memory usage after optimization is: 0.49 MB
Decreased by 68.1%
Memory usage of dataframe is 0.72 MB
Memory usage after optimization is: 0.21 MB
Decreased by 70.3%


In [7]:
train_data = train_data[(train_data['Square'] > 12)]

In [8]:
X = train_data.drop('Price', axis=1)
y = train_data[['Price']]

In [9]:
preds_final = pd.DataFrame()
preds_final['Id'] = test_data['Id'].copy()

In [10]:
X.set_index('Id', inplace=True)
test_data.set_index('Id', inplace=True)

In [11]:
print('Строк в трейне:' ,  X.shape[0])
print('Строк в тесте', test_data.shape[0])

# Удалим ненужные файлы
del train_data 
gc.collect()  

Строк в трейне: 9988
Строк в тесте 5000


87

In [16]:
X['DistrictId'] = X['DistrictId'].astype('category')
test_data['DistrictId'] = test_data['DistrictId'].astype('category')

In [17]:
imputer = FeatureImputer()
imputer.fit(X)

X = imputer.transform(X)
test_data = imputer.transform(test_data)

In [18]:
features = FeatureGenetator()
features.fit(X, y)

X = features.transform(X)
test_data = features.transform(test_data)

ValueError: Length of values does not match length of index

In [15]:
model = xgboost.XGBClassifier()
model.fit(X, y)

NameError: name 'xgboost' is not defined