In [18]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
import sklearn as sk
from xgboost import XGBRegressor

from hyperopt import tpe, hp, fmin, STATUS_OK,Trials
from hyperopt.pyll.base import scope
import gc
import warnings
warnings.filterwarnings('ignore')

In [104]:
def preprocess(df):

    del df['Код']
    del df['Адрес']
    del df['Год постройки']
    del df['Объект']
    del df['Поселение']
    
    #Replace NaN values
    for col in df.columns.tolist():
        #print(col)
        if df[col].dtype == object:
            df[col] = df[col].fillna(df[col].mode()[0])
            #Get Dummies
            df = pd.get_dummies(df, columns=[col])
        else:
            df[col] = df[col].fillna(df[col].mean())   
           
        print("done col=",col)
    
   
    
    
    return df


# Reducing mem usage by changing datatypes

def chage_dtypes(df):
    for col in df.columns.tolist():
        if df[col].dtype == 'float64':
            df[col].dtype =='float16'
    return df

def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df



def na_dropper(data):
    tot = data.shape[0]
    for col in data.columns:
        mis = data[col].isna().sum()
        if ((mis/tot) > 0.60) and ('transactionRevenue' not in col): # quick escape from making a mistake
            print("The column {} will be dropped because more than 60% of the entries are missing".format(col))
            del data[col]
    return data


def std_dropper(data,target_column):
    data.reset_index()
    std = data[target_column].std()
    median = data[target_column].median()
    print('rows berofe dropping {}'.format(data.shape[0]))
    print('lower',median-0.5*median)
    print('upper',median+0.5*median)
    
    print('lower indexes',data[data[target_column]<median-0.5*median].index)
    print('upper indexes',data[data[target_column]<median-0.5*median].index)
    data = data[data[target_column]>median-0.5*median]

    data = data[data[target_column]<median+0.5*median]
    print('rows after dropping {}'.format(data.shape[0]))
    return data
    






In [89]:
train = pd.read_csv('new_train.csv')


In [90]:
#Data preprocessing

train = na_dropper(train)
train = preprocess(train)

The column Посел. район will be dropped because more than 60% of the entries are missing
The column Тип санузла will be dropped because more than 60% of the entries are missing
done col= Терр. единица
done col= Терр. район
done col= Округ
done col= Улица
done col= Дата регистрации
done col= Цена
done col= Форма собственности
done col= Назначение
done col= Форма использования
done col= Площадь строения
done col= Площадь зем.участка
done col= Матер. Стен
done col= Тип кровли
done col= Этаж
done col= Кол-во этажей в здании
done col= Отопление
done col= Газификация
done col= Серия квартиры
done col= Наличие бани/сауны
done col= Кол-во комнат


In [105]:
train =chage_dtypes(train)
train = std_dropper(train,'Цена')


targets = train['Цена']
del train['Цена']

rows berofe dropping 178803
lower 1263.5
upper 3790.5
lower indexes Int64Index([    29,    122,    145,    170,    172,    216,    234,    235,
               236,    241,
            ...
            178793, 178794, 178795, 178796, 178797, 178798, 178799, 178800,
            178801, 178802],
           dtype='int64', length=62419)
upper indexes Int64Index([    29,    122,    145,    170,    172,    216,    234,    235,
               236,    241,
            ...
            178793, 178794, 178795, 178796, 178797, 178798, 178799, 178800,
            178801, 178802],
           dtype='int64', length=62419)
rows after dropping 45233


In [106]:
x_train,x_test,y_train,y_test = sk.model_selection.train_test_split(train,targets)
del train 
del targets

In [107]:
#feat selection
from sklearn.feature_selection import SelectFromModel
lr = sk.linear_model.LinearRegression()
lr.fit(x_train,y_train)
selector = SelectFromModel(estimator=lr)
selector.fit(x_train,y_train)
print('Before feature selection',x_train.shape[1])
x_train =selector.transform(x_train)
print('after feature selection',x_train.shape[1])    
x_test = selector.transform(x_test)
print('Estimator coefs',lr.coef_)
print('Threshold',selector.threshold_)

Before feature selection 4720
after feature selection 239
Estimator coefs [-1.23549541e-04 -1.25471319e-06 -8.38180365e-01 ...  3.05606719e+07
  3.05606344e+07  3.05606718e+07]
Threshold 626737.1846924172


In [108]:
print(x_train.shape)

(33924, 239)


In [173]:
#Building model
from sklearn.ensemble import RandomForestRegressor
model = sk.ensemble.AdaBoostRegressor(
                                            n_estimators=55
                                              #,n_jobs=-1
                                             )
model.fit(x_train,y_train)
pred = model.predict(x_test)


In [174]:
print('r2 score',sk.metrics.r2_score(y_test,pred))

r2 score 0.056872057259557796


In [175]:
print(pred)

[2343.47803928 2610.6822753  2343.47803928 ... 2563.23248239 2343.47803928
 2437.74445101]


In [176]:
print(y_test)

         Цена
0      1444.0
1      2888.0
2      2707.0
3      1386.0
4      1624.0
...       ...
11304  2346.0
11305  1805.0
11306  2490.0
11307  2707.0
11308  2527.0

[11309 rows x 1 columns]


In [177]:
print(sk.metrics.mean_absolute_error(y_test,pred))

617.8839440021076


In [178]:
print(sk.metrics.r2_score(y_train,model.predict(x_train)))

0.056836724831332286


In [179]:
y_test.describe()

Unnamed: 0,Цена
count,11309.0
mean,2423.39305
std,740.257975
min,1267.0
25%,1805.0
50%,2346.0
75%,3018.0
max,3790.0


In [180]:
pred=pd.DataFrame(pred)
pred.describe()

Unnamed: 0,0
count,11309.0
mean,2475.956191
std,114.548484
min,2275.469952
25%,2388.358776
50%,2463.082992
75%,2542.386328
max,2918.037409


In [None]:
def hyperparameter_tuning(params):
    regr = sk.ensemble.RandomForestClassifier(n_jobs=-1)
    regr.fit(x_train,y_train)
    acc = sk.metrics.r2_score(y_test,regr.predict(x_test))
    print(acc)
    return {"loss": -acc, "status": STATUS_OK}


space = {
    "n_estimators": hp.quniform('n_estimators',100,10000,250),
    #"criterion": hp.choice("criterion",["friedman_mse","mse","mae"])

}

#Initialize trials object
trials = Trials()

best = fmin(
    fn=hyperparameter_tuning,
    space = space, 
    algo=tpe.suggest, 
    max_evals=50, 
    trials=trials
)

print("Best: {}".format(best))


  0%|                                                                           | 0/50 [00:00<?, ?trial/s, best loss=?]