In [310]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
%matplotlib inline

In [311]:
# Read both splits and stack them into one frame: train rows first, then test
# rows, with a fresh 0..n-1 index.
train, test = (pd.read_csv(path) for path in ("train.csv", "test.csv"))
df = pd.concat([train, test], sort=False, ignore_index=True)

In [312]:
# Peek at the last rows of the combined frame. They come from test.csv
# (appended after train.csv above), so their Price is missing.
df.tail()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
14995,8180,11,3.0,67.133911,50.809797,6.0,5,9.0,1973,0.00017,B,B,36,5992,0,,1,1,B,
14996,4695,1,1.0,40.198472,21.807061,10.0,12,17.0,2017,0.007122,B,B,1,264,0,,0,1,B,
14997,5783,12,3.0,77.842178,48.282625,9.0,23,22.0,1989,0.090799,B,B,74,19083,2,,5,15,B,
14998,4780,62,2.0,81.305222,,0.0,4,0.0,1977,0.072158,B,B,2,629,1,,0,0,A,
14999,12504,30,2.0,60.555693,,1.0,10,17.0,1977,7.8e-05,B,B,22,6398,141,1046.0,3,23,B,


In [313]:
# Minimal preprocessing so that the models can be run on the frame at all.
# The three letter-coded columns become 0/1 flags (1 where the value is "A").
binary_cols = ["Ecology_2", "Ecology_3", "Shops_2"]
df[binary_cols] = df[binary_cols].eq("A").astype(int)

# Missing Healthcare_1 -> median of the column; missing Helthcare_2 -> 0.
healthcare_median = df["Healthcare_1"].median()
df["Healthcare_1"] = df["Healthcare_1"].fillna(healthcare_median)
df["Helthcare_2"] = df["Helthcare_2"].fillna(0)

In [314]:
#test learning function 
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import  RandomForestRegressor
from sklearn.metrics import mean_squared_error , r2_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.preprocessing import MinMaxScaler

def test_learning(data, random, percent):
    """Fit three baseline regressors on ``data`` and print their hold-out R^2.

    The table is cast to float64, missing values are imputed with column
    medians, every column (features and target alike) is min-max scaled, and
    the rows are split into a train part and a hold-out part.
    LinearRegression, RandomForestRegressor and SVR are fitted on the train
    part and scored with ``r2_score`` on the hold-out part.

    Parameters
    ----------
    data : pandas.DataFrame or numpy.ndarray
        Numeric table whose LAST column is the regression target; all other
        columns are used as features.
    random : int
        Seed passed to ``train_test_split`` and ``RandomForestRegressor``.
    percent : float
        Passed to ``train_test_split`` as ``test_size``.

    Returns
    -------
    None
        The three scores are printed, not returned.
    """
    # Wrap raw arrays first so everything below can rely on the DataFrame API.
    if isinstance(data, np.ndarray):
        data = pd.DataFrame(data)
    data = data.astype(np.float64)

    # Impute every column with its own median. The earlier
    # ``data.median(axis=1)`` produced per-ROW medians, and ``fillna`` aligns
    # a Series on column labels, so gaps were either left untouched or filled
    # with the median of an unrelated row.
    data = data.fillna(data.median())

    # NOTE(review): the scaler is fitted on the whole table before the split,
    # so hold-out rows influence the scaling. Left unchanged on purpose.
    scaled = pd.DataFrame(MinMaxScaler().fit_transform(data))

    X, y = scaled.iloc[:, :-1], scaled.iloc[:, -1:]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=percent, random_state=random
    )
    # Forest and SVR expect a 1-D target vector.
    y_train_flat = y_train.values[:, 0]

    linear_model = LinearRegression()
    linear_model.fit(X_train, y_train)
    linear = r2_score(y_test, linear_model.predict(X_test))

    forest_model = RandomForestRegressor(
        random_state=random, n_jobs=-1, max_depth=15, n_estimators=1000
    )
    forest_model.fit(X_train, y_train_flat)
    forest = r2_score(y_test, forest_model.predict(X_test))

    svr_model = SVR(gamma="auto")
    svr_model.fit(X_train, y_train_flat)
    svr = r2_score(y_test, svr_model.predict(X_test))

    print(f'LinearRegression ---{linear}')
    print(f'RandomForestRegressor ---{forest}')
    print(f'Support Vector Regression ---{svr}')


# DATA PREPARATION

### 1. Initial learning on raw data 

In [315]:
# test_learning(df[:10000], 42, 0.2)

# LinearRegression ---0.4379236902063941
# RandomForestRegressor ---0.7034980538088529
# Support Vector Regression ---0.5154912764143618

### 2. Fixing anomalous values

In [316]:
# Hand-corrected room counts.
# Rows with 0 rooms or more than 5 rooms receive one replacement value each,
# listed in the order the rows appear in the frame.
implausible_rooms = (df["Rooms"] == 0) | (df["Rooms"] > 5)
room_index = df.index[implausible_rooms]
df.loc[room_index, "Rooms"] = [2, 6, 1, 6, 2, 1, 2, 3, 1, 1, 3, 2, 6, 6, 3, 6, 2]

# A further set of rows, addressed by label, is set to one room.
single_room_rows = [3271, 4431, 8551, 10263, 4739, 5617]
df.loc[single_room_rows, "Rooms"] = 1

In [317]:
# Hand-corrected total areas, keyed by row label -> new Square value.
square_fixes = {4690: 41, 4262: 60, 6977: 64, 6945: 74, 4328: 80}
df.loc[list(square_fixes.keys()), "Square"] = list(square_fixes.values())

In [318]:
# Floor / HouseFloor fixing.
# The statements below depend on each other and must run in this order.

# Rows whose HouseFloor exceeds 60 get hand-picked replacements; the list must
# hold exactly one value per matched row, in frame order.
HouseF_index = df[(df["HouseFloor"] > 60)].index
df.loc[HouseF_index, "HouseFloor"] = [9, 9, 11, 9]
# Single hand-picked correction for the row labelled 14698.
df.loc[14698, "Floor"] = [8]
# Series.where keeps the left-hand values where the condition is True and takes
# `other` elsewhere: a Floor of 0 becomes 0 + median, any other Floor is kept.
df["Floor"] = (df["Floor"] + df["Floor"].median()).where(df["Floor"] == 0, df["Floor"].values)
# Where Floor > HouseFloor the two columns are swapped; other rows are kept.
df[["HouseFloor","Floor"]] = df[["Floor","HouseFloor"]] .where(df["Floor"] > df["HouseFloor"],df[["HouseFloor","Floor"]].values)
# A Floor that is still 0 at this point becomes 1.
# NOTE(review): presumably this catches rows where the swap above moved a
# HouseFloor of 0 into Floor - confirm.
df["Floor"] = (df["Floor"] + 1).where(df["Floor"] == 0, df["Floor"].values)

In [319]:
# HouseYear fixing: rows with a construction year after 2020 get hand-picked
# replacements, one per matched row in frame order.
year_in_future = df["HouseYear"] > 2020
HouseY_index = df.index[year_in_future]
df.loc[HouseY_index, "HouseYear"] = [2011, 1968]

In [320]:
# Square / LifeSquare fixing.
# Every statement reads the result of the previous ones, so order matters.
# Pattern used throughout: (new_value).where(cond, old_values) applies
# new_value only on rows where cond is True.

# Square below 10 is multiplied by 10.
df["Square"] = (df["Square"] * 10).where(df.Square < 10, df["Square"].values)
# LifeSquare below 1 is multiplied by 100, then anything still below 10 by 10.
df["LifeSquare"] = (df["LifeSquare"] * 100).where(df["LifeSquare"] < 1, df["LifeSquare"].values)
df["LifeSquare"] = (df["LifeSquare"] * 10).where(df["LifeSquare"] < 10, df["LifeSquare"].values)
# LifeSquare above 200 in flats with fewer than 4 rooms is divided by 10.
# The rule is applied twice, so a value that is still above 200 after the
# first pass is divided by 10 again.
df["LifeSquare"] = (df["LifeSquare"] / 10 ).where((df["LifeSquare"]>200) & (df["Rooms"]< 4),df["LifeSquare"].values)
df["LifeSquare"] = (df["LifeSquare"] / 10 ).where((df["LifeSquare"]>200) & (df["Rooms"]< 4),df["LifeSquare"].values)
# Rows where LifeSquare and Square differ by more than 80 get hand-picked
# LifeSquare values; the list must hold one value per matched row.
diff = df[abs(df["LifeSquare"] - df["Square"]) > 80 ].index
df.loc[diff, "LifeSquare"] = [107, 145,26,100,184,16,187,145,133,176,69,175,104,49,118,138]
# Further hand-picked corrections addressed by row label.
df.loc[[7767,1195,12095  ], "LifeSquare"] = [18,16,86]
df.loc[[1918, 13697], "Square"] = [27,33]
# Missing LifeSquare is imputed as 60% of Square.
df["LifeSquare"] = (df["Square"] * 0.6).where(df["LifeSquare"].isnull(), df["LifeSquare"].values)
# Where LifeSquare exceeds Square the two columns are swapped.
df[["LifeSquare","Square"]] = df[["Square", "LifeSquare"]] .where(df["LifeSquare"] > df["Square"],
                                                                  df[["LifeSquare", "Square"]].values)
                                                       

In [321]:
# KitchenSquare fixing: a kitchen larger than the whole flat gets a
# hand-picked replacement, one value per matched row in frame order.
kitchen_too_big = df["KitchenSquare"] > df["Square"]
ksq_index = df.index[kitchen_too_big]
df.loc[ksq_index, "KitchenSquare"] = [14, 7, 8, 7, 6, 3, 4, 9]

In [322]:
# Other fixing
# Each rule is (column, threshold, transform): values above the threshold are
# replaced by transform(value), everything else is left as it is.
outlier_rules = (
    ("Social_2", 15000, lambda col: col % 10000),
    ("Social_3", 100, lambda col: col % 100),
    ("Healthcare_1", 3000, lambda col: col // 10),
)
for column, threshold, transform in outlier_rules:
    current = df[column]
    df[column] = transform(current).where(current > threshold, current.values)

### 3. Test learning on cleaned data

In [323]:
# test_learning(df[:10000], 42, 0.2)
# LinearRegression ---0.533895647100507
# RandomForestRegressor ---0.7104953865731296
# Support Vector Regression ---0.5866046443403021

In [324]:
#assert False

### 4. Add more features

In [325]:
# mean price feature
#
# Average Price of every (DistrictId, Rooms) group, computed from the leading
# 10000 rows of the frame only.
# NOTE(review): a row's own Price contributes to its group mean, so scores
# measured on a split of these same rows may be optimistic.
rooms_means = (
    df[:10000]
    .groupby(["DistrictId", "Rooms"], as_index=False)["Price"]
    .mean()
    .rename(columns={"Price": "mean_price"})
)

In [326]:
# Attach the group average to every row; (DistrictId, Rooms) pairs that are
# absent from rooms_means end up with a missing mean_price.
df = pd.merge(df, rooms_means, how="left", on=["DistrictId", "Rooms"])

In [327]:
# Check dtypes and non-null counts after the merge; rows whose
# (DistrictId, Rooms) pair has no group average show up as missing mean_price.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15000 entries, 0 to 14999
Data columns (total 21 columns):
Id               15000 non-null int64
DistrictId       15000 non-null int64
Rooms            15000 non-null float64
Square           15000 non-null float64
LifeSquare       15000 non-null float64
KitchenSquare    15000 non-null float64
Floor            15000 non-null float64
HouseFloor       15000 non-null float64
HouseYear        15000 non-null int64
Ecology_1        15000 non-null float64
Ecology_2        15000 non-null int32
Ecology_3        15000 non-null int32
Social_1         15000 non-null int64
Social_2         15000 non-null int64
Social_3         15000 non-null int64
Healthcare_1     15000 non-null float64
Helthcare_2      15000 non-null int64
Shops_1          15000 non-null int64
Shops_2          15000 non-null int32
Price            10000 non-null float64
mean_price       14934 non-null float64
dtypes: float64(10), int32(3), int64(8)
memory usage: 2.3 MB


In [328]:
# auxiliary mean price

In [329]:
# Work on a copy in which Square is replaced by the index of an equal-width
# bin, computed separately inside every (DistrictId, Rooms) group.
# NOTE(review): with this many bins per group, most rows presumably end up
# alone in their bin - confirm that this granularity is intended.
cutted = df.copy()
n_bins = 20000
bin_labels = range(n_bins)
for district in np.unique(cutted["DistrictId"]):
    in_district = cutted["DistrictId"] == district
    for rooms in np.unique(cutted.loc[in_district, "Rooms"]):
        group = in_district & (cutted["Rooms"] == rooms)
        cutted.loc[group, "Square"] = pd.cut(
            cutted.loc[group, "Square"].values, n_bins, labels=bin_labels
        )

In [330]:
# Mean Price per (DistrictId, Rooms, Square bin).
table = (
    cutted.groupby(["DistrictId", "Rooms", "Square"], as_index=False)["Price"]
    .mean()
    .rename(columns={"Price": "mean"})
)

# Mean Price per (Rooms, Square bin) over all districts.
# The column is called "Median" although the aggregate is a mean.
med_price = (
    cutted.groupby(["Rooms", "Square"], as_index=False)["Price"]
    .mean()
    .rename(columns={"Price": "Median"})
)

# Median Price per room count.
med_price1 = (
    cutted.groupby(["Rooms"], as_index=False)["Price"]
    .median()
    .rename(columns={"Price": "Median1"})
)

In [331]:
# Build the fallback chain from the most specific level to the coarsest one.

# (Rooms, Square bin) level: gaps are filled from the per-Rooms median.
med_price = (
    med_price.merge(med_price1, on=["Rooms"], how="left")
    .assign(Median=lambda t: t["Median"].fillna(t["Median1"]))
    .drop("Median1", axis=1)
)

# (DistrictId, Rooms, Square bin) level: gaps are filled from the level above.
table = (
    table.merge(med_price, on=["Rooms", "Square"], how="left")
    .assign(mean=lambda t: t["mean"].fillna(t["Median"]))
    .drop("Median", axis=1)
)

# Give every row of the binned copy its (possibly back-filled) "mean".
cutted = cutted.merge(table, on=["DistrictId", "Rooms", "Square"], how="left")


In [332]:
# Attach the auxiliary "mean" column to df.
# `cutted` began as df.copy() and has since only been left-merged against a
# groupby result (unique keys), so it still matches df row for row.
# Copying the column by position avoids a join on "Id", which would silently
# multiply rows - and break the positional [:10000] / [10000:] slicing used
# in this notebook - if any Id value occurred more than once.
assert len(cutted) == len(df), "cutted is expected to stay row-aligned with df"
df = df.assign(**{"mean": cutted["mean"].values})

In [333]:
# Preview the frame with the two new columns, mean_price and mean.
df.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price,mean_price,mean
0,14038,35,2.0,47.981561,29.442751,6.0,7.0,9.0,1969,0.08904,...,33,7976,5,900.0,0,11,0,184966.93073,213160.076297,184966.93073
1,15053,41,3.0,65.68364,40.049543,8.0,7.0,9.0,1978,7e-05,...,46,10309,1,240.0,1,16,0,300009.450063,303643.563728,300009.450063
2,4765,53,2.0,44.947953,29.197612,0.0,8.0,12.0,1968,0.049637,...,34,7759,0,229.0,1,3,0,220925.908524,259938.729121,220925.908524
3,5809,58,2.0,53.352981,52.731512,9.0,8.0,17.0,1977,0.437885,...,23,5735,3,1084.0,0,5,0,175616.227217,172368.882269,175616.227217
4,10783,99,1.0,39.649192,23.776169,7.0,11.0,12.0,1976,0.012339,...,35,5776,1,2078.0,2,4,0,150226.531644,155748.415754,150226.531644


In [334]:
# Keep mean_price where it is present; otherwise fall back to the auxiliary
# "mean" column.
has_group_mean = df["mean_price"].notna()
df["mean_price"] = df["mean_price"].where(has_group_mean, df["mean"])

In [335]:
# The auxiliary column has served its purpose as a fallback; remove it.
df = df.drop(columns=["mean"])

In [336]:
# count by district feature
#
# "Count" is the share of all rows in df that belong to each district
# (value_counts is called with normalize=True, so it is a fraction, not a
# raw count).
# rename_axis(...).reset_index(name=...) names both output columns
# explicitly. Renaming the default "index" / "DistrictId" headers instead
# breaks on pandas >= 2.0, where value_counts labels its result "proportion"
# and reset_index already yields a "DistrictId" column.
count_rooms = (
    df["DistrictId"]
    .value_counts(normalize=True)
    .rename_axis("DistrictId")
    .reset_index(name="Count")
)
df = df.merge(count_rooms, on=["DistrictId"], how="left")

In [337]:
# Put the identifiers and the two engineered features first and the target
# ("Price") last; the modelling cells treat the last column as the target.
# Columns are selected by name rather than by position, so re-running this
# cell leaves the frame unchanged. The earlier positional slice
# (df.columns[2:-2]) duplicated mean_price/Count and dropped Shops_2/Price
# when the cell was executed a second time.
lead_cols = ["Id", "DistrictId", "mean_price", "Count"]
target_col = "Price"
feature_cols = [c for c in df.columns if c not in lead_cols and c != target_col]
df = df[lead_cols + feature_cols + [target_col]]

In [338]:
# Benchmark the three baseline models on the leading 10000 rows, holding out
# 20% of them for scoring.
test_learning(data=df.iloc[:10000], random=42, percent=0.2)

LinearRegression ---0.7006053596673205
RandomForestRegressor ---0.745833121068981
Support Vector Regression ---0.7043282334802944


In [339]:
# Preview the final column order: Price is the last column.
df.head()

Unnamed: 0,Id,DistrictId,mean_price,Count,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,...,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
0,14038,35,213160.076297,0.0024,2.0,47.981561,29.442751,6.0,7.0,9.0,...,0,0,33,7976,5,900.0,0,11,0,184966.93073
1,15053,41,303643.563728,0.008733,3.0,65.68364,40.049543,8.0,7.0,9.0,...,0,0,46,10309,1,240.0,1,16,0,300009.450063
2,4765,53,259938.729121,0.017667,2.0,44.947953,29.197612,0.0,8.0,12.0,...,0,0,34,7759,0,229.0,1,3,0,220925.908524
3,5809,58,172368.882269,0.018,2.0,53.352981,52.731512,9.0,8.0,17.0,...,0,0,23,5735,3,1084.0,0,5,0,175616.227217
4,10783,99,155748.415754,0.009267,1.0,39.649192,23.776169,7.0,11.0,12.0,...,0,0,35,5776,1,2078.0,2,4,0,150226.531644


# LEARNING

In [340]:
# Fit the forest on the leading 10000 rows (every column except the last one
# is a feature, the last one is the target), then write its predictions into
# the last column of the remaining rows.
lr = RandomForestRegressor(n_estimators=1000, max_depth=15, n_jobs=-1, random_state=42)
labelled_rows = df.iloc[:10000]
lr.fit(labelled_rows.iloc[:, :-1], labelled_rows.iloc[:, -1:].values[:, 0])
df.iloc[10000:, -1:] = lr.predict(df.iloc[10000:, :-1])

In [341]:
# Save Id and predicted Price for the rows labelled 10000 and up, without
# writing the frame index.
submission = df.loc[10000:, ["Id", "Price"]]
submission.to_csv("SCuznetov_predictions.csv", index=False)