In [1]:
import numpy as np
import pandas as pd
import timeit

%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

In [2]:
# Load the training set, keep only non-object / non-categorical columns,
# and discard the 'id' column.
df_raw = (
    pd.read_csv('data/sberbank-russian-housing-market/train.csv')
    .select_dtypes(exclude=['category', 'object'])
    .drop(columns=['id'])
)

In [3]:
# Pairwise correlation matrix of every remaining column (Pearson is the
# pandas default; spelled out here for clarity).
df_corr = df_raw.corr(method='pearson')

In [4]:
# Greedy pruning of collinear predictors: walk the columns in order and, for
# every column that is still kept, mark each other column whose absolute
# correlation with it exceeds 0.80 for removal. The target 'price_doc' is
# never compared and never removed.
remove_cols = set()
predictor_cols = [name for name in df_raw.columns if name != 'price_doc']

for col1 in predictor_cols:
    if col1 in remove_cols:
        continue

    for col2 in predictor_cols:
        if col2 == col1 or col2 in remove_cols:
            continue

        # Row col2 / column col1 of the correlation matrix.
        if abs(df_corr.at[col2, col1]) > 0.80:
            remove_cols.add(col2)

df = df_raw.drop(columns=list(remove_cols))

In [None]:
from sklearn.impute import KNNImputer

# Fill missing values with a KNN imputer (n_neighbors=2). The frame still
# contains 'price_doc' at this point, so the target takes part in the fit.
imputer = KNNImputer(n_neighbors=2)
imputed_values = imputer.fit_transform(df)

# fit_transform returns a bare array; wrap it back into a DataFrame that
# carries the original column labels.
df = pd.DataFrame(imputed_values, columns=df.columns)

In [None]:
# Keep a handle on the target before the frame is standardized below, so the
# unscaled values can be re-attached afterwards.
prices = df.loc[:, 'price_doc']

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize every column of the frame (this includes 'price_doc'; the
# unscaled target was saved in `prices` beforehand).
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df)
df = pd.DataFrame(scaled_values, columns=df.columns)

In [None]:
NUM_FEATURES = 50

# Rank predictors by absolute correlation with the target. The target is
# removed by label rather than by slicing off the first sorted entry: a
# positional [1:] silently assumes 'price_doc' sorts first, which is not
# guaranteed when another column ties at |corr| == 1.
highest_corrs = (
    df_corr['price_doc']
    .drop('price_doc')
    .abs()
    .sort_values(ascending=False)
)

# Only columns that survived the collinearity pruning are eligible. The
# membership set is built once instead of on every loop iteration.
available_cols = set(df.columns)
ranked_feats = [feat for feat in highest_corrs.index if feat in available_cols]

# 'price_doc' stays first in the list, followed by at most NUM_FEATURES
# predictors in descending order of |correlation|.
features = ['price_doc'] + ranked_feats[:NUM_FEATURES]

In [None]:
# Restrict the frame to the target plus the selected predictors and preview
# the first rows.
df = df.loc[:, features]
df.head(5)

In [None]:
# Remove the standardized copy of the target. `columns=` replaces the
# positional axis argument (rejected by pandas >= 2.0), and re-binding the
# name avoids mutating a frame derived from a column selection in place.
df = df.drop(columns='price_doc')

In [None]:
# Append the unscaled target saved in `prices` as a new column.
df = pd.concat([df, prices], axis='columns')

In [None]:
# Predictors and square-root-transformed target.
X = df.drop(columns='price_doc')
y = np.sqrt(df['price_doc'])

# Ordered (non-shuffled) split: the first TRAIN_FRACTION of the rows train
# the model and the remainder is held out. The split point is derived from
# the actual row count rather than a hard-coded 30471, and the test slice
# starts exactly where the training slice ends so no row is dropped at the
# boundary.
TRAIN_FRACTION = 0.5  # an 80/20 split (0.8) was also tried
split_idx = int(len(X) * TRAIN_FRACTION)

X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

In [None]:
# Density estimate of price_doc on its original (untransformed) scale
df['price_doc'].plot(kind='density')

In [None]:
# Density estimate of the square-root-transformed target
y.plot(kind='density')

In [None]:
# Fit ordinary least squares on the training half and predict the held-out
# half, timing both steps together.
start_time = timeit.default_timer()
reg = LinearRegression().fit(X_train, y_train)
# Predict on exactly the columns the model was fitted on. X never contains a
# 'ones' column, so dropping it would raise a KeyError.
y_pred_lin = reg.predict(X_test)
elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)

In [None]:
# RMSE on the scale the model was trained on, i.e. sqrt(price_doc).
rmse = np.sqrt(mean_squared_error(y_test, y_pred_lin))

# RMSE on the original price scale. Squaring `rmse` would only give the MSE
# of the sqrt-scale residuals; to back-transform, both the targets and the
# predictions are squared first and the error is computed afterwards.
rmse_price = np.sqrt(mean_squared_error(np.square(y_test), np.square(y_pred_lin)))
rmse_price

In [None]:
# R^2 of the linear model on the held-out rows (target on the sqrt scale).
# X_test is passed as-is: it has the same columns the model was fitted on,
# and it has no 'ones' column to drop.
reg.score(X_test, y_test)