In [None]:
!pip install category_encoders
!pip install xgboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import ftplib
import io
import timeit

import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb

from category_encoders import TargetEncoder
from google.colab import files
from lightgbm import LGBMRegressor
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, StackingRegressor, VotingRegressor
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LassoLarsCV, LinearRegression, ElasticNet#, SGDRegressor,LogisticRegression, LogisticRegressionCV,
from sklearn.model_selection import train_test_split#, RandomizedSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, RobustScaler
from sklearn.tree import DecisionTreeRegressor

In [None]:
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_colwidth', 80)

# Raw-content URLs have the form
#   https://raw.githubusercontent.com/<owner>/<repo>/<branch>/<path>
# The '/blob/' segment only exists in github.com page URLs; on the raw host it
# is read as a branch name and the request fails.
DATA_URL = 'https://raw.githubusercontent.com/Michael-Bokov/Sber_Data_Science/main/'

# The first CSV column is used as the row index of both frames.
raw_df_train = pd.read_csv(DATA_URL + 'train.csv', index_col=0)
raw_df_test = pd.read_csv(DATA_URL + 'test.csv', index_col=0)

# Work with categories

In [None]:
def is_cat(df, col):
  '''Tell whether column `col` of `df` has the object or the category dtype.'''
  column_dtype = df[col].dtype
  return any(column_dtype == kind for kind in ('object', 'category'))

def improve_cats(dataframe) -> pd.DataFrame:
  '''Return a copy of `dataframe` with narrower dtypes.

  int64 columns become int32, float64 columns become float32 and object
  columns become category. Columns of any other dtype (bool, category,
  already narrowed numerics, ...) are kept as they are, so the function can
  safely be applied to a frame that was converted before.

  The input frame itself is not modified.
  '''
  conversions = {'int64': 'int32', 'float64': 'float32', 'object': 'category'}
  df = dataframe.copy()
  for col in df.columns:
    new_dtype = conversions.get(str(df[col].dtype))
    # Unlisted dtypes are passed through instead of aborting with None.
    if new_dtype is not None:
      df[col] = df[col].astype(new_dtype)
  return df

# Show statistics

In [None]:
def adjusted_r2(yt,yp, colnum):
  '''Compute the adjusted R^2 score.

  adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1)

  yt     -- true target values; yt.shape[0] is taken as the sample count n
  yp     -- predicted target values
  colnum -- number of predictor columns p

  Returns NaN when n - p - 1 <= 0, because the adjustment is undefined there.
  '''
  n_samples = yt.shape[0]
  # Residual degrees of freedom; the "- 1" accounts for the intercept.
  dof = n_samples - colnum - 1
  if dof <= 0:
    return float('nan')
  return 1 - (1 - r2_score(yt, yp)) * (n_samples - 1) / dof

def print_scores(y_train, y_test, y_pred_train, y_pred_test, colnum):
  '''Print and return a text report with adjusted R^2 and RMSE for train and validation data.

  The RMSE is computed between np.log of the predictions and np.log of the
  true values.
  NOTE(review): work_df already log-transforms the target, so within this
  notebook the reported RMSE is taken on log-of-log values -- confirm that
  this is intended.
  '''
  def log_rmse(predicted, actual):
    return np.sqrt(mean_squared_error(np.log(predicted), np.log(actual)))

  train_r2 = adjusted_r2(y_train, y_pred_train, colnum)
  test_r2 = adjusted_r2(y_test, y_pred_test, colnum)
  train_rmse = log_rmse(y_pred_train, y_train)
  test_rmse = log_rmse(y_pred_test, y_test)

  text = f'train r2: {train_r2}\ntrain rmse: {train_rmse}\n\ntest r2: {test_r2}\ntest rmse: {test_rmse}\n'
  print(text)
  return text

# Split the dataframe into X and y

In [None]:
def split_data(df, target):
  '''Separate `target` from the feature columns and split both 70/30.

  Returns X_train, X_test, y_train, y_test as produced by train_test_split
  with test_size=0.3 and a fixed random_state of 42.
  '''
  features = df.drop(columns=target)
  labels = df[target]
  return train_test_split(features, labels, random_state=42, test_size=0.3)

# Work with the model

In [None]:
def train_model(X_train, y_train):
  '''Fit a scaled stacking ensemble in two passes and return the fitted pipeline.

  The pipeline is RobustScaler followed by a StackingRegressor (linear
  regression, LassoLarsCV, decision tree, LightGBM and ElasticNet base
  learners, random-forest final estimator).

  Pass 1 fits on `y_train`. Pass 2 refits the same pipeline on
  exp((log(y_train) + log(prediction)) / 2), i.e. the geometric mean of the
  original target and the first-pass predictions on the training data.
  '''
  base_learners = [
      ('LinReg', LinearRegression()),
      ('Lasso', LassoLarsCV(max_iter=15, eps=0.01)),
      ('Tree', DecisionTreeRegressor(max_depth=5)),
      ('LGBR', LGBMRegressor(max_depth=5, learning_rate=0.05)),
      ('Elastic', ElasticNet(alpha=0.0005, random_state=42)),
  ]
  stack = StackingRegressor(estimators=base_learners,
                            final_estimator=RandomForestRegressor(max_depth=6))
  pipe = make_pipeline(RobustScaler(), stack)

  # Pass 1: plain fit on the supplied target.
  pipe.fit(X_train, y_train)

  # Pass 2: refit on the target blended with the in-sample predictions.
  first_pass = pipe.predict(X_train)
  blended_target = np.exp((np.log(y_train) + np.log(first_pass)) / 2)
  pipe.fit(X_train, blended_target)
  return pipe

In [None]:
def train_model2(X_train, y_train):
  '''Fit a weighted blend of four scaled regressors in two passes and return it.

  The blend is a VotingRegressor over four pipelines, each starting with a
  RobustScaler:
    pipe1 -- XGBRegressor                      (weight 0.2)
    pipe2 -- LassoLarsCV                       (weight 0.1)
    pipe3 -- DecisionTreeRegressor             (weight 0.1)
    pipe4 -- StackingRegressor of five learners
             with a random-forest final step   (weight 0.6)

  Pass 1 fits on `y_train`. Pass 2 refits the blend on
  exp((log(y_train) + log(prediction)) / 2), i.e. the geometric mean of the
  original target and the first-pass predictions on the training data.
  '''
  pipe1 = make_pipeline(RobustScaler(), xgb.XGBRegressor(colsample_bytree=0.4,
                             gamma=0.045,
                             learning_rate=0.05,
                             max_depth=10,
                             min_child_weight=1.5,
                             n_estimators=300,
                             reg_alpha=0.65,
                             reg_lambda=0.45,
                             subsample=0.95))
  pipe2 = make_pipeline(RobustScaler(), LassoLarsCV(max_iter=15,eps=0.01))
  pipe3 = make_pipeline(RobustScaler(), DecisionTreeRegressor(max_depth=5))
  # Base learners of the stacking pipeline (pipe4).
  estimators = [('LinReg',LinearRegression()), ('Lasso',LassoLarsCV(max_iter=15,eps=0.01)),
                ('Tree',DecisionTreeRegressor(max_depth=5)), ('LGBR', LGBMRegressor(max_depth = 5, learning_rate = 0.05)),
                ('Elastic',ElasticNet(alpha=0.0005,random_state=42))]
  pipe = make_pipeline(RobustScaler(), StackingRegressor(estimators=estimators, final_estimator = RandomForestRegressor(max_depth=6)))

  blend = VotingRegressor(estimators=[('pipe1', pipe1), ('pipe2', pipe2), ('pipe3', pipe3), ('pipe4',pipe)], weights=[0.2, 0.1, 0.1, 0.6])

  # Pass 1: plain fit on the supplied target.
  blend.fit(X_train, y_train)
  # Pass 2: refit on the target blended with the in-sample predictions.
  # NOTE(review): np.log requires positive targets and predictions -- confirm.
  y3_train = np.exp((np.log(y_train) + np.log(blend.predict(X_train))) / 2)
  blend.fit(X_train, y3_train)

  return blend


In [None]:
def split_run_test(df, target = 'SalePrice'):
  '''Split `df`, train the blended model and report its scores.

  Returns the fitted model and the score report text produced by print_scores.
  '''
  X_train, X_test, y_train, y_test = split_data(df, target)
  model = train_model2(X_train, y_train)

  predictions = {'train': model.predict(X_train), 'test': model.predict(X_test)}
  n_features = X_train.shape[1]
  report = print_scores(y_train, y_test, predictions['train'], predictions['test'], n_features)
  return model, report

In [None]:
def get_permutations(df):
  '''Rank the feature columns of `df` by permutation importance.

  A model is trained on the training split of `df` (target 'SalePrice') and
  the importance is measured on that same training split with 10 repeats.
  The (column, mean importance) pairs are printed, and the column names are
  returned ordered from most to least important.
  '''
  X_train, _X_test, y_train, _y_test = split_data(df, 'SalePrice')
  model = train_model2(X_train, y_train)
  result = permutation_importance(model, X_train, y_train, n_repeats=10, random_state=42)

  scores = {name: score for name, score in zip(X_train.columns, result.importances_mean)}
  ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
  print(ranked)
  # Optional diagnostics, currently disabled:
  #get_shap(X_train, X_test, model)
  #grid(X_train, y_train)
  return [name for name, _score in ranked]

# Disabled helpers, kept as a bare string literal so they are never defined.
# As the last expression of the cell, the string is echoed as the cell output.
# NOTE(review): the code references `shap` and `GridSearchCV`, neither of which
# appears in the import cell -- add the imports before re-enabling.
'''def get_shap(X_train, X_test, model):
  explainer = shap.Explainer(model.predict, X_train)
  shap_values = explainer(X_test)
  shap.plots.waterfall(shap_values[0])

def grid(X_train, y_train):
  estimators = [('LinReg',LinearRegression()),
                ('Lasso',LassoLarsCV(max_iter=15,eps=0.01)),
                ('Tree',DecisionTreeRegressor(max_depth=5)),
                ('LGBR', LGBMRegressor(max_depth=5, learning_rate=0.05)),
                ('Elastic',ElasticNet(alpha=0.0005,random_state=42))]

  pipe = make_pipeline(RobustScaler(),
                      StackingRegressor(estimators=estimators,
                                        final_estimator=RandomForestRegressor(max_depth=6)))

  param_grid = {
      'stackingregressor__final_estimator__n_estimators': [10, 50, 100],
      'stackingregressor__final_estimator__max_depth': [3, 5, 7],
      'stackingregressor__final_estimator__min_samples_split': [2, 4, 8],
      'stackingregressor__Elastic__l1_ratio': [0.1, 0.5, 0.9],
  }

  grid = GridSearchCV(pipe, param_grid=param_grid, cv=5)
  grid.fit(X_train, y_train)
  print('grid best params: ', grid.best_params_)
  print('grid best score: ', grid.best_score_)'''

"def get_shap(X_train, X_test, model):\n  explainer = shap.Explainer(model.predict, X_train)\n  shap_values = explainer(X_test)\n  shap.plots.waterfall(shap_values[0])\n\ndef grid(X_train, y_train):\n  estimators = [('LinReg',LinearRegression()), \n                ('Lasso',LassoLarsCV(max_iter=15,eps=0.01)), \n                ('Tree',DecisionTreeRegressor(max_depth=5)), \n                ('LGBR', LGBMRegressor(max_depth=5, learning_rate=0.05)),\n                ('Elastic',ElasticNet(alpha=0.0005,random_state=42))]\n                \n  pipe = make_pipeline(RobustScaler(), \n                      StackingRegressor(estimators=estimators, \n                                        final_estimator=RandomForestRegressor(max_depth=6)))\n\n  param_grid = {\n      'stackingregressor__final_estimator__n_estimators': [10, 50, 100],\n      'stackingregressor__final_estimator__max_depth': [3, 5, 7],\n      'stackingregressor__final_estimator__min_samples_split': [2, 4, 8],\n      'stackingregressor_

# Work with columns

In [None]:
def create_cols(df):
  '''Add engineered feature columns to `df` in place and return the same frame.

  New columns:
    TotalSF      -- 1stFlrSF + 2ndFlrSF + TotalBsmtSF + LotArea
    PorchSF      -- OpenPorchSF + EnclosedPorch + 3SsnPorch + ScreenPorch
    OutsideArea  -- LotArea - GrLivArea - GarageArea
    MonthSold    -- YrSold * 12 + MoSold
    Has2Floors, Has1Floors, HasPorch, HasWood, HasFireplace
                 -- 1 where the source column is > 0, otherwise 0
    Bath         -- BsmtFullBath + 0.5 * BsmtHalfBath + FullBath + 0.5 * HalfBath,
                    where non-positive or missing counts contribute 0
  '''
  df['TotalSF'] = df['1stFlrSF'] + df['2ndFlrSF'] + df['TotalBsmtSF'] + df['LotArea']
  df['PorchSF'] = df['OpenPorchSF'] + df['EnclosedPorch'] + df['3SsnPorch'] + df['ScreenPorch']
  df['OutsideArea'] = df['LotArea'] - df['GrLivArea'] - df['GarageArea']
  df['MonthSold'] = df['YrSold']*12 + df['MoSold']

  # Presence flags: flag column -> source column (insertion order is kept).
  flag_sources = {
      'Has2Floors': '2ndFlrSF',
      'Has1Floors': '1stFlrSF',
      'HasPorch': 'PorchSF',
      'HasWood': 'WoodDeckSF',
      'HasFireplace': 'Fireplaces',
  }
  for flag, source in flag_sources.items():
    # NaN > 0 evaluates to False, so missing values give 0.
    df[flag] = (df[source] > 0).astype('int64')

  # Bathroom total: half baths are weighted by 0.5.
  bath_weights = (('BsmtFullBath', 1.0), ('BsmtHalfBath', 0.5),
                  ('FullBath', 1.0), ('HalfBath', 0.5))
  bath = 0.0
  for source, weight in bath_weights:
    counts = df[source].astype('float64')
    # Keep positive counts; everything else (including NaN) becomes 0.
    bath = bath + counts.where(counts > 0, 0.0) * weight
  df['Bath'] = bath
  return df

def log_cols(df, log_cols):
  '''Shift every non-categorical column of `df` by +1, then replace the columns
  named in `log_cols` with their natural logarithm. Works in place and returns `df`.'''
  numeric_names = [name for name in df.columns if not is_cat(df, name)]
  for name in numeric_names:
    df[name] = df[name].add(1)
  for name in log_cols:
    df[name] = np.log(df[name])
  return df

def plotme(df, cols):
  '''Scatter-plot SalePrice (y axis) against each column in `cols`, skipping SalePrice itself.

  NOTE(review): no figure is created or shown between the calls, so the plots
  presumably accumulate on the current axes -- confirm whether one figure per
  column was intended.
  '''
  for name in (candidate for candidate in cols if candidate != 'SalePrice'):
    sns.scatterplot(x = df[name], y = df['SalePrice'])

def categorize_cols(df):
  """Fill the missing values of `df` in place and return the same frame.

  Only columns that contain at least one NaN are touched:
    * object / category columns -- NaN becomes the label 'MISSING'
      (for category columns, unused categories are removed afterwards);
    * numeric columns named in the zero-fill set below -- NaN becomes 0;
    * every other numeric column -- NaN becomes the column mean.
  """
  zero_fill_cols = {'GarageArea', 'KitchenAbvGr', 'TotRmsAbvGrd', 'LotArea',
                    'TotalBsmtSF', 'BedroomAbvGr', 'Fireplaces', 'LotFrontage',
                    'WoodDeckSF', 'MasVnrArea', '2ndFlrSF',
                    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'LowQualFinSF', '1stFlrSF'}
  for col in df.columns:
    if not df[col].isna().any():
      continue
    if is_cat(df, col):
      if isinstance(df[col].dtype, pd.CategoricalDtype):
        # add_categories raises when the label is already a category.
        if 'MISSING' not in df[col].cat.categories:
          df[col] = df[col].cat.add_categories(['MISSING'])
        df[col] = df[col].fillna('MISSING')
        df[col] = df[col].cat.remove_unused_categories()
      else:
        # Object columns have no .cat accessor; fill them directly.
        df[col] = df[col].fillna('MISSING')
    elif col in zero_fill_cols:
      df[col] = df[col].fillna(0)
    else:
      df[col] = df[col].fillna(df[col].mean())
  return df

# Disabled label-encoding helper, kept as a bare string literal so it is never defined.
"""def encode_cols(df, cols):
  '''Use Label encoder'''
  for col in cols:
    encoder = LabelEncoder()
    df[col+'_e'] = encoder.fit_transform(df[col])
  return df"""

def drop_categories(df):
  '''Return `df` without the columns that is_cat reports as object/category.'''
  categorical = list(filter(lambda name: is_cat(df, name), df.columns))
  return df.drop(columns = categorical)

def iqr(df, columns, mult=3):
  '''Return a copy of `df` without the rows that exceed an upper bound in any of `columns`.

  For each column, in order, the bound is (median + (Q3 - Q1)) * mult,
  computed on the rows that survived the previous columns. Rows whose value
  is NaN fail the comparison and are removed as well.
  NOTE(review): `mult` scales the median too, not only the interquartile
  range -- confirm that median + mult * IQR was not the intended fence.
  '''
  trimmed = df.copy()
  for name in columns:
    stats = trimmed[name].describe()
    median = stats['50%']
    spread = stats['75%'] - stats['25%']
    ceiling = (median + spread) * mult
    keep = trimmed[name] <= ceiling
    trimmed = trimmed[keep]
  return trimmed

def work_df(df, to_log = ['LotFrontage', 'LotArea', 'GrLivArea', 'GarageArea'], target =[], to_drop = []):
  '''Run the whole preprocessing chain on a copy of `df` and return the result.

  Steps, in order: narrow the dtypes, add the engineered columns, drop the
  `to_drop` columns, fill missing values, floor OutsideArea at 0, then add 1
  to the numeric columns and log-transform the `target` + `to_log` columns.
  '''
  def floor_at_zero(value):
    return value if value > 0 else 0

  prepared = improve_cats(df.copy())
  prepared = create_cols(prepared).drop(columns = to_drop)
  prepared = categorize_cols(prepared)

  # Negative (or missing) outside areas are replaced by 0 before the log step.
  prepared['OutsideArea'] = prepared['OutsideArea'].apply(floor_at_zero)

  return log_cols(prepared, target+to_log)

# Create the column-list variables

In [None]:
# Columns that are log-transformed by work_df (after the +1 shift).
cols_to_log = [
    'LotFrontage', 'LotArea', 'GrLivArea', 'TotalSF', 'OutsideArea',
    'MonthSold', 'TotalBsmtSF', 'YrSold', 'YearBuilt', 'YearRemodAdd',
    'GarageYrBlt', '2ndFlrSF', 'GarageArea', 'WoodDeckSF', 'BsmtFinSF1',
    'BsmtFinSF2', 'BsmtUnfSF', 'LowQualFinSF', '1stFlrSF',
]

# Columns whose high values are trimmed by iqr.
cols_to_iqr = [
    'SalePrice', 'LotFrontage', 'LotArea', 'MasVnrArea', 'TotalBsmtSF',
    'GrLivArea', 'BedroomAbvGr', 'KitchenAbvGr', 'Fireplaces',
    'GarageArea', 'PorchSF', 'OutsideArea', 'TotalSF', '2ndFlrSF', '1stFlrSF',
]

# Raw columns removed after the engineered columns have been built from them.
cols_to_drop = [
    'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
    'PoolArea', 'MiscVal', 'GarageCars', 'BsmtFullBath', 'BsmtHalfBath',
    'FullBath', 'HalfBath',
]

In [None]:
# Run the same preprocessing on both frames; only the training call passes the
# SalePrice target, which is log-transformed together with cols_to_log.
df = work_df(raw_df_train, to_log = cols_to_log, to_drop = cols_to_drop, target = ['SalePrice'])
df_test = work_df(raw_df_test, to_log = cols_to_log, to_drop = cols_to_drop)

# Rows with high values are removed from the training frame only.
df = iqr(df, cols_to_iqr)

# Target-encode each listed column into '<col>_te': the encoder is fitted on the
# training frame and reused to transform the test frame.
# NOTE(review): the encoder is fitted on every training row, including the rows
# split_data later holds out for validation -- confirm this is acceptable.
cols = ['Neighborhood']#, 'LotShape']#, 'FireplaceQu']
for col in cols:
  encoder = TargetEncoder(cols = col)
  df[col+'_te'] = encoder.fit_transform(df[col], df['SalePrice'])
  df_test[col+'_te'] = encoder.transform(df_test[col])

# The remaining object/category columns are dropped from both frames.
df = drop_categories(df)
df_test = drop_categories(df_test)

# Get the best columns

In [None]:
#to_drop = []#'Modern', 'HasBsmt', 'HasGarage', 'HasPool','HasVnr']
# Work on copies so that the preprocessed frames stay untouched.
df1 = df.copy()           #.drop(columns = to_drop)
df_test1 = df_test.copy() #.drop(columns = to_drop)
# Feature names ordered from most to least important (permutation importance).
columns = get_permutations(df1)

[('MonthSold', 0.2559863085101183), ('MoSold', 0.23088428628186336), ('YrSold', 0.2102053476054362), ('OverallQual', 0.17855402935447007), ('GrLivArea', 0.13369949603953984), ('Neighborhood_te', 0.0816968371576701), ('Has2Floors', 0.042743246712187065), ('TotalSF', 0.041153367508094806), ('2ndFlrSF', 0.039778165011898695), ('YearBuilt', 0.02652715509832848), ('OverallCond', 0.025114403839854416), ('Bath', 0.02389970850142048), ('1stFlrSF', 0.0200721810723009), ('BsmtFinSF1', 0.012399168041773479), ('TotalBsmtSF', 0.011893362412140052), ('GarageArea', 0.01162788615021696), ('LotArea', 0.010280523048528223), ('YearRemodAdd', 0.009124523526379213), ('WoodDeckSF', 0.005614155837502155), ('PorchSF', 0.004838893914205833), ('Fireplaces', 0.003910936393791753), ('GarageYrBlt', 0.0025357128390656293), ('MSSubClass', 0.001980483197334404), ('BedroomAbvGr', 0.001625011773261542), ('OutsideArea', 0.0015274198232802737), ('HasWood', 0.001153836437764022), ('KitchenAbvGr', 0.0009467576900471398), (

In [None]:
# Target first, then the ranked feature columns; train the model and print its scores.
cols = ['SalePrice']+columns
model, txt = split_run_test(df1[cols])

train r2: 0.9201655824226073
train rmse: 0.008856756000216827

test r2: 0.8852497170287503
test rmse: 0.010478333335842714



In [None]:
# Predict the (log-transformed) target for the test frame from the ranked feature columns.
y_pred_test = model.predict(df_test[columns])

In [None]:

# Pseudo-labelling: attach the predicted target to the test frame and append a
# reproducible 60% sample of it to the training frame.
df_test_result = df_test.copy()
df_test_result['SalePrice'] = y_pred_test
new_df = pd.concat([df1, df_test_result.sample(frac=0.6, random_state=42)])
new_df = new_df.dropna()
# reset_index returns a new frame; the result must be assigned to take effect.
new_df = new_df.reset_index(drop=True)
# Re-rank the features on the augmented frame.
columns = get_permutations(new_df)




[('OverallQual', 0.21633836129820394), ('MonthSold', 0.1802484789017727), ('YrSold', 0.15529974524957618), ('MoSold', 0.15275694648356444), ('GrLivArea', 0.09752758306380607), ('Neighborhood_te', 0.06139514137178821), ('Bath', 0.03186661237343259), ('1stFlrSF', 0.025320247987242184), ('TotalSF', 0.022142236283274976), ('2ndFlrSF', 0.019286759690202927), ('Has2Floors', 0.018583098079942562), ('YearBuilt', 0.017806293419251475), ('OverallCond', 0.015841797850137885), ('TotalBsmtSF', 0.011485639396208946), ('GarageArea', 0.011443266486685765), ('YearRemodAdd', 0.00967645738591626), ('BsmtFinSF1', 0.008357597623870849), ('PorchSF', 0.002710187819691778), ('WoodDeckSF', 0.002709309013777317), ('Fireplaces', 0.002490679527070383), ('TotRmsAbvGrd', 0.0016940670890067343), ('BedroomAbvGr', 0.0013833940152164436), ('MSSubClass', 0.0011952509256802247), ('HasFireplace', 0.0011559039031330044), ('BsmtUnfSF', 0.0010451099946064968), ('KitchenAbvGr', 0.000744599438558402), ('LotArea', 0.00074135262

In [None]:
# Retrain on the augmented frame with the re-ranked columns, then predict the test frame again.
model, txt = split_run_test(new_df[['SalePrice']+columns])
y_pred_test = model.predict(df_test[columns])


train r2: 0.9457550549961163
train rmse: 0.0072676115466568155

test r2: 0.9214724849541059
test rmse: 0.008654206641508967



In [None]:
# Sanity check: (rows, columns) of the preprocessed test frame.
df_test.shape

(1459, 36)

In [None]:
# Hard-coded Id range; it must contain exactly one id per test prediction.
ids = np.arange(1461, 2920)
# work_df adds 1 to every numeric column (SalePrice included) before taking the
# logarithm, so the model predicts log(SalePrice + 1). The inverse transform is
# therefore exp(y) - 1 (np.expm1), not a plain exponent.
my_result = pd.DataFrame({'Id': ids, 'SalePrice': np.expm1(y_pred_test)})
my_result

Unnamed: 0,Id,SalePrice
0,1461,122360.735480
1,1462,155519.882292
2,1463,175415.946335
3,1464,183187.224109
4,1465,194958.470326
...,...,...
1454,2915,85673.636691
1455,2916,91683.421150
1456,2917,162971.177882
1457,2918,121881.759491


In [None]:
# Write the submission without the row index and hand it to the google.colab download helper.
my_result.to_csv('submission.csv', index=False)
files.download('submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Work with FTP

In [None]:
def connect(address='0.0.0.0', name = 'anonymous', pas = ''):
  '''Open an FTP connection to `address`, log in and return the ftplib.FTP object.

  address -- host to connect to
  name    -- user name for the login
  pas     -- password for the login

  If the login fails, the connection is closed and the error is re-raised.
  The caller is responsible for closing a successfully returned connection.
  '''
  ftp = ftplib.FTP(address)
  try:
    ftp.login(name, pas)
  except Exception:
    # Do not leak the open connection when the login is rejected.
    ftp.close()
    raise
  return ftp

def ret(ftp, filename = 'test.csv'):
  '''Download `filename` over `ftp` into memory and return it parsed as a CSV DataFrame.'''
  buffer = io.BytesIO()
  ftp.retrbinary('RETR '+filename, buffer.write)
  # Rewind so that read_csv starts at the beginning of the downloaded bytes.
  buffer.seek(0)
  return pd.read_csv(buffer)

def write(df, ftp, filename = 'test.csv'):
  '''Upload `df` as CSV (without the index) to `filename` over `ftp` and end the session.

  On success the session is ended with ftp.quit(). If serialising or
  uploading fails, the connection is closed and the error is re-raised.
  In both cases `ftp` cannot be reused afterwards.
  '''
  try:
    # Serialise to text first and encode explicitly, so the upload does not
    # depend on to_csv accepting a binary buffer.
    payload = df.to_csv(index = False).encode('utf-8')
    ftp.storbinary('STOR '+filename, io.BytesIO(payload))
  except Exception:
    ftp.close()
    raise
  ftp.quit()
