What do I want here? I want everything to be in functions and like "main" part, that will trigger all functions above. So you can simply turn on/off any function to look at the results and also you can make changes inside functions without influencing all the code b'z it is incide the function and kinda isolated.

# Import section

In [45]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer

from scipy.stats import skew
from scipy.special import boxcox1p

import xgboost
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import Lasso

# Functions section

In [73]:
def init_train_test(train_csv_path='train.csv', test_csv_path='test.csv'):
    df_train = pd.read_csv(train_csv_path)
    df_test = pd.read_csv(test_csv_path)
    return df_train, df_test

def transforming_train_by_hand(df_train, perform_target_transf, exclude_anomalies):
    # in this exact case we have two houses with really huge living area and they have relatively small sale price. We need to exclude them as they are clearly outliers
    if exclude_anomalies == True:
        df_train = df_train.drop(df_train[(df_train['GrLivArea']>4000) & (df_train['SalePrice']<300000)].index)
    if perform_target_transf == True:
        df_train['SalePrice'] = np.log1p(df_train["SalePrice"])
    return df_train

def get_combined_data(df_train, df_test, target_col_name):
    combined_data = pd.concat([df_train.drop(columns=target_col_name), df_test])
    return combined_data

def get_num_obj_col_names(df):
        numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
        num_colnames = df.select_dtypes(include=numerics).columns
        obj_colnames = [x for x in df.columns if x not in num_colnames]
        return num_colnames, obj_colnames

def handl_mis_vals(combined_data, misval_approach, misval_addit_values_dict):
    if misval_approach == 'kaggle_by_hand':
        # took it from here
        # https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard
        # starting from In [14]:
        combined_data["PoolQC"] = combined_data["PoolQC"].fillna("None")
        combined_data["MiscFeature"] = combined_data["MiscFeature"].fillna("None")
        combined_data["Alley"] = combined_data["Alley"].fillna("None")
        combined_data["Fence"] = combined_data["Fence"].fillna("None")
        combined_data["FireplaceQu"] = combined_data["FireplaceQu"].fillna("None")
        # interesting way to give missing values mean values of the Neighborhood
        combined_data["LotFrontage"] = combined_data.groupby("Neighborhood")["LotFrontage"].transform(lambda x: x.fillna(x.median()))
        for col in ('GarageType', 'GarageFinish', 'GarageQual', 'GarageCond'):
            combined_data[col] = combined_data[col].fillna('None')
        for col in ('GarageYrBlt', 'GarageArea', 'GarageCars'):
            combined_data[col] = combined_data[col].fillna(0)
        for col in ('BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath'):
            combined_data[col] = combined_data[col].fillna(0)
        for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
            combined_data[col] = combined_data[col].fillna('None')
        combined_data["MasVnrType"] = combined_data["MasVnrType"].fillna("None")
        combined_data["MasVnrArea"] = combined_data["MasVnrArea"].fillna(0)
        combined_data['MSZoning'] = combined_data['MSZoning'].fillna(combined_data['MSZoning'].mode()[0])
        combined_data = combined_data.drop(['Utilities'], axis=1)
        combined_data["Functional"] = combined_data["Functional"].fillna("Typ")
        combined_data['Electrical'] = combined_data['Electrical'].fillna(combined_data['Electrical'].mode()[0])
        combined_data['KitchenQual'] = combined_data['KitchenQual'].fillna(combined_data['KitchenQual'].mode()[0])
        combined_data['Exterior1st'] = combined_data['Exterior1st'].fillna(combined_data['Exterior1st'].mode()[0])
        combined_data['Exterior2nd'] = combined_data['Exterior2nd'].fillna(combined_data['Exterior2nd'].mode()[0])
        combined_data['SaleType'] = combined_data['SaleType'].fillna(combined_data['SaleType'].mode()[0])
        combined_data['MSSubClass'] = combined_data['MSSubClass'].fillna("None")

    elif misval_approach == 'threshold_elim':
        mis_prc_threshold = misval_addit_values_dict['mis_prc_threshold']

        mis_data_df = combined_data.isnull().sum(axis=0).sort_values(ascending=False).reset_index()
        mis_data_df.columns = ['col_name', 'cnt_missing_vals']
        mis_data_df['mis_perc'] = mis_data_df['cnt_missing_vals'] / combined_data.shape[0]

        # columns to keep due to threshold
        cols_to_keep = list(mis_data_df[mis_data_df['mis_perc'] < (mis_prc_threshold/100)]['col_name'])
        mis_data_threshold_df = mis_data_df[mis_data_df['col_name'].isin(cols_to_keep)]

        # columns to keep due to threshold were we can still find missing values 
        cols_to_keep_with_missvals = mis_data_threshold_df[mis_data_threshold_df['mis_perc'] != 0]
        combined_data = combined_data[cols_to_keep].copy()  
        
        num_colnames, obj_colnames = get_num_obj_col_names(combined_data)

        # obj_imputer = SimpleImputer(strategy='most_frequent')
        # num_imputer = SimpleImputer(strategy='median')
        num_imputer = SimpleImputer(strategy=misval_addit_values_dict['num_imp_strat'])
        obj_imputer = SimpleImputer(strategy=misval_addit_values_dict['obj_imp_strat'])


        combined_data_num_imp = pd.DataFrame(num_imputer.fit_transform(combined_data[num_colnames]), columns=combined_data[num_colnames].columns)
        combined_data_obj_imp = pd.DataFrame(obj_imputer.fit_transform(combined_data[obj_colnames]), columns=combined_data[obj_colnames].columns)

        combined_data = pd.concat([combined_data_num_imp, combined_data_obj_imp], axis=1)

    return combined_data

def data_add_transf(combined_data, add_transf_method):
    if add_transf_method == 'kaggle_by_hand':
        combined_data['MSSubClass'] = combined_data['MSSubClass'].apply(str)
        combined_data['OverallCond'] = combined_data['OverallCond'].astype(str)
        combined_data['YrSold'] = combined_data['YrSold'].astype(str)
        combined_data['MoSold'] = combined_data['MoSold'].astype(str)

        combined_data['TotalSF'] = combined_data['TotalBsmtSF'] + combined_data['1stFlrSF'] + combined_data['2ndFlrSF']

    return combined_data

def transform_skewed_feats(combined_data, skew_threshold):
    num_colnames, obj_colnames = get_num_obj_col_names(combined_data)
    # Check the skew of all numerical features
    skewed_feats = combined_data[num_colnames].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
    skewness = pd.DataFrame({'Skew' :skewed_feats})

    skewness = skewness[abs(skewness) > skew_threshold]
    print("There are {} skewed numerical features to Box Cox transform".format(skewness.shape[0]))

    skewed_features = skewness.index
    lam = 0.15
    for feat in [x for x in skewed_features if x != 'Id']:
        combined_data[feat] = boxcox1p(combined_data[feat], lam)

    return combined_data

def get_train_test_dfs_from_combined_data(df_train, target_col_name, combined_data):
    new_df_train = combined_data[:df_train.shape[0]].copy()
    new_df_test = combined_data[df_train.shape[0]:].copy()
    new_df_train[target_col_name] = df_train[target_col_name]

    return new_df_train, new_df_test

def elim_cor_values(df_train, cor_threshold, target_col_name, combined_data):
    num_colnames, obj_colnames = get_num_obj_col_names(combined_data)
    num_and_targ_colnames = list(num_colnames)
    num_and_targ_colnames.append(target_col_name)

    new_df_train, new_df_test = get_train_test_dfs_from_combined_data(df_train, target_col_name, combined_data)

    corr_matrix = new_df_train[num_and_targ_colnames].corr()

    corr_matrix = corr_matrix.abs()

    high_corr_var = np.where(corr_matrix > cor_threshold)
    high_corr_var = [(corr_matrix.columns[x], corr_matrix.columns[y]) for x,y in zip(*high_corr_var) if x!=y and x<y]

    print('There are', len(high_corr_var), 'highly corelated pairs of values')


    # fig, ax = plt.subplots(figsize=(20,13))         # Sample figsize in inches
    # sns.heatmap(corr_matrix)
    # plt.show()

    corr_with_target = new_df_train.corr().abs()[target_col_name]
    elements_to_exclude = []


    for corr_pair in high_corr_var:
        el_1 = corr_pair[0]
        el_2 = corr_pair[1]
        if el_1 in elements_to_exclude or el_2 in elements_to_exclude:
            continue
        else:
            if corr_with_target[el_1] > corr_with_target[el_2]:
                # print('el_1', el_1, 'el_2', el_2, 'corr_with_target[el_1]', corr_with_target[el_1], 'corr_with_target[el_2]', corr_with_target[el_2], 'удаляем ' + str(el_2))
                elements_to_exclude.append(el_2)
            else:
                # print('el_1', el_1, 'el_2', el_2, 'corr_with_target[el_1]', corr_with_target[el_1], 'corr_with_target[el_2]', corr_with_target[el_2], 'удаляем ' + str(el_1))
                elements_to_exclude.append(el_1)

    print(len(elements_to_exclude), 'values will be excluded')
    combined_data_return = combined_data.drop(columns=elements_to_exclude)
    return combined_data_return

def scale_combined_data(combined_data):
    num_colnames, obj_colnames = get_num_obj_col_names(combined_data)
    # print(num_colnames)
    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(combined_data[num_colnames])
    combined_data_scaled_num_colnames = pd.DataFrame(scaled_features, index=combined_data[num_colnames].index, columns=combined_data[num_colnames].columns)
    combined_data_1 = pd.concat([combined_data[obj_colnames], combined_data_scaled_num_colnames], axis = 0)
    return combined_data_1

def back_to_train_test(combined_data, df_train, id_field_col_name, target_col_name):
    comb_data_with_targ = pd.merge(combined_data, df_train[[id_field_col_name, target_col_name]], how='left', on=id_field_col_name)
    df_transf_train = comb_data_with_targ[comb_data_with_targ[target_col_name].notnull()].copy()
    df_transf_test = comb_data_with_targ[comb_data_with_targ[target_col_name].isnull()].copy().drop(columns=[target_col_name])
    return df_transf_train, df_transf_test








# Init section

In [74]:
target_col_name='SalePrice'
useles_col_names = ['Id']
id_field_col_name = 'Id'

perform_target_transf = True
exclude_anomalies = True
misval_addit_values_dict = {}
# misval_approach = 'kaggle_by_hand'
misval_approach = 'threshold_elim'

if misval_approach == 'threshold_elim':
    # columns with missing values ratio more than that threshold will be dropped
    misval_addit_values_dict['mis_prc_threshold'] = 10
    misval_addit_values_dict['num_imp_strat'] = 'median'
    misval_addit_values_dict['obj_imp_strat'] = 'most_frequent'

add_transf_method = 'kaggle_by_hand'
# add_transf_method = 'none'

skew_threshold = 0.75
cor_threshold = 0.6

# Main code section

## Data transformation section

In [75]:
df_train, df_test = init_train_test()
df_train = transforming_train_by_hand(df_train, perform_target_transf, exclude_anomalies)
combined_data = get_combined_data(df_train, df_test, target_col_name)
# after this step it could be different amt of cols in combined data
combined_data = handl_mis_vals(combined_data, misval_approach, misval_addit_values_dict)
combined_data = data_add_transf(combined_data, add_transf_method)
# let's remember which cols was num and which cols was object. 
# We will need this to perform some further calculations on num cols only.

combined_data = pd.get_dummies(combined_data)
combined_data = transform_skewed_feats(combined_data, skew_threshold)
combined_data = elim_cor_values(df_train, cor_threshold, target_col_name, combined_data)
num_colnames, obj_colnames = get_num_obj_col_names(combined_data)
combined_data = scale_combined_data(combined_data)
df_train_transf, df_test_transf = back_to_train_test(combined_data, df_train, id_field_col_name, target_col_name)

There are 33 skewed numerical features to Box Cox transform
There are 14 highly corelated pairs of values
11 values will be excluded
      MoSold_1.0  MoSold_10.0  MoSold_11.0  MoSold_12.0  MoSold_2.0  \
0            0.0          0.0          0.0          0.0         1.0   
1            0.0          0.0          0.0          0.0         0.0   
2            0.0          0.0          0.0          0.0         0.0   
3            0.0          0.0          0.0          0.0         1.0   
4            0.0          0.0          0.0          1.0         0.0   
...          ...          ...          ...          ...         ...   
5829         NaN          NaN          NaN          NaN         NaN   
5830         NaN          NaN          NaN          NaN         NaN   
5831         NaN          NaN          NaN          NaN         NaN   
5832         NaN          NaN          NaN          NaN         NaN   
5833         NaN          NaN          NaN          NaN         NaN   

      MoSold_3



In [72]:
df_test_transf

Unnamed: 0,MoSold_1.0,MoSold_10.0,MoSold_11.0,MoSold_12.0,MoSold_2.0,MoSold_3.0,MoSold_4.0,MoSold_5.0,MoSold_6.0,MoSold_7.0,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,LotArea,YearBuilt,YearRemodAdd,LowQualFinSF,TotalSF
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,,,,,,,,,,
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5829,,,,,,,,,,,...,-0.424069,-0.112352,-0.308318,-0.063869,-0.185835,-2.747168,-0.035834,-0.678305,-0.116024,-1.260545
5830,,,,,,,,,,,...,-0.424069,-0.112352,-0.308318,-0.063869,-0.185835,-2.781774,-0.035834,-0.678305,-0.116024,-1.260545
5831,,,,,,,,,,,...,-0.424069,-0.112352,-0.308318,-0.063869,-0.185835,1.673353,-0.365614,0.564501,-0.116024,0.004721
5832,,,,,,,,,,,...,-0.424069,-0.112352,-0.308318,-0.063869,4.868942,0.283530,0.684706,0.374198,-0.116024,-0.831894


## Feature selection section

In [None]:
# here we can use different feature selection methods
# first to come to my mind is lasso method (need to read again how to do it)
# second thing is to train some model and take top n most important features
# it is always better to have more methods, so I can find more later
col_names = 

## Model section

In [None]:
# here our goal is to build our model both neither overfitted nor underfitted
# we need to use cros validation, different models, hyper parameter optimization and other methods to find best solution to our case
