### Importing libraries

In [None]:
from __future__ import annotations
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

### Ingesting Data

In [None]:
# keep_default_na=False (with no na_values) stops pandas from turning the
# text "NA" into NaN, so both frames keep it as a plain string.
train = pd.read_csv(
    'datasets/train.csv',
    index_col='Id',
    keep_default_na=False,
)
test = pd.read_csv(
    'datasets/test.csv',
    index_col='Id',
    keep_default_na=False,
)

# From here on `train` holds the rows of both files stacked together.
train = pd.concat([train, test])

train.head()

### Changing Types

In [None]:
def _na_text_to_float(column):
    """Return ``column`` as float64, reading the text "NA" (any case) as 0."""
    if pd.api.types.is_numeric_dtype(column):
        # Already numeric: there is no "NA" text to replace and the ``.str``
        # accessor would raise, so only the cast is needed.
        return column.astype('float64')
    as_text = column.str.replace('NA', '0', case=False, regex=False)
    return as_text.astype('float64')


def clean_data(train):
    """Return a copy of ``train`` with its two area columns as float64.

    The CSVs are read with ``keep_default_na=False``, so a missing value can
    arrive as the literal text "NA" inside an otherwise numeric column.

    * ``LotFrontage``: "NA" is read as 0 and every 0 is then replaced by
      ``NaN``, so the value shows up as missing.
    * ``MasVnrArea``: "NA" becomes ``0.0`` and stays that way.

    All other columns keep their input dtype; no categorical conversion is
    done here. The frame passed in is left untouched.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame containing ``LotFrontage`` and ``MasVnrArea`` columns, either
        as text or already numeric.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns in the same order.
    """
    lot_frontage = _na_text_to_float(train['LotFrontage'])
    # 0 is only the stand-in written for "NA" above; expose it as missing.
    lot_frontage = lot_frontage.mask(lot_frontage == 0)

    mas_vnr_area = _na_text_to_float(train['MasVnrArea'])

    return train.assign(LotFrontage=lot_frontage, MasVnrArea=mas_vnr_area)

train_clean = clean_data(train.copy())
train_clean.head()

In [None]:
test_clean = clean_data(test.copy())
test_clean.head()

### Ingesting Description Data

In [None]:
# Read the free-text data dictionary as a two-column table, splitting every
# line on a tab or a colon. A regex separator is only supported by the python
# parser; naming the engine avoids pandas' fallback ParserWarning.
data_description = pd.read_csv(
    'datasets/data_description.txt',
    sep=r'[\t:]',
    engine='python',
    na_filter=False,
    header=None,
    names=['variable', 'description'],
    on_bad_lines='skip',      # lines with too many fields are dropped
    skip_blank_lines=False,   # blank lines stay in the table as rows
    skipinitialspace=False,   # whitespace after a delimiter is kept as-is
)
# NOTE(review): row label 373 is removed by a hard-coded position; the reason
# is not visible here -- confirm it still points at the intended line if the
# description file changes.
data_description.drop(axis='rows', index=373, inplace=True)


In [None]:
data_description 

In [None]:
# Row labels whose description is missing; they are used as section breaks.
breaks = data_description.index[data_description['description'].isna()].to_list()

# vars[0] is the variable in row 0; every further entry is the variable found
# in the row directly after a break.
# NOTE(review): `vars` shadows the builtin of the same name, but later cells
# read it under this name.
vars = [data_description.at[0, 'variable']]
vars.extend(data_description.at[brk + 1, 'variable'] for brk in breaks)

# Placeholder value; the real category is written in a later cell.
data_description.insert(1, 'category', 'see')

In [None]:
# Walk the breaks in order and stamp every stretch of rows (up to and
# including its break row) with the matching entry of `vars`.
start = 0
for pos, brk in enumerate(breaks):
    data_description.loc[start:brk, 'category'] = vars[pos]
    start = brk + 1
    if brk == 462:
        # NOTE(review): 462 is a hard-coded row label; everything from it on
        # gets the next entry of `vars` and the walk stops.
        data_description.loc[brk:, 'category'] = vars[pos + 1]
        break

# Rows still carrying the placeholder are assigned to "SaleCondition".
still_unlabelled = data_description['category'] == "see"
data_description.loc[still_unlabelled, 'category'] = "SaleCondition"

# The break rows themselves are no longer needed.
data_description.drop(breaks, axis=0, inplace=True)
data_description


In [None]:
# Rows whose variable equals their own category are removed.
desc = data_description.index[data_description['variable'] == data_description['category']]
data_description.drop(index=desc, inplace=True)


In [None]:
# Categories that have a row whose (whitespace-stripped) variable is 'NA'.
is_na_level = data_description['variable'].str.strip() == 'NA'
na = data_description.loc[is_na_level, 'category'].tolist()
na


In [None]:
# Columns of train_clean that do not appear in `na` (order is arbitrary).
n_na = list(set(train_clean.columns) - set(na))
n_na

## Find and Set Missing

In [None]:

# In the columns listed in n_na, the text 'NA' is replaced by a real missing
# value; each affected column name is printed.
for col in n_na:
    is_na_text = train_clean[col] == 'NA'
    if is_na_text.any():
        print(col)
        train_clean.loc[is_na_text, col] = None
    # else:




In [None]:
train_clean

## Missing Data

In [None]:
def check_missing(df=None):
    """Return the number of missing values per column, for columns that have any.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame to inspect. When omitted, the module-level ``train_clean`` is
        looked up at call time, so the result always reflects whatever
        ``train_clean`` currently refers to (a default of ``df=train_clean``
        would be frozen to the object that existed when the function was
        defined).

    Returns
    -------
    pandas.Series
        Missing-value counts indexed by column name; columns without missing
        values are left out.
    """
    if df is None:
        df = train_clean
    missing = df.isna().sum()
    return missing[missing > 0]

check_missing()

In [None]:
msno.matrix(train_clean)

In [None]:
msno.heatmap(train_clean)

# Fill Missing Data

### GarageYrBlt (filled from YearBuilt)

In [None]:
corr = train_clean[["GarageYrBlt", "YearBuilt"]].corr()
corr

In [None]:
# Missing garage years take the year the house was built. The result is
# assigned back to the column: calling fillna(..., inplace=True) on a column
# pulled out of the frame acts on a temporary object and leaves the frame
# unchanged under pandas Copy-on-Write.
train_clean["GarageYrBlt"] = train_clean["GarageYrBlt"].fillna(train_clean["YearBuilt"])
test_clean["GarageYrBlt"] = test_clean["GarageYrBlt"].fillna(test_clean["YearBuilt"])


## LotFrontage

In [None]:
check_missing()

In [None]:
sns.catplot(data=train_clean, x='LotFrontage',kind='strip', row='LotConfig', sharex=False,height=3,orient='portrait')

plt.show()


In [None]:
# Median LotFrontage (rounded) per LotConfig, computed only from rows where
# LotFrontage is known.
known_frontage = train_clean.loc[train_clean['LotFrontage'].notna()]
lot_front_dic = (
    known_frontage
    .groupby('LotConfig')['LotFrontage']
    .median()
    .round()
    .to_dict()
)

lot_front_dic

In [None]:
# missing_LF = train_clean[(train_clean.LotConfig == key) & (train_clean.LotFrontage.isna())]  

# Fill each missing LotFrontage with the median of its LotConfig group, in
# both frames.
for config, median_frontage in lot_front_dic.items():
    for frame in (train_clean, test_clean):
        needs_fill = (frame.LotConfig == config) & frame.LotFrontage.isna()
        frame.loc[needs_fill, 'LotFrontage'] = median_frontage

In [None]:
# Missing veneer types become the text 'None'. Assigned back to the column
# because an inplace fillna on an extracted column does not reach the frame
# under pandas Copy-on-Write.
train_clean['MasVnrType'] = train_clean['MasVnrType'].fillna('None')

In [None]:
# Forward-fill missing Electrical values from the previous row. ffill() is
# the replacement for the deprecated fillna(method='pad'); the result is
# assigned back because an inplace call on an extracted column does not reach
# the frame under pandas Copy-on-Write.
train_clean['Electrical'] = train_clean['Electrical'].ffill()

In [None]:
check_missing(train_clean)

# Convert Numeric Strings to Floats

In [None]:
# Object columns holding more than 10 all-digit strings are cast to float64;
# the name of every converted column is printed.
object_cols = train_clean.select_dtypes('object').columns
for col in object_cols:
    digit_count = train_clean[col].str.isnumeric().sum()
    if digit_count > 10:
        print(col)
        train_clean[col] = train_clean[col].astype('float64')

# Drop Missing

### Drop Columns Dominated by a certain value

In [None]:
# unbalanced_cols = []
# for col in train_clean.select_dtypes('object').columns:
#     value_counts = train_clean[col].value_counts(normalize=True)

#     max = value_counts.max()
    
#     print(value_counts)
#    # print(max)

#     if max > .70:
#         unbalanced_cols.append(col)


In [None]:
# train_clean.drop(unbalanced_cols, axis='columns',inplace=True)
# check_missing()

In [None]:
# Rows without a SalePrice get 0 as a placeholder so that the dropna below
# does not remove them; SalePrice == 0 is what the modelling cells use to pick
# the rows to predict. The fill is assigned back to the column because
# fillna(..., inplace=True) on an extracted column leaves the frame unchanged
# under pandas Copy-on-Write, in which case dropna would discard those rows.
train_clean['SalePrice'] = train_clean['SalePrice'].fillna(0)
# Drop every row that still has a missing value, keeping the same frame object.
train_clean.dropna(axis=0, inplace=True)
print(train_clean.shape)
check_missing()

# Feature Importance

## Correlation of Predictors with Target Variables

In [None]:
# def plot_correlation_heatmap(target_col: str or None):

#     corr_data: pd.DataFrame
    
#     if target_col is None:
corr_price = train_clean.corr(method='pearson', numeric_only=True)['SalePrice'].sort_values(ascending=False)
#     elif isinstance(target_col, str):
#         corr_data = train_clean.corr(method='pearson')

sns.heatmap(corr_price.to_frame())
plt.show()

# plot_correlation_heatmap('SalePrice')

## Correlation between Predictor Variables (Multicollinearity)

In [None]:
corr_df = train_clean.corr(method='pearson', numeric_only=True).sort_values(by='SalePrice')
fig = plt.figure(figsize=(10,12))
sns.heatmap(corr_df,cmap='BrBG', figure=fig )
plt.show()

In [None]:
corr_df.index


## Drop highly correlated independent variables

In [None]:
# corrTol = 0.65

# for col in corr_df:
#     if col in corr_df.keys():
#         thisCol = []
#         thisVars = []

#         for i in range(len(corr_df)):
#             if abs(corr_df[col][i]) == 1.0 and col != corr_df.keys()[i]:
#                 thisCorr = 0
#             else:
#                 thisCorr = ( 1 if abs(corr_df[col][i]) > corrTol else -1)
            
#             thisCol.append(thisCorr)
#             thisVars.append(corr_df.keys()[i])
        
#         mask = np.ones(len(thisCol), dtype=bool)

#         ctDelCol = 0

#         for n, j in enumerate(thisCol):
#             # is the correlation not equal to the max corr and greater than or equal to zero?
#             mask[n] = not (j != max(thisCol) and j>=0)

#             if j != max(thisCol) and j >= 0:
#                 corr_df.pop('%s' %thisVars[n])
#                 train_clean.pop('%s' %thisVars[n])
#                 ctDelCol += 1

#         corr_df = corr_df[mask]

In [None]:
# cols_del = []
# tol = .65

# for col, row in corr_df.iterrows():
#     # print(col)
#     # print(row)
#     # print(f"Current column {col}")
#     for col2, corr in row.items():
#         # print(f"Checking correlation with {col2}")
#         if abs(corr) > tol and col2 != col:
#             # print(f"Correlation of {corr} is greater than the tolerance of {tol}")
#             # print("Adding it to deleted columns")
#             corr_target = corr_df.loc[col, 'SalePrice']
#             corr_target2 = corr_df.loc[col2, 'SalePrice']
#             if corr_target > corr_target2:
#                 cols_del.append(col2)
#             else:
#                 cols_del.append(col)

# cols_del = list(set(cols_del))


In [None]:
# cols_del

In [None]:
# train_clean.drop(columns=cols_del, inplace=True)
# corr_df.drop(columns=cols_del, inplace=True)
# train_clean


In [None]:
# import pingouin
# cats = train_clean.select_dtypes('object').columns.to_list()

# frames = []
# for col in range(len(cats)):
#     frame = pingouin.welch_anova(data=train_clean, dv='SalePrice', between=cats[col]).round(3)
#     frames.append(frame)

In [None]:
# c = pd.concat(frames)
# c.loc[c["p-unc"] > 0.05]

In [None]:
# frames = []
# for col in range(len(cats)):
#     frame = pingouin.normality(data=train_clean, dv='SalePrice', group=cats[col]).round(3)
#     frames.append(frame)

In [None]:
# norm = pd.concat(frames)
# norm

In [None]:
# from sklearn.feature_selection import SelectKBest
# from sklearn.feature_selection import f_classif



# for col in cats:
    
# X_new = SelectKBest(f_classif, k=15).fit_transform(train_X, train_y)
# X_new.shape


# Binarize, Convert to Ordinal

In [None]:


# Column groups that binarize() leaves un-encoded.
ordinal = [
    'LotShape', 'LandSlope', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
    'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC',
    'KitchenQual', 'Functional', 'FireplaceQu', 'GarageFinish',
    'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC',
    'Fence',
]

special = ['OverallCond', 'OverallQual', 'MSSubClass']

time_col = ['YearBuilt', 'YearRemodAdd', 'YrSold', 'GarageYrBlt', 'MoSold']

# Every column of train_clean that currently has a numeric dtype.
floats = train_clean.select_dtypes('number').columns.to_list()


In [None]:

def binarize(train_clean):
    """One-hot encode every column that is not in a protected group.

    A column is encoded unless its name appears in one of the module-level
    lists ``ordinal``, ``time_col``, ``floats`` or ``special``. All qualifying
    columns go through a single ``pd.get_dummies`` call (with
    ``drop_first=True``) instead of rebuilding the whole frame once per
    column; the resulting columns and their order are the same.

    Parameters
    ----------
    train_clean : pandas.DataFrame
        Frame to encode. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The un-encoded columns in their original order, followed by the dummy
        columns. When nothing qualifies the input frame itself is returned.
    """
    protected = set(ordinal) | set(time_col) | set(floats) | set(special)
    to_encode = [col for col in train_clean.columns if col not in protected]
    if not to_encode:
        return train_clean
    return pd.get_dummies(train_clean, columns=to_encode, drop_first=True)


In [None]:

train_clean = binarize(train_clean)
train_clean.head()

In [None]:
all_ordinal = time_col + ordinal
set(all_ordinal).difference(set(ordinal))

In [None]:
# For every column in all_ordinal, the list of `variable` entries that the
# data dictionary files under that category (empty when there are none).
order = {
    col: data_description.loc[data_description['category'] == col, 'variable'].to_list()
    for col in all_ordinal
}

print(order)
                     

In [None]:
def factorize_ordinals(df):
    """Replace ordinal columns by the integer position of each value's level.

    The module-level ``order`` dict maps a column name to its list of level
    labels. Columns with an empty list are left untouched. For the others,
    each value is replaced by the index of its label in that list; values
    that match no label become ``-1``.

    Labels are stripped of surrounding whitespace before matching: the labels
    parsed from the description file can carry padding (the 'NA' lookup in
    this file strips it for the same reason), and a padded label can never
    equal a value in the data, which would turn the whole column into ``-1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``order`` that has labels.
        It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for chaining.
    """
    for col, categories in order.items():
        if not categories:
            continue
        # dict.fromkeys removes labels that collide after stripping while
        # keeping their order; Categorical rejects duplicate categories.
        levels = list(dict.fromkeys(
            level.strip() if isinstance(level, str) else level
            for level in categories
        ))
        df[col] = pd.Categorical(df[col], categories=levels, ordered=True).codes
    return df

train_clean = factorize_ordinals(train_clean)



In [None]:

train_clean

In [None]:
test_clean = binarize(test_clean)
test_clean = factorize_ordinals(test_clean)

# Modeling

## Imports

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import r_regression, f_regression, mutual_info_regression, SelectKBest, RFECV
import xgboost



## Preprocessing Pipeline

In [None]:
# Rows with a non-zero SalePrice are used for fitting; rows where it is 0 are
# the ones to predict.
has_price = train_clean['SalePrice'] != 0
X_train = train_clean.loc[has_price].drop(columns='SalePrice')
y_train = train_clean.loc[has_price, 'SalePrice']
X_test = train_clean.loc[~has_price].drop(columns='SalePrice')
# y_test = 

In [None]:
train_clean.to_csv('datasets/train_clean.csv')
test_clean.to_csv('datasets/test_clean.csv')

## Baseline Model (Linear Regression)    

In [None]:
linear_reg = LinearRegression()

linear_reg.fit(X_train, y_train)

In [None]:
def submit(y_pred, ids=None):
    """Wrap predictions in a one-column ``SalePrice`` frame indexed by ``Id``.

    Parameters
    ----------
    y_pred : sequence of numbers
        Predicted values; stored as float64.
    ids : sequence, optional
        Index labels to use, one per prediction (for example the index of the
        frame the predictions were made for). When omitted the index is
        ``0 .. len(y_pred) - 1``.

    Returns
    -------
    pandas.DataFrame
        Single column ``SalePrice`` with an index named ``Id``.

    Raises
    ------
    ValueError
        If ``ids`` is given and its length differs from ``y_pred``.
    """
    if ids is None:
        index = pd.RangeIndex(len(y_pred), name='Id')
    else:
        index = pd.Index(ids, name='Id')
        if len(index) != len(y_pred):
            raise ValueError(
                f"got {len(index)} ids for {len(y_pred)} predictions"
            )
    s = pd.Series(data=y_pred, dtype='float64', index=index, name='SalePrice')
    return s.to_frame()

In [None]:
y_pred = linear_reg.predict(X_test)

In [None]:
a = submit(y_pred)
a

## Baseline Model (Decision Tree)

In [None]:
dt = DecisionTreeRegressor(criterion='squared_error')

dt.fit(X_train, y_train)

In [None]:
y_pred_dt = dt.predict(X_test)
b= submit(y_pred_dt)

In [None]:
plt.plot(dt.feature_importances_)

In [None]:
rfr = RandomForestRegressor()

rfr.fit(X_train, y_train)

In [None]:
y_pred_rfr = rfr.predict(X_test)

c = submit(y_pred_rfr)


In [None]:
def combine_submissions(frames: list[pd.DataFrame], models):
    """Place the given frames side by side under one column level per model.

    The top column level is the class name of the corresponding entry in
    ``models``; only index labels present in every frame are kept.
    """
    labels = [type(model).__name__ for model in models]
    return pd.concat(frames, keys=labels, axis=1, join='inner')

In [None]:
def get_best(tests, k, X_train, y_train):
    """Collect the ``k`` features selected by each scoring function.

    For every scoring function in ``tests`` a ``SelectKBest`` is fitted and
    the selected feature names are reported together with their own scores.
    The scores are taken through the selector's support mask, so each score
    belongs to the feature on the same row and the number of rows follows
    ``k`` rather than a fixed 10.

    Parameters
    ----------
    tests : iterable of callables
        Scoring functions accepted by ``SelectKBest``.
    k : int or "all"
        Number of features to keep per scoring function.
    X_train, y_train
        Training features and target passed to ``SelectKBest.fit``.

    Returns
    -------
    pandas.DataFrame
        Columns ``variable`` and ``score``; the per-test results are stacked
        one after another in the order of ``tests``.
    """
    dfs = []

    for test in tests:
        selector = SelectKBest(test, k=k).fit(X_train, y_train)
        # Boolean mask over the input features, in the same order as the
        # names returned by get_feature_names_out().
        selected = selector.get_support()

        dfs.append(pd.DataFrame({
            'variable': selector.get_feature_names_out(),
            'score': selector.scores_[selected],
        }))

    return pd.concat(dfs)

 

In [None]:
tests = [r_regression, f_regression, mutual_info_regression]
k=10
best = get_best(tests, k, X_train, y_train) 

In [None]:
best.sort_values(by='score',ascending=False)

In [None]:
best.variable.value_counts(ascending=False).index[:10].to_list()

In [None]:
gbrt = GradientBoostingRegressor(max_depth=5, n_estimators=10, learning_rate=1.0)
gbrt.fit(X_train, y_train)

In [None]:
y_pred_dbrt = gbrt.predict(X_test)
d = submit(y_pred_dbrt)

In [None]:
ada = AdaBoostRegressor()
ada.fit(X_train, y_train)

In [None]:
y_pred_ada = ada.predict(X_test)

In [None]:
e = submit(y_pred_ada)

In [None]:
xgb = xgboost.XGBRegressor()
xgb.fit(X_train, y_train)


In [None]:
y_pred_xgb = xgb.predict(X_test)

In [None]:
f = submit(y_pred_xgb)

In [None]:
combine_submissions([a,b,c,d,e,f], models=[linear_reg,dt,rfr,gbrt,ada,xgb])