### Importing libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

### Ingesting Data

In [None]:
# Load both splits keyed by the 'Id' column.  keep_default_na=False turns off
# pandas' default NA-marker parsing, so markers such as 'NA' stay as plain
# strings and are handled explicitly later in the notebook.
train = pd.read_csv('datasets/train.csv', index_col='Id', keep_default_na=False)
test = pd.read_csv('datasets/test.csv', index_col='Id', keep_default_na=False)

# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

### Data Exploration

In [None]:
# Column dtypes and non-null counts for the training frame.
train.info()

In [None]:
# Descriptive statistics for the training frame.
train.describe()

In [None]:
# Find object-typed columns that really hold numbers: a column qualifies when
# more than 100 of its entries are purely numeric strings.  Each qualifying
# column name is printed and collected in `repl` for conversion to float.
obj_cols = train.select_dtypes('object').columns.to_list()
repl = []
for col in obj_cols:
    # .str.isnumeric() is True only for strings made solely of numeric characters.
    if train[col].str.isnumeric().sum() > 100:
        print(col)
        repl.append(col)

### Change to float

In [None]:
# Convert every numeric-looking text column to float in both frames.
# The literal string 'NA' is turned into NaN first so the float cast succeeds.
# The result is assigned back to the column instead of calling
# `frame[col].replace(..., inplace=True)`: that form mutates a temporary
# selection, which pandas warns about and which has no effect on the frame
# under Copy-on-Write.
for col in repl:
    for frame in (train, test):
        frame[col] = frame[col].replace('NA', np.nan).astype(float)
    


In [None]:
# Re-inspect dtypes and non-null counts after the float conversion.
train.info()

### Variable Distribution

In [None]:
# List the column labels of the training frame.
train.columns

In [None]:
# grid = sns.FacetGrid(train)
# grid.map_dataframe(sns.histplot)
# plt.savefig('histograms.jpg')

## Missing Data

In [None]:
def check_missing(df=None):
    """Return the per-column count of missing values, for columns that have any.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame to inspect.  When omitted, the notebook-level ``train`` frame is
        looked up at call time, so re-running the ingest cell (which rebinds
        ``train``) is reflected here without redefining this function.

    Returns
    -------
    pandas.Series
        Missing-value counts indexed by column name; columns without missing
        values are left out.
    """
    if df is None:
        # Resolve the default lazily; a `df=train` default would freeze
        # whichever object `train` referred to when this cell was executed.
        df = train
    missing = df.isna().sum()
    return missing[missing > 0]

# Show the columns that contain missing values, using the function's default frame.
check_missing()

In [None]:
# Visualise where values are missing across the rows of the training frame.
msno.matrix(train)

In [None]:
# Visualise how strongly missingness in one column coincides with missingness in another.
msno.heatmap(train)

# Fill Missing Data

### GarageYrBlt (filled from YearBuilt)

In [None]:
# Pairwise correlation between garage build year and house build year.
corr = train[["GarageYrBlt", "YearBuilt"]].corr()
corr

In [None]:
# Where the garage build year is missing, fall back to the year the house was
# built.  The filled column is assigned back rather than using
# `frame[col].fillna(..., inplace=True)`, which acts on a temporary selection
# and does not update the frame under pandas Copy-on-Write.
train["GarageYrBlt"] = train["GarageYrBlt"].fillna(train["YearBuilt"])
test["GarageYrBlt"] = test["GarageYrBlt"].fillna(test["YearBuilt"])

## LotFrontage

In [None]:
# Re-check which columns still contain missing values.
check_missing()

In [None]:
# sns.catplot(data=train, x='LotFrontage',kind='strip', row='LotConfig', sharex=False,height=3,orient='portrait')

# plt.show()


In [None]:
# Rounded median LotFrontage for each LotConfig category, computed only from
# rows where LotFrontage is known; stored as {LotConfig value: median}.
lot_front_dic = (
    train.loc[train['LotFrontage'].notna()]
    .groupby('LotConfig')['LotFrontage']
    .median()
    .round()
    .to_dict()
)

lot_front_dic

In [None]:
# missing_LF = train[(train.LotConfig == key) & (train.LotFrontage.isna())]  

# Impute missing LotFrontage with the per-LotConfig median held in
# `lot_front_dic`.  The same train-derived medians are applied to the test
# frame so both splits are cleaned the same way; each frame builds its own row
# mask, so a mask from one frame is never used to index the other.
for key, item in lot_front_dic.items():
    for frame in (train, test):
        unfilled = (frame['LotConfig'] == key) & frame['LotFrontage'].isna()
        frame.loc[unfilled, 'LotFrontage'] = item

In [None]:
# Confirm which training columns still have missing values after the LotFrontage fill.
check_missing(train)

In [None]:
# Column dtypes and non-null counts after imputation.
train.info()

In [None]:
# Persist the cleaned frames.  This runs before the row/column drops further
# down, so the saved files reflect the data as of this point.
train.to_csv('datasets/train_clean.csv')
test.to_csv('datasets/test_clean.csv')

# Drop Columns

### Drop the 8 rows with missing MasVnrArea

In [None]:
# Remove, in place, the training rows whose MasVnrArea is missing.
train.dropna(axis = 'rows', subset=["MasVnrArea"], inplace=True)

### Drop categorical columns  with more than 600 missing values

In [None]:
# t = int(train.shape[0]*.90)
# train.dropna(axis='columns', thresh= t, inplace=True)

### Drop Columns Dominated by a certain value

In [None]:
# unbalanced_cols = []
# for col in train.select_dtypes('object').columns:
#     value_counts = train[col].value_counts(normalize=True)

#     max = value_counts.max()
    
#     print(value_counts)
#    # print(max)

#     if max > .70:
#         unbalanced_cols.append(col)


In [None]:
# train.drop(unbalanced_cols, axis='columns',inplace=True)
# check_missing()

In [None]:
# train.dropna(axis=0,inplace=True)
# print(train.shape)
# check_missing()

# Feature Importance

## Correlation of Predictors with the Target Variable (SalePrice)

In [None]:
# Pearson correlation of every numeric column with SalePrice, strongest
# positive first, drawn as a single-column heatmap.
# TODO: wrap this in a plot_correlation_heatmap(target_col) helper; an earlier
# draft of that helper was left commented out here.
corr_price = (
    train.corr(method='pearson', numeric_only=True)['SalePrice']
    .sort_values(ascending=False)
)

sns.heatmap(corr_price.to_frame())
plt.show()

## Correlation between Predictor Variables (Multicollinearity)

In [None]:
# Full Pearson correlation matrix of the numeric columns, with rows ordered by
# their correlation with SalePrice (ascending).
corr_df = train.corr(method='pearson', numeric_only=True).sort_values(by='SalePrice')

# Draw on an explicit Axes through heatmap's documented `ax=` parameter,
# instead of forwarding `figure=` as a keyword to the underlying mesh artist.
fig, ax = plt.subplots(figsize=(10, 12))
sns.heatmap(corr_df, cmap='BrBG', ax=ax)
plt.show()

In [None]:
# Display the correlation matrix as a table.
corr_df


## Drop highly correlated independent variables

In [None]:
# Collect the variables whose correlation with OverallCond is strong in either
# direction (|r| > 0.80).  The previous condition `> -.80 and > .80` collapsed
# to `> .80` and could never catch a strong negative correlation.
# NOTE(review): OverallCond correlates perfectly with itself, so it is expected
# to be in this list and therefore dropped — confirm that is intended.
to_drop = corr_df.index[corr_df['OverallCond'].abs() > .80].tolist()

# OverallQual is dropped as well; avoid listing it twice if it already qualified.
if 'OverallQual' not in to_drop:
    to_drop.append('OverallQual')

train.drop(columns=to_drop, inplace=True)
train

# Modeling

## Imports

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score


In [None]:
# Names of the object-typed columns in the test (c) and train (d) frames.
# NOTE(review): neither list is used in the visible cells — confirm they are still needed.
c =test.select_dtypes(include='object').columns.to_list()
d = train.select_dtypes(include='object').columns.to_list()

## Preprocessing Pipeline

## Encoding

In [28]:
# Count the training rows whose Alley value equals their Street value.
train.query('Alley == Street')[['Alley','Street']].shape[0]

40

## Baseline Model (Linear Regression)    

In [None]:
# Baseline: ordinary least-squares regression with default settings.
linear_reg = LinearRegression()

# NOTE(review): x_train / y_train are not defined in the visible cells —
# presumably produced by the preprocessing/encoding step; confirm.
linear_reg.fit(x_train, y_train)

In [None]:
# Keep in `test` only the columns that also exist in `x_train`.  All surplus
# columns are dropped in one call; the earlier `axis='column'` argument is
# removed because it is not a valid axis name and had no effect once
# `columns=` was given.
test.drop(
    columns=[col for col in test.columns if col not in x_train.columns],
    inplace=True,
)
        


In [None]:
# NOTE(review): scale_encode is not defined in the visible cells — presumably
# the same scaling/encoding applied to the training features; confirm.
X_test_final = scale_encode(test)
# Report any columns of the transformed test frame that still contain missing values.
check_missing(X_test_final)

In [None]:
# Rows with any missing feature cannot be scored, so they are removed first.
X_test_final.dropna(inplace=True)
y_pred = linear_reg.predict(X_test_final)

# Label the predictions with the index of the rows that were actually scored;
# a bare pd.Series(y_pred) would be renumbered 0..n-1 and could no longer be
# matched to houses once rows have been dropped.
# NOTE(review): assumes scale_encode keeps the original 'Id' index and that the
# model target is SalePrice — confirm both.
pd.Series(y_pred, index=X_test_final.index, name='SalePrice').to_csv('submission.csv')

## Baseline Model (Decision Tree)

In [None]:
# Baseline decision-tree regressor that splits on squared error.
dt = DecisionTreeRegressor(criterion='squared_error')

# Fit on the same training data as the linear baseline.
dt.fit(x_train, y_train)

In [None]:
# Feature names the tree recorded during fit.
dt.feature_names_in_

In [None]:
# Predict on the prepared test features; the result is only displayed, not stored.
dt.predict(X_test_final)