In [None]:
import numpy as np #obliczenia numeryczne
import pandas as pd #struktura dataframe
import matplotlib.pyplot as plt #do rysowania wykresow, macierz korelacji
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_regression
from sklearn.impute import SimpleImputer

In [None]:
# Load the training and test sets, using the 'Id' column as the row index of both frames.
train_df, test_df = (
    pd.read_csv(csv_path, index_col='Id')
    for csv_path in ('dataset/train.csv', 'dataset/test.csv')
)

In [None]:
# Build separate X_prep / y_prep objects for the mutual-information analysis that precedes
# model selection and training, so the data used for the actual modelling is never overwritten.
X_prep = train_df.drop(columns='SalePrice')
y_prep = train_df['SalePrice'].copy()

In [None]:
# Show how many values are missing in each column (the 40 columns with the most gaps).
missing_counts = test_df.isnull().sum()
print(missing_counts.sort_values(ascending=False).head(40))

# NOTE(review): the code above prints test_df, while the notes below quote a total of 1460
# rows -- confirm which frame these notes were written for.
#
# There are 1460 records (houses) in total.
# PoolQC - pool quality (categorical): NA (1453) means "no pool", so these are not missing data.
# MiscFeature - miscellaneous features not covered by other categories (categorical), e.g. an
#   elevator; NA means there are no such features, so the value is informative.
# Alley - type of alley access to the property; NA = no alley access.
# Fence - NA = no fence.
# FireplaceQu - fireplace quality; NA = no fireplace.
# LotFrontage (NUMERIC) - linear feet of street connected to the property; NA = no street?
#   More likely missing data -> fill with the most frequent value.
# GARAGE - all missing values in the garage year-built column correspond to houses without a garage.
# BsmtFinType2 - rating of the finished basement area; NA = no basement.
# BsmtExposure - basement exposure (NA = no basement).
# BsmtCond and the two below it - NA = no basement.
# MasVnrArea and MasVnrType - "None" = no masonry veneer, plus its area (numeric); here data is
#   really missing, because the absence of veneer is recorded as "None" rather than N/A, so it is
#   best to fill the gaps with the most frequent values.
# Electrical - qualitative; best filled with the most frequent value.
#
# Qualitative columns that need to be filled with the most frequent value:
# 1. MasVnrType
# 2. Electrical
#
# Quantitative columns that need to be filled with the most frequent value:
# 1. LotFrontage
# 2. GarageYrBlt
# 3. MasVnrArea

In [None]:
# Show the (rows, columns) shape of the frame used for the mutual-information analysis.
print(X_prep.shape) 

In [None]:
# Use mutual information (MI) to check which attributes influence y most strongly.

# Label-encode the categorical (object) columns; factorize() gives missing values the code -1,
# so after this step they are no longer NaN.
for colname in X_prep.select_dtypes("object"):
    X_prep[colname], _ = X_prep[colname].factorize()

# Boolean mask with one entry per column: True for integer-typed columns, which the MI
# estimator should treat as discrete. is_integer_dtype() is used instead of `dtype == int`
# because the latter depends on the platform's default integer width.
discrete_features = pd.Series(
    [pd.api.types.is_integer_dtype(column_dtype) for column_dtype in X_prep.dtypes],
    index=X_prep.columns,
)


In [None]:
# Display the per-column discrete/continuous mask.
discrete_features

In [None]:
# List the 20 columns of X_prep with the most missing values.
print(X_prep.isnull().sum().sort_values(ascending=False).head(20))

In [None]:
# Remove the remaining missing values by imputation: every gap is filled with the most frequent
# value of its column (meant to cover qualitative and quantitative columns in one pass).
my_imputer = SimpleImputer(strategy='most_frequent')

# fit_transform() returns a bare array, so both the column names and the original row index
# are restored here. Keeping the index matters: y_prep is still indexed by 'Id', and any later
# index-aligned operation between the two would otherwise pair the wrong rows.
imputed_X_prep = pd.DataFrame(
    my_imputer.fit_transform(X_prep),
    columns=X_prep.columns,
    index=X_prep.index,
)


In [None]:
# True only if every column was flagged as discrete in the mask.
all(discrete_features)

In [None]:
# Print the name of every column that has dtype float64 after imputation.
float_column_names = imputed_X_prep.select_dtypes('float64').columns
for float_column in float_column_names:
    print(float_column)

In [None]:
# Cast every imputed column to integer (any fractional part is truncated) and make the
# result the new X_prep.
X_prep = imputed_X_prep.astype(int)

In [None]:
# Display the column dtypes after the integer cast.
X_prep.dtypes

In [None]:
def compute_mi_scores(X, y, discrete_features):
    """Return the mutual-information score of every feature against the target.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; its column labels become the index of the result.
    y : array-like
        Target values, one per row of ``X``.
    discrete_features : array-like of bool
        Passed through to ``mutual_info_regression``; marks which columns of ``X`` are
        to be treated as discrete.

    Returns
    -------
    pandas.Series
        MI scores indexed by feature name, sorted from highest to lowest.
    """
    # Score the frame that was passed in, not the notebook-level X_prep / y_prep globals.
    mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features)
    mi_scores = pd.Series(mi_scores, index=X.columns)
    return mi_scores.sort_values(ascending=False)

# Compute the MI scores for the prepared frame and show the first five entries of the result.
mi_scores = compute_mi_scores(X_prep, y_prep, discrete_features)
mi_scores.head()  

In [None]:
import seaborn as sns

# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and later releases
# removed the old names, so prefer the new name and fall back to the legacy one on older installs.
_GRID_STYLE = "seaborn-v0_8-whitegrid"
plt.style.use(_GRID_STYLE if _GRID_STYLE in plt.style.available else "seaborn-whitegrid")

def plot_mi_scores(scores):
    """Draw a horizontal bar chart of MI scores on the current matplotlib figure.

    ``scores`` is a pandas Series indexed by feature name. Bars are drawn in ascending
    score order and each bar is labelled with its feature name.
    """
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))
    labels = ordered.index.tolist()
    plt.barh(positions, ordered)
    plt.yticks(positions, labels)
    plt.title("Mutual Information Scores")


# Tall figure (8 x 20 inches) so that every feature gets its own readable bar.
plt.figure(dpi=100, figsize=(8, 20))
plot_mi_scores(mi_scores)

In [None]:
# Re-attach the target to the prepared frame. Assign by position (to_numpy()) rather than by
# index: X_prep was rebuilt from the imputer output and may no longer carry the 'Id' index that
# y_prep has, in which case an index-aligned assignment would shift the prices by one row and
# leave a NaN behind.
X_prep['SalePrice'] = y_prep.to_numpy()

In [None]:
# Drop the columns with a low MI score (whether removing each one actually helps the model is
# checked later).
col_to_delete = ['PoolQC', 'MoSold', 'PoolArea', 'BsmtFinSF2', 'LowQualFinSF', 'Exterior1st', 'Utilities']

# One non-mutating drop per frame; errors='ignore' makes the cell safe to re-run after the
# columns are already gone (the in-place per-column loop raised KeyError on a second run).
train_df = train_df.drop(columns=col_to_delete, errors='ignore')
test_df = test_df.drop(columns=col_to_delete, errors='ignore')

In [None]:
# Show how many values are missing per column in the training data (top 20 columns).


print(train_df.isnull().sum().sort_values(ascending=False).head(20))

In [None]:
# Show how many values are missing per column in the test data (top 40 columns).


print(test_df.isnull().sum().sort_values(ascending=False).head(40))

In [None]:
# Separate the training features (train_full) from the target (y).
train_full = train_df.drop(columns='SalePrice')
y = train_df['SalePrice']

In [None]:
# In the GarageYrBlt column every missing value is tied to the absence of a garage (no garage -
# no garage build year). Since these gaps cannot sensibly be filled with 0, the mode is used
# instead. That is not an ideal solution, so an extra column is prepared for the model that only
# records whether a garage exists, to mark that "no garage" occurs only among the rows that
# received the most_frequent value of GarageYrBlt.
#
# A missing year compares as False against 0, so rows without a garage get GarageBlt == False.
# NOTE(review): the result is a bool column; the later feature selection keeps only 'object'
# columns (categorical_cols) and 'int64'/'float64' columns (numerical_cols), so this flag does
# not appear to reach the model -- confirm and cast it if it is meant to be used.
train_full['GarageBlt'] = train_full['GarageYrBlt'] > 0

In [None]:
# Same garage-exists flag for the test set (False where GarageYrBlt is missing).
# NOTE(review): bool dtype -- it matches neither the 'object' nor the 'int64'/'float64' filter
# used when the model features are assembled; confirm whether it should be cast.
test_df['GarageBlt'] = test_df['GarageYrBlt'] > 0


In [None]:
# Columns that are passed through the imputer.
cols_miss_val = ['MasVnrType', 'MSZoning', 'LotFrontage', 'GarageYrBlt', 'MasVnrArea', 'BsmtFullBath', 'BsmtHalfBath',
                 'Exterior2nd', 'KitchenQual', 'SaleType', 'GarageArea', 'GarageCars', 'BsmtUnfSF', 'BsmtFinSF1',
                 'TotalBsmtSF']

# Fill every gap with the column's most frequent value. The imputer is fitted on the training
# data only and then re-used, unchanged, for the test data.
my_imputer = SimpleImputer(strategy='most_frequent')

# The imputer returns bare arrays, so the column names are supplied again when the frames are rebuilt.
imputed_train_full = pd.DataFrame(
    my_imputer.fit_transform(train_full[cols_miss_val]),
    columns=cols_miss_val,
)
imputed_test_df = pd.DataFrame(
    my_imputer.transform(test_df[cols_miss_val]),
    columns=cols_miss_val,
)



In [None]:
# Remove the un-imputed originals; their imputed versions are concatenated back in below.
train_full = train_full.drop(columns=cols_miss_val)

In [None]:
# Same for the test set: remove the un-imputed originals of the imputed columns.
test_df = test_df.drop(columns=cols_miss_val)

In [None]:
# Give all four frames a fresh 0..n-1 index so that the column-wise concat pairs rows by position.
train_full = train_full.reset_index(drop=True)
imputed_train_full = imputed_train_full.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)
imputed_test_df = imputed_test_df.reset_index(drop=True)

# Put the untouched columns and the imputed columns back side by side.
train_df_concat = pd.concat((train_full, imputed_train_full), axis='columns')
test_df_concat = pd.concat((test_df, imputed_test_df), axis='columns')
train_df_concat

In [None]:
# Cast these imputed columns to float64 in both the training and the test frame.
# NOTE(review): the other numeric columns that went through the imputer ('GarageArea',
# 'GarageCars', 'BsmtUnfSF', 'BsmtFinSF1', 'TotalBsmtSF') are not converted here; if the imputer
# output left them as object dtype they will not match the 'int64'/'float64' filter used to
# build numerical_cols -- confirm whether that is intended.
float_columns = ['LotFrontage', 'GarageYrBlt', 'MasVnrArea', 'BsmtFullBath', 'BsmtHalfBath']
for frame in (train_df_concat, test_df_concat):
    for column in float_columns:
        frame[column] = frame[column].astype('float64')

In [None]:
# Categorical features to encode: object-typed columns with fewer than 10 distinct values
# (a convenient but arbitrary cardinality cut-off).
categorical_cols = [
    cname
    for cname, column in train_df_concat.items()
    if column.dtype == "object" and column.nunique() < 10
]


In [None]:
# Display the selected categorical column names.
categorical_cols

In [None]:
# Numerical features: every column whose dtype is int64 or float64, in frame order.
numerical_cols = list(train_df_concat.select_dtypes(include=['int64', 'float64']).columns)


In [None]:
# Display the selected numerical column names.
numerical_cols

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the selected categorical columns. The encoder is fitted on the training data;
# categories that only occur in the test data are encoded as all zeros (handle_unknown='ignore').
# scikit-learn 1.2 renamed the `sparse` argument to `sparse_output` and 1.4 removed the old
# name, so try the new keyword first and fall back to the old one on older installs.
try:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# The encoder returns bare arrays; rebuild the frames on the original row index.
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(train_df_concat[categorical_cols]),
                             index=train_df_concat.index)
OH_cols_test = pd.DataFrame(OH_encoder.transform(test_df_concat[categorical_cols]),
                            index=test_df_concat.index)

# The encoded columns are labelled 0..k-1 (integers) while the numerical columns have string
# names. scikit-learn >= 1.2 rejects frames whose column names mix types, so use strings throughout.
OH_cols_train.columns = OH_cols_train.columns.astype(str)
OH_cols_test.columns = OH_cols_test.columns.astype(str)

# Keep only the numerical columns (the categorical ones are replaced by their encoding).
num_X_train = train_df_concat[numerical_cols]
num_X_test = test_df_concat[numerical_cols]

# Final feature matrices: numerical features followed by the one-hot columns.
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_test = pd.concat([num_X_test, OH_cols_test], axis=1)


In [None]:
# Display the assembled test feature matrix.
OH_X_test

In [None]:
# Display the assembled training feature matrix.
OH_X_train

In [None]:
# Hold out 20% of the training rows for validation; the fixed seed makes the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    OH_X_train, y,
    train_size=0.8, test_size=0.2,
    random_state=1,
)

# FROM HERE ON THE DATA IS PREPARED FOR THE MODELS BELOW

In [None]:
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor




In [None]:
# Gradient-boosted trees: 2000 boosting rounds with a learning rate of 0.01, using 2 parallel jobs.
model = XGBRegressor(n_estimators=2000, learning_rate = 0.01, n_jobs=2)


In [None]:
# Fit the XGBoost model on the training split (no preprocessing happens in this cell).
model.fit(X_train, y_train,
          verbose=False)

In [None]:
from sklearn.metrics import mean_absolute_error

# Predict on the held-out validation split.
preds = model.predict(X_valid)

# Report the validation error; arguments follow scikit-learn's (y_true, y_pred) order, which
# gives the same value because the absolute error is symmetric.
validation_mae = mean_absolute_error(y_valid, preds)
print("Mean Absolute Error: " + str(validation_mae))

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation on the full training matrix. sklearn reports the *negative* MAE,
# so the sign is flipped to obtain positive errors.
neg_mae_per_fold = cross_val_score(model, OH_X_train, y, cv=5, scoring='neg_mean_absolute_error')
scores = -neg_mae_per_fold

print("MAE scores:\n", scores)

In [None]:
# Prepare the submission file


In [None]:
# Alias for the encoded test feature matrix (no copy is made); used for all test-set predictions below.
final_X_test = OH_X_test 

In [None]:
# Re-read the raw test file, this time without index_col, so that 'Id' stays an ordinary column.
# NOTE: this rebinds test_df, replacing the processed test frame built in the cells above.
test_df = pd.read_csv('dataset/test.csv')


In [None]:
# The 'Id' column of the test file, used as the Id column of every submission file.
cols_for_testing = test_df.Id

In [None]:


# Predict sale prices for the test set with the fitted XGBoost model.
preds_test = model.predict(final_X_test)

# Write the submission file: one row per test house with its Id and predicted SalePrice.
submission_columns = {'Id': cols_for_testing, 'SalePrice': preds_test}
output = pd.DataFrame(submission_columns)
output.to_csv('dataset/XGBoost_results.csv', index=False)

In [None]:
# End of the XGBoost section; next: the Keras Sequential model

In [None]:
from tensorflow.keras import models
from tensorflow.keras import layers
from tensorflow.keras import regularizers


In [None]:
def HousePrices(input_dim=237):
    """Build and compile the fully connected regression network for house prices.

    Architecture: four ReLU Dense layers (128, 64, 32 and 16 units), each with L2 weight
    regularisation (0.001) and each followed by 25% dropout, then one linear output unit.

    Parameters
    ----------
    input_dim : int, default 237
        Number of input features. The default keeps the width that used to be hard-coded;
        pass ``X_train.shape[1]`` to follow the actual feature matrix.

    Returns
    -------
    The compiled Sequential model (Adam optimiser, MSE loss, MAE reported as metric).
    """
    model = models.Sequential()
    for position, units in enumerate((128, 64, 32, 16)):
        # Only the first layer declares the input width.
        first_layer_args = {'input_shape': (input_dim,)} if position == 0 else {}
        model.add(layers.Dense(units, activation='relu',
                               kernel_regularizer=regularizers.l2(0.001),
                               **first_layer_args))
        model.add(layers.Dropout(0.25))
    model.add(layers.Dense(1, activation='linear'))
    # 'accuracy' is a classification metric and is meaningless for a continuous target;
    # report the mean absolute error instead, matching the MAE used to evaluate the other models.
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model

In [None]:
# Instantiate the compiled Keras network.
model_sequential = HousePrices()

In [None]:
# Sanity check: shape of the training features once converted to a NumPy array.
np.array(X_train).shape


In [None]:
# Sanity check: shape of the training target as a column vector. -1 lets NumPy infer the row
# count, so the check keeps working if the size of the training split changes.
np.array(y_train).reshape(-1, 1).shape


In [None]:
# Train for 45 epochs in batches of 64; validation_split=0.2 asks Keras to hold out a further
# 20% of the rows passed here (X_train) for per-epoch validation.
history = model_sequential.fit(np.array(X_train), np.array(y_train), epochs = 45, batch_size = 64, validation_split =0.2)

In [None]:
# Predict on the held-out validation split.
preds_valid = model_sequential.predict(np.array(X_valid))

In [None]:
# Validation MAE of the Keras model.
score = mean_absolute_error(y_valid, preds_valid)
print('MAE:', score)

In [None]:
# Predict sale prices for the test set with the Keras model.
preds_test = model_sequential.predict(np.array(final_X_test)) 

In [None]:
# Sanity check: shape of the predictions once flattened to 1-D. -1 infers the length instead of
# hard-coding the number of test rows.
preds_test.reshape(-1).shape

In [None]:
# Save the Keras test predictions to file. The predictions are flattened to 1-D with
# reshape(-1), which infers the length rather than hard-coding the number of test rows.
output = pd.DataFrame({'Id': cols_for_testing,
                       'SalePrice': preds_test.reshape(-1)})
output.to_csv('dataset/sequential_model_results.csv', index=False)

In [None]:
# Random forest: added from the random_forest file (from git)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline

In [None]:
def get_score(n_estimators):
    """Return the mean MAE over 5 cross-validation folds for a random forest.

    Keyword argument:
    n_estimators -- the number of trees in the forest

    The forest (random_state=1) is wrapped in a single-step pipeline and scored on the
    notebook-level OH_X_train / y data.
    """
    forest = RandomForestRegressor(n_estimators=n_estimators, random_state=1)
    my_pipeline = Pipeline(steps=[('model', forest)])

    # sklearn returns negative MAE values; multiply by -1 to get positive errors.
    neg_mae_per_fold = cross_val_score(my_pipeline, OH_X_train, y,
                                       cv=5,
                                       scoring='neg_mean_absolute_error')
    return (-1 * neg_mae_per_fold).mean()

In [None]:
# Cross-validated MAE for each candidate forest size.
keys = [50, 200, 350, 500, 650, 800, 950, 1100, 1250, 1400]
results = {key: get_score(key) for key in keys}
    

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Cross-validated error as a function of the number of trees.
tree_counts = list(results)
errors = list(results.values())
plt.plot(tree_counts, errors)
plt.xlabel("n_estimators (number of trees)")
plt.ylabel("error")
plt.show()

In [None]:
# Final random forest with 1300 trees and a fixed seed.
# NOTE(review): 1300 is not one of the sizes evaluated in the sweep above -- confirm the choice.
model_rf = RandomForestRegressor(n_estimators=1300, random_state=1)

In [None]:
# Single-step pipeline that wraps the random forest.
pipeline_steps = [('model', model_rf)]
my_pipeline = Pipeline(steps=pipeline_steps)

In [None]:
# Fit the random-forest pipeline on the training split (the pipeline has no preprocessing step)
my_pipeline.fit(X_train, y_train)

# Predict on the held-out validation split
preds = my_pipeline.predict(X_valid)

# Evaluate the model
score = mean_absolute_error(y_valid, preds)
print('MAE:', score)

In [None]:
# Predict sale prices for the test set with the fitted random-forest pipeline.
preds_test = my_pipeline.predict(final_X_test)

# Write the submission file: one row per test house with its Id and predicted SalePrice.
submission_columns = {'Id': cols_for_testing, 'SalePrice': preds_test}
output = pd.DataFrame(submission_columns)
output.to_csv('dataset/random_forest_results.csv', index=False)