In [92]:
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import tree
from matplotlib import pyplot as plt
from sklearn_evaluation import plot
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score, cross_validate
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [3]:
train_data_path = "train_dummy.csv"
test_data_path = "test_dummy.csv"

In [4]:
train_data = pd.read_csv(train_data_path, low_memory=False)
test_data = pd.read_csv(test_data_path, low_memory=False)

In [5]:
#Scaling
scaler = MinMaxScaler()
scaler.fit(train_data)
scaled_train = scaler.transform(train_data)
scaled_train_data = pd.DataFrame(scaled_train, columns=train_data.columns)
scaled_test = scaler.transform(test_data)
scaled_test_data = pd.DataFrame(scaled_test,columns=test_data.columns)

In [None]:
scaled_train_data.shape

In [None]:
plt.figure(figsize=(10,5))
plt.scatter(x=scaled_train_data["LotArea"], y=scaled_train_data['SalePrice'], )
plt.xlabel('Lot Area')
plt.ylabel('Price')
plt.show()

In [None]:
plt.figure(figsize=(10,5))
plt.scatter(x=scaled_train_data["LotArea"], y=scaled_train_data['SalePrice'], )
plt.xlabel('Lot Area')
plt.ylabel('Price')
plt.show()

In [6]:
#Vypisanie najviac korelujucich hodnot
corr = train_data.corr().abs()
highest_corr = corr.unstack()
sorted_highest_corr = highest_corr.sort_values(ascending=False).drop_duplicates()

In [None]:
fig = px.imshow(corr)
fig.write_html("corr_matrix.html")

In [None]:
scaled_train_data["LotArea"].hist()

In [38]:
X_train = scaled_train_data.drop('SalePrice', axis=1)
y_train = scaled_train_data['SalePrice']
X_columns = scaled_train_data.drop('SalePrice', axis=1).columns
X_test = scaled_test_data.drop('SalePrice', axis=1)
y_test = scaled_test_data['SalePrice']

In [None]:
print(np.shape(X_train))
print(np.shape(y_train))

In [None]:
param_grid = {'max_features': ['sqrt', 'log2',1.0],
              'ccp_alpha': [0.1, .01, .001],
              'max_depth' : [5, 6, 7, 8, 9],
              'criterion' :['squared_error', 'absolute_error'],
              'min_samples_leaf': [1,2,3,4]
             }

In [None]:
regressor = DecisionTreeRegressor()
grid_search = GridSearchCV(estimator=regressor,
                           param_grid=param_grid,
                           scoring=["r2","neg_mean_squared_error"],
                           refit="r2",
                           cv=5, verbose=4)
grid_search.fit(X_train, y_train)

In [None]:
print("BEST ESTIMATOR: " + str(grid_search.best_estimator_))
print("BEST SCORE: " + str(grid_search.best_score_))

In [None]:
tree_results = pd.DataFrame(grid_search.cv_results_)
tree_results = tree_results.sort_values("rank_test_r2")
tree_results.to_csv("tree_results.csv")

In [None]:
best_tree_regressor = DecisionTreeRegressor(ccp_alpha=0.001, criterion='absolute_error', max_depth=5,min_samples_leaf=2)
best_tree_regressor.fit(X_train, y_train)

In [None]:
y_pred = best_tree_regressor.predict(X_test)
y_true = y_test
plot.residuals(y_true, y_pred)

In [None]:
tree_test_results = cross_validate(best_tree_regressor,X_test,y_test,scoring=["r2","neg_mean_squared_error"])
tree_test_results

In [None]:
text_representation = tree.export_text(best_tree_regressor)
feature_names = list(scaled_train_data.columns.values)

fig = plt.figure(figsize=(25,20))
_ = tree.plot_tree(best_tree_regressor,feature_names= feature_names,
                   filled=True)
fig.savefig('decisionTree.png')

In [None]:
param = {'kernel' : ('linear', 'poly', 'rbf'),'C' : [1,5,10],'gamma' : (0.1,0.01,0.001)},

svrGridSearch = GridSearchCV(estimator=SVR(),param_grid=param,
                             cv=5,
                             verbose=4,)

svrGridSearch.fit(X_train,y_train)

In [None]:
grid_scores = svrGridSearch.cv_results_
tree_results = pd.DataFrame(grid_scores)
#tree_results = tree_results.sort_values("rank_test_r2")
tree_results.to_csv("svm_results.csv")

In [None]:
print("BEST ESTIMATOR: " + str(svrGridSearch.best_estimator_))
print("BEST SCORE: " + str(svrGridSearch.best_score_))
print("BEST PARAMETERS" + str(svrGridSearch.best_params_))

In [None]:
tree_test_results = cross_validate(best_tree_regressor,X_test,y_test,scoring=["r2","neg_mean_squared_error"])
tree_test_results

In [None]:
svrGridSearchResults = svrGridSearch.cv_results_
ax = plot.grid_search(svrGridSearch.cv_results_, change="gamma", kind='bar', sort=False)

In [None]:
from sklearn_evaluation import plot
bestSvr = SVR(C=1,gamma='auto',kernel='rbf', verbose=False)
bestSvr.fit(X_train,y_train)

In [None]:
y_pred = bestSvr.predict(X_test)
y_true = y_test
plot.residuals(y_true, y_pred)

In [None]:
svr_test_results = cross_validate(bestSvr,X_test,y_test,scoring=["r2","neg_mean_squared_error"], verbose=False)
svr_test_results

In [None]:
parameters = {
    'n_estimators': [100, 150, 200, 250, 300],
    'max_depth': [1,2,3,4],
}
randomForestRegressor = RandomForestRegressor(random_state=0)

grid_search = GridSearchCV(estimator=randomForestRegressor,
                           param_grid=parameters,
                           scoring=["r2","neg_mean_squared_error"],
                           refit="r2",
                           cv=5, verbose=4)
grid_search.fit(X_train, y_train)

In [None]:
print("BEST FOREST ESTIMATOR: " + str(grid_search.best_estimator_))
print("BEST SCORE: " + str(grid_search.best_score_))
print("BEST PARAMETERS" + str(grid_search.best_params_))

In [None]:
bestRandomForest = RandomForestRegressor(max_depth=4,n_estimators=150, random_state=0)
bestRandomForest.fit(X_train, y_train)

In [None]:
y_pred = bestRandomForest.predict(X_test)
y_true = y_test
plot.residuals(y_true, y_pred)

In [None]:
forest_test_results = cross_validate(bestRandomForest,X_test,y_test,scoring=["r2","neg_mean_squared_error"], verbose=False)
forest_test_results

In [None]:
feat_importances = pd.Series(bestRandomForest.feature_importances_, index=X_train.columns).sort_values()
feat_importances.nlargest(20).plot(kind='barh')

In [None]:
fig = px.bar(feat_importances, orientation='h')
fig.write_html('importances.html')

In [None]:
X = X_test
y = y_test

# Condition the model on sepal width and length, predict the petal width
y_pred = bestSvr.predict(X)

fig = px.scatter(x=y, y=y_pred, labels={'x': 'ground truth', 'y': 'prediction'}, title="Prediction vs Expected SVR")
fig.add_shape(
    type="line", line=dict(dash='dash'),
    x0=y.min(), y0=y.min(),
    x1=y.max(), y1=y.max()
)
fig.show()

In [None]:
import plotly.express as px

df = scaled_test_data

# Condition the model on sepal width and length, predict the petal width
df['prediction'] = bestSvr.predict(X_test)
df['residual'] = df['prediction'] - scaled_test_data['SalePrice']

fig = px.scatter(
    df, x='prediction', y='residual',
    marginal_y='violin', trendline='ols', title="Residual SVR"
)
fig.show()

In [None]:
import plotly.express as px

df = scaled_test_data

# Condition the model on sepal width and length, predict the petal width
df['prediction'] = bestRandomForest.predict(X_test)
df['residual'] = df['prediction'] - scaled_test_data['SalePrice']

fig = px.scatter(
    df, x='prediction', y='residual',
    marginal_y='violin', trendline='ols', title="Residual RandomForrest"
)
fig.show()

In [None]:
import plotly.express as px

df = scaled_test_data

# Condition the model on sepal width and length, predict the petal width
df['prediction'] = best_tree_regressor.predict(X_test)
df['residual'] = df['prediction'] - scaled_test_data['SalePrice']

fig = px.scatter(
    df, x='prediction', y='residual',
    marginal_y='violin', trendline='ols', title="Residual Tree"
)
fig.show()

In [None]:
fig = px.scatter_3d(train_data, x='TotRmsAbvGrd', y='GrLivArea', z='YearBuilt',
                    color='SalePrice', symbol='GarageCars')
fig.update_layout(coloraxis_colorbar=dict(yanchor="top", y=1, x=0,
                                          ticks="outside"))
fig.update_layout(legend=dict(title_font_family="Times New Roman",
                              font=dict(size= 20)
))
fig.write_html('3d.html')

In [96]:
import umap
X = X_train
reducer = umap.UMAP(n_components=3, min_dist=0.1, n_neighbors=50).fit(X)
umap_train_data = reducer.transform(X)

In [97]:
umap_train_data

array([[-1.121225  ,  9.44216   ,  8.953572  ],
       [ 2.6759725 ,  6.4464073 ,  8.116978  ],
       [ 2.2482646 ,  6.9382234 ,  8.014394  ],
       ...,
       [-0.83388287, 10.246829  ,  8.005696  ],
       [ 2.222857  ,  6.333964  ,  7.389529  ],
       [-2.49231   , 10.547091  ,  7.8102474 ]], dtype=float32)

In [99]:
df_umap = pd.DataFrame(umap_train_data)
df_umap[['SalePrice','GarageCars']] = train_data[["SalePrice","GarageCars"]]
fig = px.scatter_3d(df_umap, x=0, y=1, z=2,
                    color='SalePrice', symbol='GarageCars')
fig.update_layout(coloraxis_colorbar=dict(yanchor="top", y=1, x=0,
                                          ticks="outside"))
fig.update_layout(legend=dict(title_font_family="Times New Roman",
                              font=dict(size= 20)
))
fig.write_html('3d_umap.html')

In [95]:
X = X_train
pca = PCA(n_components=3)
components = pca.fit_transform(X)

total_var = pca.explained_variance_ratio_.sum() * 100

fig = px.scatter_3d(
    components, x=0, y=1, z=2, color=scaled_train_data['SalePrice'],
    title=f'Total Explained Variance: {total_var:.2f}%',
    labels={'0': 'PC 1', '1': 'PC 2', '2': 'PC 3'}
)
fig.write_html('3d_pca.html')

In [None]:
for i in range (0,255,5):
    if i == 1:
        continue
    pca = PCA(n_components=i, random_state=2020)
    components = pca.fit_transform(scaled_train_data)
    print("VARIANCE EXPLAINED BY ALL " + str(i) + " PRINCIPAL COMPONENTS = " + str(sum(pca.explained_variance_ratio_ *100)))

In [49]:
most_corr_columns = sorted_highest_corr[:1000].reset_index()
most_corr =[]
reducted_train_data = X_train.copy()
reducted_test_data = X_test.copy()
for row in most_corr_columns['level_0']:
    if row in reducted_train_data.columns:
        reducted_train_data = reducted_train_data.drop(row, axis=1)
        reducted_test_data = reducted_test_data.drop(row, axis=1)
        most_corr.append(row)

updated_most_corr = reducted_train_data.corr().abs().unstack().sort_values(ascending=False).drop_duplicates()
least_corr_train = X_train.drop(columns = most_corr)
most_corr_train = X_train.drop(columns = least_corr_train.columns)

least_corr_test = X_test.drop(columns = most_corr)
most_corr_test = X_test.drop(columns = least_corr_test.columns)

print("POCET NAJVIAC KORELUJUCICH STLPCOV PRI 1000 HODNOTACH ", len(most_corr_train.columns))
print("POCET NAJMENEJ KORELUJUCICH STLPCOV PRI 1000 HODNOTACH ", len(least_corr_train.columns))
print(least_corr_train.shape)
print(most_corr_train.shape)
print(least_corr_test.shape)
print(most_corr_test.shape)

POCET NAJVIAC KORELUJUCICH STLPCOV PRI 1000 HODNOTACH  186
POCET NAJMENEJ KORELUJUCICH STLPCOV PRI 1000 HODNOTACH  69
(940, 69)
(940, 186)
(154, 69)
(154, 186)


In [50]:
#2
pca2 = PCA(n_components=2, random_state=2020)
pca2.fit(most_corr_train)
train_components2 = pca2.transform(most_corr_train)
test_components2 = pca2.transform(most_corr_test)
print("VARIANCE EXPLAINED BY ALL " + str(2) + " PRINCIPAL COMPONENTS = " + str(sum(pca2.explained_variance_ratio_ *100)))

VARIANCE EXPLAINED BY ALL 2 PRINCIPAL COMPONENTS = 26.222433433741408


In [51]:
#3
pca3 = PCA(n_components=3, random_state=2020)
pca3.fit(most_corr_train)
train_components3 = pca3.transform(most_corr_train)
test_components3 = pca3.transform(most_corr_test)
print("VARIANCE EXPLAINED BY ALL " + str(2) + " PRINCIPAL COMPONENTS = " + str(sum(pca3.explained_variance_ratio_ *100)))

VARIANCE EXPLAINED BY ALL 2 PRINCIPAL COMPONENTS = 30.49236278370278


In [52]:
#5
pca5 = PCA(n_components=5, random_state=2020)
pca5.fit(most_corr_train)
train_components5 = pca5.transform(most_corr_train)
test_components5 = pca5.transform(most_corr_test)
print("VARIANCE EXPLAINED BY ALL " + str(5) + " PRINCIPAL COMPONENTS = " + str(sum(pca5.explained_variance_ratio_ *100)))

VARIANCE EXPLAINED BY ALL 5 PRINCIPAL COMPONENTS = 37.76872793938907


In [53]:
#10
pca10 = PCA(n_components=10, random_state=2020)
pca10.fit(most_corr_train)
train_components10 = pca10.transform(most_corr_train)
test_components10 = pca10.transform(most_corr_test)
print("VARIANCE EXPLAINED BY ALL " + str(10) + " PRINCIPAL COMPONENTS = " + str(sum(pca10.explained_variance_ratio_ *100)))

VARIANCE EXPLAINED BY ALL 10 PRINCIPAL COMPONENTS = 50.624255399688536


In [54]:
#15
pca15 = PCA(n_components=15, random_state=2020)
pca15.fit(most_corr_train)
train_components15 = pca15.transform(most_corr_train)
test_components15 = pca15.transform(most_corr_test)
print("VARIANCE EXPLAINED BY ALL " + str(15) + " PRINCIPAL COMPONENTS = " + str(sum(pca15.explained_variance_ratio_ *100)))

VARIANCE EXPLAINED BY ALL 15 PRINCIPAL COMPONENTS = 60.070255944745504


In [55]:
pca30 = PCA(n_components=30, random_state=2020)
pca30.fit(most_corr_train)
train_components30 = pca30.transform(most_corr_train)
test_components30 = pca30.transform(most_corr_test)
print("VARIANCE EXPLAINED BY ALL " + str(30) + " PRINCIPAL COMPONENTS = " + str(sum(pca30.explained_variance_ratio_ *100)))

VARIANCE EXPLAINED BY ALL 30 PRINCIPAL COMPONENTS = 77.5127873512755


In [56]:
pca60 = PCA(n_components=60, random_state=2020)
pca60.fit(most_corr_train)
train_components60 = pca60.transform(most_corr_train)
test_components60 = pca60.transform(most_corr_test)
print("VARIANCE EXPLAINED BY ALL " + str(60) + " PRINCIPAL COMPONENTS = " + str(sum(pca60.explained_variance_ratio_ *100)))

VARIANCE EXPLAINED BY ALL 60 PRINCIPAL COMPONENTS = 92.6132043045935


In [57]:
train_reductions = [train_components2,train_components3,train_components5,train_components10,train_components30,train_components60]

In [58]:
test_reductions = [test_components2,test_components3,test_components5,test_components10,test_components30,test_components60]

(154, 2)

In [86]:
def train_with_reduction(train_p,test_p):
    reducted_train_data_pca = least_corr_train.copy()
    pca_reduction = pd.DataFrame(train_p)
    ready_train = pd.concat([reducted_train_data_pca,pca_reduction], axis=1, join='inner')
    ready_train.columns = ready_train.columns.map(str)
    bestRandomForestPca = RandomForestRegressor(max_depth=4,n_estimators=150, random_state=2020)
    bestRandomForestPca.fit(ready_train, y_train)
    test_result_Best_random_forest_pca = cross_validate(bestRandomForestPca,test_p,y_test,scoring=["r2","neg_mean_squared_error"])
    print("Pocet priznakov: " + str(len(ready_train.columns)) + " Pocet dimenzii redukovanej podmnoziny: " + str(len(pca_reduction.columns)))
    print(test_result_Best_random_forest_pca)
    print('\n')
    return [len(pca_reduction.columns),test_result_Best_random_forest_pca]


In [90]:
fit_times = []
dimensions = []
r2score = []
for (train, test) in zip(train_reductions, test_reductions):
     result = train_with_reduction(train,test)
     fit_times.append(result[1]['fit_time'].mean())
     dimensions.append(result[0])
     r2score.append(result[1]['test_r2'].mean())
print(fit_times)
print(dimensions)
print(r2score)

Pocet priznakov: 71 Pocet dimenzii redukovanej podmnoziny: 2
{'fit_time': array([0.23605299, 0.22405005, 0.11002493, 0.10802507, 0.10702348]), 'score_time': array([0.14103222, 0.01100278, 0.00700235, 0.0070014 , 0.00800204]), 'test_r2': array([0.27718441, 0.44501604, 0.49367965, 0.55188087, 0.34811876]), 'test_neg_mean_squared_error': array([-0.00466225, -0.0075442 , -0.00460516, -0.00620766, -0.00892609])}


Pocet priznakov: 72 Pocet dimenzii redukovanej podmnoziny: 3
{'fit_time': array([0.11002564, 0.11702514, 0.11002493, 0.10502291, 0.11302567]), 'score_time': array([0.00900316, 0.00900197, 0.00700188, 0.00800252, 0.00700164]), 'test_r2': array([0.33205729, 0.57753427, 0.52782566, 0.75713148, 0.66700028]), 'test_neg_mean_squared_error': array([-0.00430831, -0.00574281, -0.00429459, -0.00336438, -0.00455971])}


Pocet priznakov: 74 Pocet dimenzii redukovanej podmnoziny: 5
{'fit_time': array([0.12302804, 0.10902476, 0.10802484, 0.11402559, 0.10702443]), 'score_time': array([0.0070014 

In [94]:
df = pd.DataFrame(list(zip(dimensions, fit_times, r2score)),
               columns =['dimension', 'fit_time','r2score'])
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add traces
fig.add_trace(
    go.Scatter(x=dimensions, y=fit_times, name="Mean Fit time"),
    secondary_y=False,
)

fig.add_trace(
    go.Scatter(x=dimensions, y=r2score, name="Mean R2 score"),
    secondary_y=True,
)

# Add figure title
fig.update_layout(
    title_text="R2 Score with Fit times based on reductions"
)

# Set x-axis title
fig.update_xaxes(title_text="Dimensions")

# Set y-axes titles
fig.update_yaxes(title_text="Fit time", secondary_y=False)
fig.update_yaxes(title_text="R2 score", secondary_y=True)

fig.write_html('reducted_dimension.html')