<a href="https://colab.research.google.com/github/MattiaPOLI/DR14/blob/master/RandomForest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from google.colab import files
# Ask the user to upload the dataset through Colab. The result is indexed by
# file name later in the notebook ('Sky.csv') and decoded from bytes as UTF-8.
uploaded = files.upload()

Saving Sky.csv to Sky.csv


In [23]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import IPython
import plotly
import plotly.graph_objs as go
import plotly.plotly as py
import pandas as pd
import numpy as np
import io

def enable_plotly_in_cell():
  """Prepare the current output cell for rendering an offline plotly figure.

  Emits a <script> tag that loads RequireJS into the cell output and then
  switches plotly to connected offline notebook mode. In this notebook it is
  called immediately before every ``plotly.offline.iplot`` call.
  """
  requirejs_loader = IPython.core.display.HTML(
      '''<script src="/static/components/requirejs/require.js"></script>'''
  )
  display(requirejs_loader)
  plotly.offline.init_notebook_mode(connected=True)
  
# Stepped colour scale: three flat bands, one per class code. Each band is
# written as two stops (start and end) carrying the same colour, so there is
# no gradient between neighbouring classes.
pl_colorscale = [
    [stop, colour]
    for (band_start, band_end), colour in zip(
        [(0.0, 0.333), (0.333, 0.666), (0.666, 1)],
        ['#19d3f3', '#e763fa', '#636efa'],
    )
    for stop in (band_start, band_end)
]

# Decode the uploaded bytes and parse them as CSV.
df = pd.read_csv(io.StringIO(uploaded['Sky.csv'].decode('utf-8')))

# Store the labels in a separate array: they must not be used during PCA.
# The column is selected by name (not by position) so that a CSV with a
# different column order still yields the right labels.
labels = df['class'].values

# Map every distinct class name to an integer code used to colour the plots.
# The mapping is sized from the data instead of assuming exactly three classes.
classes = np.unique(df['class'].values).tolist()
class_code = {name: code for code, name in enumerate(classes)}
color_vals = [class_code[cl] for cl in df['class']]

# Drop the columns related to the instruments used for retrieving the measures:
# - run/rerun/camcol/field identify which scan the data were taken from
#   (scans in the database are accessed through the hash specobjid);
# - plate/mjd/fiberid are parameters of the instrumentation used.
# Both groups would probably spoil the result because they are not actually
# properties of the celestial object. "class" is dropped because it is the label.
df2 = df.drop(columns=["objid", "run", "rerun", "camcol", "field", "specobjid", "plate", "mjd", "fiberid", "class"])

# Standardise the features, then project them onto all principal components.
dfStandard = StandardScaler().fit_transform(df2)
# The fitted estimator gets its own name: binding it to `PCA` would overwrite
# the imported class and make this cell fail with a TypeError when re-run.
pca_model = PCA()
data_PCA = pca_model.fit_transform(dfStandard)

# Cumulative explained variance, as a percentage, per number of components.
cum_sum = pca_model.explained_variance_ratio_.cumsum() * 100
bars = [go.Bar(y = cum_sum)]
enable_plotly_in_cell()
plotly.offline.iplot(bars, filename="cumVariance")

In [27]:
# Scatter plot of the first two principal components, coloured by class code.
trace = go.Scatter(
    x = data_PCA[:, 0],
    y = data_PCA[:, 1],
    mode = "markers",
    marker = dict(
        color = color_vals,
        colorscale = pl_colorscale,
        colorbar = dict(
            title = "Labels",
            # One tick per label: three tick positions for the three tick texts
            # (a fourth position, 3, had no matching label).
            tickvals = [0, 1, 2],
            ticktext = ["Galaxy", "QSO", "Star"]
        )
    )
)
fig = go.Figure(data = [trace])
enable_plotly_in_cell()
plotly.offline.iplot(fig, filename="1vs2")

In [48]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Use only the first five principal components as model inputs.
fifthPCA = data_PCA[:, 0:5]

# 70/30 train/test split with a fixed seed so the split is reproducible.
trainSet, test, labelTrainSet, labelTest = train_test_split(fifthPCA, labels, train_size = 0.7, test_size = 0.3, random_state = 40)

# Hyper-parameter grid: forest size (rows) x features considered per split (columns).
numberTrees = [50, 100, 200, 300, 350, 400, 500]
numberFeatures = [2, 3, 4, 5]
accuracy = np.zeros(shape = (len(numberTrees), len(numberFeatures)))

# Fit one shallow forest per grid point and record its out-of-bag score.
for i, n_trees in enumerate(numberTrees):
  for j, n_features in enumerate(numberFeatures):
    RFC = RandomForestClassifier(n_estimators = n_trees, max_depth=2, random_state=0, oob_score = True,
                                max_features = n_features, bootstrap = True)
    RFC.fit(trainSet, labelTrainSet)
    accuracy[i, j] = RFC.oob_score_

trace = go.Heatmap(
    z = accuracy,
    colorscale = "Electric"
)
layout = go.Layout(
    # The plotted quantity is oob_score_ (a score, higher is better), not an error.
    title = "OOB accuracy",
    xaxis = dict(
        tickmode = "array",
        # One tick per heatmap column, labelled from the grid itself so the
        # axis cannot drift out of sync with numberFeatures.
        tickvals = list(range(len(numberFeatures))),
        ticktext = [str(n) for n in numberFeatures],
        title = "N° Features"
    ),
    yaxis = dict(
        tickmode = "array",
        # One tick per heatmap row, labelled from numberTrees (the hand-written
        # labels did not match the values actually evaluated).
        tickvals = list(range(len(numberTrees))),
        ticktext = [str(n) for n in numberTrees],
        title = "N° Tree"
    )
)
data = [trace]
figure = go.Figure(data = data, layout = layout)
enable_plotly_in_cell()
plotly.offline.iplot(figure, filename="rbfAccuracy")



In [51]:
# Hyper-parameter grid: forest size (rows) x maximum tree depth (columns).
# max_features is held at 2 for every fit in this cell.
depth = [2, 5, 10, 20, 50]
accuracy2 = np.zeros(shape = (len(numberTrees), len(depth)))

# Fit one forest per grid point and record its out-of-bag score.
for i, n_trees in enumerate(numberTrees):
  for j, tree_depth in enumerate(depth):
    RFC = RandomForestClassifier(n_estimators = n_trees, max_depth = tree_depth, random_state=0, oob_score = True,
                                max_features = 2, bootstrap = True)
    RFC.fit(trainSet, labelTrainSet)
    accuracy2[i, j] = RFC.oob_score_

trace = go.Heatmap(
    z = accuracy2,
    colorscale = "Electric"
)
layout = go.Layout(
    # The plotted quantity is oob_score_ (a score, higher is better), not an error.
    title = "OOB accuracy",
    xaxis = dict(
        tickmode = "array",
        # One tick per heatmap column, labelled from the grid itself.
        tickvals = list(range(len(depth))),
        ticktext = [str(d) for d in depth],
        title = "Max Depth"
    ),
    yaxis = dict(
        tickmode = "array",
        # One tick per heatmap row, labelled from numberTrees (the hand-written
        # labels did not match the values actually evaluated).
        tickvals = list(range(len(numberTrees))),
        ticktext = [str(n) for n in numberTrees],
        title = "N° Tree"
    )
)
data = [trace]
figure = go.Figure(data = data, layout = layout)
enable_plotly_in_cell()
plotly.offline.iplot(figure, filename="rbfAccuracy")