## 2.3.1 - Loading Data an split test and training set

In [1]:
from sklearn.datasets import fetch_openml
dataset = fetch_openml("mnist_784")
X = dataset["data"]
y = dataset["target"]
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [2]:
import pandas as pd
import sklearn.model_selection as ms
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support

In [3]:
X_train, X_test, y_train, y_test = ms.train_test_split(X,y, test_size = 10000)

## 2.3.2 - Build a DTClassifier and compute metrics

In [4]:
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)


In [5]:
precision, recall, f1, support = precision_recall_fscore_support(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)
print(precision,"\n", recall,"\n", f1, "\n",support)

0.873
[0.92198582 0.94780447 0.83783784 0.85784314 0.87054027 0.82197802
 0.9000999  0.90445269 0.81766704 0.81874356] 
 [0.9164149  0.95652174 0.84221526 0.83095916 0.88132095 0.83111111
 0.90826613 0.90866729 0.78393051 0.84038055] 
 [0.91919192 0.95214315 0.84002084 0.84418717 0.87589744 0.82651934
 0.90416458 0.90655509 0.80044346 0.82942097] 
 [ 993 1196  957 1053  969  900  992 1073  921  946]


In [6]:
import random
report = classification_report(y_test, y_pred)

row_idx = random.sample(range(0,X.shape[0]+1),5)
col_idx = random.sample(range(0,X.shape[1]+1),3 )
r = X[:, col_idx]
r[row_idx, :]
y_train.shape

(60000,)

## 2.3.3 - 2.3.4

In [7]:
import random

In [165]:
class MyRandomForestClassifier():
    
    
    def __init__(self, n_estimators, max_features):
        
        self.n_estimators = n_estimators
        self.max_features = max_features
    # train the trees of this random forest using subsets of X (and y)
    
    def fit(self, X, y):
        self.random_tree = []
        self.cols_idx = []
        self.rows_idx = []
        self.importance = {}
        for i in range(self.n_estimators):
            row_idx = random.sample(range(0,X.shape[0]),int(X.shape[0]*0.625))
            col_idx = random.sample(range(0,X.shape[1]),self.max_features)
            
            self.cols_idx.append(col_idx)
            self.rows_idx.append(row_idx)
            
            yb = y[row_idx]
            Xb = X[row_idx,:]
            Xb = Xb[:, col_idx]
            clf = DecisionTreeClassifier(max_depth= 10)
            clf.fit(Xb, yb)
            self.random_tree.append(clf)
            #importance[n_estimators] = classifier.feature_importances_
            
    
    # predict the label for each point in X
    
    def predict(self, X):
        result = []
        for i in range(self.n_estimators):
            y_pred = self.random_tree[i].predict(X[:, self.cols_idx[i]])
            result.append(y_pred)
            
        return result
    
    def importance(self):
        
        return self.importance

In [168]:
import math
result = {}
importance = {}
for n_estimators in range(10,100,10):
    classifier = MyRandomForestClassifier(n_estimators, int(math.sqrt(X.shape[1])))
    classifier.fit(X_train, y_train)
    result[n_estimators] =  classifier.predict(X_test)
    

AttributeError: 'MyRandomForestClassifier' object has no attribute 'feature_importances_'

In [108]:
accuracy_dict ={}

for  k, v in (result.items()):
    #report = classification_report(y_test, y_p)
    #print(report)
    accuracy_dict[k] = []
    for value in v:
        
        accuracy_dict[k].append(accuracy_score(y_test, value))
    


In [109]:
df_result = pd.DataFrame.from_dict(accuracy_dict, orient= 'index')
df_result

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,80,81,82,83,84,85,86,87,88,89
10,0.6517,0.6598,0.6189,0.5773,0.6323,0.6791,0.6694,0.6727,0.5146,0.6381,...,,,,,,,,,,
20,0.4983,0.6506,0.6016,0.5719,0.6079,0.5565,0.6396,0.6353,0.5972,0.6521,...,,,,,,,,,,
30,0.6659,0.5997,0.6304,0.6366,0.5707,0.6438,0.4882,0.6929,0.6543,0.5573,...,,,,,,,,,,
40,0.5728,0.6397,0.5883,0.6541,0.7227,0.6496,0.6808,0.6624,0.6378,0.5805,...,,,,,,,,,,
50,0.6249,0.5782,0.6406,0.6402,0.6495,0.6463,0.6107,0.6171,0.6412,0.6795,...,,,,,,,,,,
60,0.6008,0.6547,0.5895,0.6273,0.599,0.6197,0.6068,0.6724,0.5899,0.6493,...,,,,,,,,,,
70,0.61,0.6068,0.6148,0.6591,0.7143,0.6773,0.6808,0.6596,0.6598,0.7002,...,,,,,,,,,,
80,0.525,0.6625,0.6462,0.6455,0.6603,0.6251,0.567,0.6464,0.7038,0.5479,...,,,,,,,,,,
90,0.6059,0.6396,0.631,0.7233,0.6485,0.6123,0.6185,0.682,0.6631,0.6259,...,0.6855,0.6764,0.6597,0.539,0.6016,0.6475,0.595,0.7006,0.5776,0.7077


In [73]:
row_mean = df_result.mean(axis = 1)
row_mean

10    0.661250
20    0.611265
30    0.630340
40    0.636208
50    0.648864
60    0.635437
70    0.631179
80    0.626121
90    0.627954
dtype: float64

In [76]:
print(f"Max accuracy:{row_mean.max()}")

Max accuracy:0.6612500000000001


#### Importance- How do I compute it if every MyRandom is composed by different classifier, they return me the same value


In [141]:
importances = pd.DataFrame.from_dict(importance, orient = "index")
type(impoertances)

pandas.core.frame.DataFrame

In [158]:
importances.loc[10,:]

0      0.0
1      0.0
2      0.0
3      0.0
4      0.0
      ... 
779    0.0
780    0.0
781    0.0
782    0.0
783    0.0
Name: 10, Length: 784, dtype: float64

In [155]:
positive_importance = {}
for row in importances.index:
    positive_importance[row] = []
    for col in importances.columns:
        if(importances.loc[row, col] >0):
            positive_importance[row].append(importances.loc[row, col])

            
df_pos_importance = pd.DataFrame.from_dict(positive_importance, orient = "index")
df_pos_importance

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,488,489,490,491,492,493,494,495,496,497
10,3.6e-05,3.7e-05,7.3e-05,5.4e-05,0.000168,5.4e-05,7.3e-05,0.000169,0.000105,0.000126,...,0.000102,0.000208,0.000107,3.7e-05,3.4e-05,0.000339,6.3e-05,0.000168,0.000198,7.3e-05
20,3.6e-05,3.7e-05,7.3e-05,5.4e-05,0.000168,5.4e-05,7.3e-05,0.000169,0.000105,0.000126,...,0.000102,0.000208,0.000107,3.7e-05,3.4e-05,0.000339,6.3e-05,0.000168,0.000198,7.3e-05
30,3.6e-05,3.7e-05,7.3e-05,5.4e-05,0.000168,5.4e-05,7.3e-05,0.000169,0.000105,0.000126,...,0.000102,0.000208,0.000107,3.7e-05,3.4e-05,0.000339,6.3e-05,0.000168,0.000198,7.3e-05
40,3.6e-05,3.7e-05,7.3e-05,5.4e-05,0.000168,5.4e-05,7.3e-05,0.000169,0.000105,0.000126,...,0.000102,0.000208,0.000107,3.7e-05,3.4e-05,0.000339,6.3e-05,0.000168,0.000198,7.3e-05
50,3.6e-05,3.7e-05,7.3e-05,5.4e-05,0.000168,5.4e-05,7.3e-05,0.000169,0.000105,0.000126,...,0.000102,0.000208,0.000107,3.7e-05,3.4e-05,0.000339,6.3e-05,0.000168,0.000198,7.3e-05
60,3.6e-05,3.7e-05,7.3e-05,5.4e-05,0.000168,5.4e-05,7.3e-05,0.000169,0.000105,0.000126,...,0.000102,0.000208,0.000107,3.7e-05,3.4e-05,0.000339,6.3e-05,0.000168,0.000198,7.3e-05
70,3.6e-05,3.7e-05,7.3e-05,5.4e-05,0.000168,5.4e-05,7.3e-05,0.000169,0.000105,0.000126,...,0.000102,0.000208,0.000107,3.7e-05,3.4e-05,0.000339,6.3e-05,0.000168,0.000198,7.3e-05
80,3.6e-05,3.7e-05,7.3e-05,5.4e-05,0.000168,5.4e-05,7.3e-05,0.000169,0.000105,0.000126,...,0.000102,0.000208,0.000107,3.7e-05,3.4e-05,0.000339,6.3e-05,0.000168,0.000198,7.3e-05
90,3.6e-05,3.7e-05,7.3e-05,5.4e-05,0.000168,5.4e-05,7.3e-05,0.000169,0.000105,0.000126,...,0.000102,0.000208,0.000107,3.7e-05,3.4e-05,0.000339,6.3e-05,0.000168,0.000198,7.3e-05


In [156]:
df_pos_importance.iloc[0].sum()

1.0

## 2.3.5 - Random forest classifier from sklearn

In [79]:
import timeit
#timeit.timeit(lambda: apriori(df, 0.01), number=1)
from sklearn.ensemble import RandomForestClassifier

In [105]:
rnd_result = {}
for n_est in range(10,100,10):
    classifier = RandomForestClassifier(n_estimators = n_est, max_depth= 10, max_features= "sqrt")
    classifier.fit(X_train, y_train)
    rnd_result[n_est] =  classifier.predict(X_test)


RandomForestClassifier.predict() --> The predicted class of an input sample is a vote by the trees in the forest, weighted by their probability estimates. That is, the predicted class is the one with highest mean probability estimate across the trees.

In [106]:
rnd_accuracy_dict ={}

for  k, v in (rnd_result.items()):
    #report = classification_report(y_test, y_p)
    #print(report)
    rnd_accuracy_dict[k] =(accuracy_score(y_test, v))
        
rnd_df_result = pd.DataFrame.from_dict(rnd_accuracy_dict, orient= 'index')
rnd_df_result
#print(rnd_accuracy_dict)

Unnamed: 0,0
10,0.9235
20,0.935
30,0.9393
40,0.945
50,0.941
60,0.9423
70,0.9433
80,0.942
90,0.945


If we increase the number of features the accuracy grow. max_features = "auto" --> total features; ="sqrt" -->sqrt(tot); if we set 10 whic is less than sqrt(tot) the accurcy decrease but the efficency increase