# Homework 2

In [1]:
import numpy as np
import pandas as pd
from scipy import stats

from sklearn.datasets import load_iris

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz

from sklearn import model_selection
from sklearn import metrics

from sklearn.preprocessing import Imputer
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt

%matplotlib inline

## Problem 1
*****

Load the iris sample dataset from sklearn **loadiris()** into Python using a Pandas dataframe.  Induce a set of binary Decision Trees with a minimum of 2 instances in the leaves, no splits of subsets below 5, and an maximal tree depth from 1 to 5 (you can leave the majority parameter to 95%).  Which depth values result in the highest Recall? Why? Which value resulted in the lowest Precision? Why?  Which value results in the best F1 score?  Explain the difference between the micro/macro/weighted methods of score calculation.

In [2]:
iris = load_iris()
pd_iris = pd.DataFrame(data = np.c_[iris.data, iris.target], columns = iris.feature_names+['species'])

pd_iris.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
count,150.0,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333,1.0
std,0.828066,0.435866,1.765298,0.762238,0.819232
min,4.3,2.0,1.0,0.1,0.0
25%,5.1,2.8,1.6,0.3,0.0
50%,5.8,3.0,4.35,1.3,1.0
75%,6.4,3.3,5.1,1.8,2.0
max,7.9,4.4,6.9,2.5,2.0


In [3]:
data_train, data_test, target_train, target_test = model_selection.train_test_split(iris.data,
                                                                    iris.target,
                                                                    test_size=0.2)

In [49]:
for i in range(5):
    predicted = DecisionTreeClassifier(max_depth = (i+1), min_samples_leaf = 2, min_samples_split = 5).fit(data_train, target_train).predict(data_test)
    print("----------------------------Tree with depth: " + str(i+1)+" -----------------------\n")
    print(metrics.classification_report(target_test,
                                    predicted))
    print("----------------------------Confusion Matrix---------------------------\n")
    print(metrics.confusion_matrix(target_test,
                               predicted))
    print()

----------------------------Tree with depth: 1 -----------------------

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         9
           1       0.00      0.00      0.00        11
           2       0.48      1.00      0.65        10

   micro avg       0.63      0.63      0.63        30
   macro avg       0.49      0.67      0.55        30
weighted avg       0.46      0.63      0.52        30

----------------------------Confusion Matrix---------------------------

[[ 9  0  0]
 [ 0  0 11]
 [ 0  0 10]]

----------------------------Tree with depth: 2 -----------------------

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         9
           1       0.91      0.91      0.91        11
           2       0.90      0.90      0.90        10

   micro avg       0.93      0.93      0.93        30
   macro avg       0.94      0.94      0.94        30
weighted avg       0.93      0.93     

  'precision', 'predicted', average, warn_for)


## Problem 2
***
Load the **Breast Cancer Wisconsin (Diagnostic)** sample dataset from the **UCI Machine Learning Repository** (The discrete version at: **breast-cancer-wisconsin.data**)  into Python using  a  Pandas  dataframe.   Induce  a  binary Decision Tree with a minimum of 2 instances in the leaves, no splits of subsets below 5, and a maximal tree depth of 2 (use the default Gini criterion). Calculate the  Entropy,  Gini,  and  Misclassification  Error  of  the  first  split  -  what  is  the Information  Gain?   What  is  the  feature  selected  for  the  first  split,  and  what value determines the decision boundary?

In [60]:
breast = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data", header=None)
breast.columns=["Sample code number","Clump Thickness", "Uniformity of Cell Size", "Uniformity of Cell Shape", 
                "Marginal Adhesion", "Single Epithelial Cell Size","Bare Nuclei", "Bland Chromatin", 
                "Normal Nucleoli", "Mitoses", "Class:"]
breast_pd = breast.drop(columns=["Sample code number"])
nan_breast_pd = breast_pd.replace(to_replace="?",value=np.nan)
nan_breast_pd.isna().sum() #All the nan values are in Bare Nuclei column

Clump Thickness                 0
Uniformity of Cell Size         0
Uniformity of Cell Shape        0
Marginal Adhesion               0
Single Epithelial Cell Size     0
Bare Nuclei                    16
Bland Chromatin                 0
Normal Nucleoli                 0
Mitoses                         0
Class:                          0
dtype: int64

In [61]:
nan_breast_pd.describe()

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bland Chromatin,Normal Nucleoli,Mitoses,Class:
count,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0
mean,4.41774,3.134478,3.207439,2.806867,3.216023,3.437768,2.866953,1.589413,2.689557
std,2.815741,3.051459,2.971913,2.855379,2.2143,2.438364,3.053634,1.715078,0.951273
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
25%,2.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0
50%,4.0,1.0,1.0,1.0,2.0,3.0,1.0,1.0,2.0
75%,6.0,5.0,5.0,4.0,4.0,5.0,4.0,1.0,4.0
max,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,4.0


In [72]:
breast_tree = DecisionTreeClassifier(max_depth = 2, min_samples_leaf = 2, min_samples_split = 5)
fitted_breast = breast_tree.fit(nan_breast_pd.drop(columns=["Class:","Bare Nuclei"]), nan_breast_pd["Class:"])
nan_breast_pd.columns[fitted_breast.tree_.feature] #Order of splits

Index(['Uniformity of Cell Size', 'Bland Chromatin', 'Mitoses', 'Mitoses',
       'Uniformity of Cell Shape', 'Mitoses', 'Mitoses'],
      dtype='object')

In [73]:
fitted_breast.feature_importances_

array([0.        , 0.88020446, 0.07898489, 0.        , 0.        ,
       0.        , 0.04081065, 0.        ])

In [74]:
nan_breast_pd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 10 columns):
Clump Thickness                699 non-null int64
Uniformity of Cell Size        699 non-null int64
Uniformity of Cell Shape       699 non-null int64
Marginal Adhesion              699 non-null int64
Single Epithelial Cell Size    699 non-null int64
Bare Nuclei                    683 non-null object
Bland Chromatin                699 non-null int64
Normal Nucleoli                699 non-null int64
Mitoses                        699 non-null int64
Class:                         699 non-null int64
dtypes: int64(9), object(1)
memory usage: 54.7+ KB


In [77]:
nan_breast_pd["Bare Nuclei"] = nan_breast_pd["Bare Nuclei"].astype(np.float64)
nan_breast_pd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 10 columns):
Clump Thickness                699 non-null int64
Uniformity of Cell Size        699 non-null int64
Uniformity of Cell Shape       699 non-null int64
Marginal Adhesion              699 non-null int64
Single Epithelial Cell Size    699 non-null int64
Bare Nuclei                    683 non-null float64
Bland Chromatin                699 non-null int64
Normal Nucleoli                699 non-null int64
Mitoses                        699 non-null int64
Class:                         699 non-null int64
dtypes: float64(1), int64(9)
memory usage: 54.7 KB


ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

## Problem 3
Load the **Breast Cancer Wisconsin (Diagnostic)** sample dataset from the **UCI Machine  Learning  Repository**  (The continuous version  at: **wdbc.data**)  into Python using  a  Pandas  dataframe.   Induce  the  same  binary  Decision  Treeas above (now using the continuous data) but perform a PCA dimensionality reduction beforehand.  Using only the first principal component of the data fora model fit, what is the F1, Precision, and Recall of the PCA-based single factor model compared to the original (continuous) data?  Repeat using the first andsecond principal components.  Using the Confusion Matrix, what are the valuesfor  FP  and  TP  as  well  as  FPR/TPR?  Is  using  continuous  data  in  this  casebeneficial within the model?  How?

## Problem 4
Simulate a binary classification dataset with a single feature using a mixture of normal distributions with NumPy(Hint:  Generate two data frames with the random  number  and  a  class  label,  and  combine  them  together).   The  normal distribution  parameters  **np.random.normal**  should  be  (5,2)  and  (-5,2)  for the pair of samples.  Induce a binary Decision Tree of maximum depth 2, and obtain the threshold value for the feature in the first split.  How does this value compare to the empirical distribution of the feature?