# Imports

In [35]:
import graphviz
import pandas as pd
import cufflinks as cf
from plotly.offline import iplot

from sklearn import tree as t
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split


#Conect plotly and pandas
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

# Load data

In [36]:
cols = ["V" + str(i) for i in range(1,61)]
cols += ["Class"]
df = pd.read_csv("datasets/sonar.all-data.csv", names = cols)

# EDA

In [37]:
df.head(10)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V52,V53,V54,V55,V56,V57,V58,V59,V60,Class
0,0.02,0.0371,0.0428,0.0207,0.0954,0.0986,0.1539,0.1601,0.3109,0.2111,...,0.0027,0.0065,0.0159,0.0072,0.0167,0.018,0.0084,0.009,0.0032,R
1,0.0453,0.0523,0.0843,0.0689,0.1183,0.2583,0.2156,0.3481,0.3337,0.2872,...,0.0084,0.0089,0.0048,0.0094,0.0191,0.014,0.0049,0.0052,0.0044,R
2,0.0262,0.0582,0.1099,0.1083,0.0974,0.228,0.2431,0.3771,0.5598,0.6194,...,0.0232,0.0166,0.0095,0.018,0.0244,0.0316,0.0164,0.0095,0.0078,R
3,0.01,0.0171,0.0623,0.0205,0.0205,0.0368,0.1098,0.1276,0.0598,0.1264,...,0.0121,0.0036,0.015,0.0085,0.0073,0.005,0.0044,0.004,0.0117,R
4,0.0762,0.0666,0.0481,0.0394,0.059,0.0649,0.1209,0.2467,0.3564,0.4459,...,0.0031,0.0054,0.0105,0.011,0.0015,0.0072,0.0048,0.0107,0.0094,R
5,0.0286,0.0453,0.0277,0.0174,0.0384,0.099,0.1201,0.1833,0.2105,0.3039,...,0.0045,0.0014,0.0038,0.0013,0.0089,0.0057,0.0027,0.0051,0.0062,R
6,0.0317,0.0956,0.1321,0.1408,0.1674,0.171,0.0731,0.1401,0.2083,0.3513,...,0.0201,0.0248,0.0131,0.007,0.0138,0.0092,0.0143,0.0036,0.0103,R
7,0.0519,0.0548,0.0842,0.0319,0.1158,0.0922,0.1027,0.0613,0.1465,0.2838,...,0.0081,0.012,0.0045,0.0121,0.0097,0.0085,0.0047,0.0048,0.0053,R
8,0.0223,0.0375,0.0484,0.0475,0.0647,0.0591,0.0753,0.0098,0.0684,0.1487,...,0.0145,0.0128,0.0145,0.0058,0.0049,0.0065,0.0093,0.0059,0.0022,R
9,0.0164,0.0173,0.0347,0.007,0.0187,0.0671,0.1056,0.0697,0.0962,0.0251,...,0.009,0.0223,0.0179,0.0084,0.0068,0.0032,0.0035,0.0056,0.004,R


In [38]:
df.tail(10)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V52,V53,V54,V55,V56,V57,V58,V59,V60,Class
198,0.0238,0.0318,0.0422,0.0399,0.0788,0.0766,0.0881,0.1143,0.1594,0.2048,...,0.0096,0.0071,0.0084,0.0038,0.0026,0.0028,0.0013,0.0035,0.006,M
199,0.0116,0.0744,0.0367,0.0225,0.0076,0.0545,0.111,0.1069,0.1708,0.2271,...,0.0141,0.0103,0.01,0.0034,0.0026,0.0037,0.0044,0.0057,0.0035,M
200,0.0131,0.0387,0.0329,0.0078,0.0721,0.1341,0.1626,0.1902,0.261,0.3193,...,0.015,0.0076,0.0032,0.0037,0.0071,0.004,0.0009,0.0015,0.0085,M
201,0.0335,0.0258,0.0398,0.057,0.0529,0.1091,0.1709,0.1684,0.1865,0.266,...,0.012,0.0039,0.0053,0.0062,0.0046,0.0045,0.0022,0.0005,0.0031,M
202,0.0272,0.0378,0.0488,0.0848,0.1127,0.1103,0.1349,0.2337,0.3113,0.3997,...,0.0091,0.0045,0.0043,0.0043,0.0098,0.0054,0.0051,0.0065,0.0103,M
203,0.0187,0.0346,0.0168,0.0177,0.0393,0.163,0.2028,0.1694,0.2328,0.2684,...,0.0116,0.0098,0.0199,0.0033,0.0101,0.0065,0.0115,0.0193,0.0157,M
204,0.0323,0.0101,0.0298,0.0564,0.076,0.0958,0.099,0.1018,0.103,0.2154,...,0.0061,0.0093,0.0135,0.0063,0.0063,0.0034,0.0032,0.0062,0.0067,M
205,0.0522,0.0437,0.018,0.0292,0.0351,0.1171,0.1257,0.1178,0.1258,0.2529,...,0.016,0.0029,0.0051,0.0062,0.0089,0.014,0.0138,0.0077,0.0031,M
206,0.0303,0.0353,0.049,0.0608,0.0167,0.1354,0.1465,0.1123,0.1945,0.2354,...,0.0086,0.0046,0.0126,0.0036,0.0035,0.0034,0.0079,0.0036,0.0048,M
207,0.026,0.0363,0.0136,0.0272,0.0214,0.0338,0.0655,0.14,0.1843,0.2354,...,0.0146,0.0129,0.0047,0.0039,0.0061,0.004,0.0036,0.0061,0.0115,M


In [39]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 208 entries, 0 to 207
Data columns (total 61 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   V1      208 non-null    float64
 1   V2      208 non-null    float64
 2   V3      208 non-null    float64
 3   V4      208 non-null    float64
 4   V5      208 non-null    float64
 5   V6      208 non-null    float64
 6   V7      208 non-null    float64
 7   V8      208 non-null    float64
 8   V9      208 non-null    float64
 9   V10     208 non-null    float64
 10  V11     208 non-null    float64
 11  V12     208 non-null    float64
 12  V13     208 non-null    float64
 13  V14     208 non-null    float64
 14  V15     208 non-null    float64
 15  V16     208 non-null    float64
 16  V17     208 non-null    float64
 17  V18     208 non-null    float64
 18  V19     208 non-null    float64
 19  V20     208 non-null    float64
 20  V21     208 non-null    float64
 21  V22     208 non-null    float64
 22  V2

In [40]:
df.duplicated().sum()

0

In [41]:
df.describe()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V51,V52,V53,V54,V55,V56,V57,V58,V59,V60
count,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,...,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0
mean,0.029164,0.038437,0.043832,0.053892,0.075202,0.10457,0.121747,0.134799,0.178003,0.208259,...,0.016069,0.01342,0.010709,0.010941,0.00929,0.008222,0.00782,0.007949,0.007941,0.006507
std,0.022991,0.03296,0.038428,0.046528,0.055552,0.059105,0.061788,0.085152,0.118387,0.134416,...,0.012008,0.009634,0.00706,0.007301,0.007088,0.005736,0.005785,0.00647,0.006181,0.005031
min,0.0015,0.0006,0.0015,0.0058,0.0067,0.0102,0.0033,0.0055,0.0075,0.0113,...,0.0,0.0008,0.0005,0.001,0.0006,0.0004,0.0003,0.0003,0.0001,0.0006
25%,0.01335,0.01645,0.01895,0.024375,0.03805,0.067025,0.0809,0.080425,0.097025,0.111275,...,0.008425,0.007275,0.005075,0.005375,0.00415,0.0044,0.0037,0.0036,0.003675,0.0031
50%,0.0228,0.0308,0.0343,0.04405,0.0625,0.09215,0.10695,0.1121,0.15225,0.1824,...,0.0139,0.0114,0.00955,0.0093,0.0075,0.00685,0.00595,0.0058,0.0064,0.0053
75%,0.03555,0.04795,0.05795,0.0645,0.100275,0.134125,0.154,0.1696,0.233425,0.2687,...,0.020825,0.016725,0.0149,0.0145,0.0121,0.010575,0.010425,0.01035,0.010325,0.008525
max,0.1371,0.2339,0.3059,0.4264,0.401,0.3823,0.3729,0.459,0.6828,0.7106,...,0.1004,0.0709,0.039,0.0352,0.0447,0.0394,0.0355,0.044,0.0364,0.0439


In [42]:
X = df.drop(["Class"], axis=1)

In [78]:
skewedCols = df.skew()
skewedCols = skewedCols[skewedCols.ge(1) | skewedCols.le(-1)]

In [79]:
skewedColsList = skewedCols.index.tolist()
skewedColsList

['V1',
 'V2',
 'V3',
 'V4',
 'V5',
 'V6',
 'V8',
 'V9',
 'V10',
 'V14',
 'V38',
 'V44',
 'V45',
 'V46',
 'V47',
 'V48',
 'V49',
 'V50',
 'V51',
 'V52',
 'V53',
 'V54',
 'V55',
 'V56',
 'V57',
 'V58',
 'V59',
 'V60']

In [81]:
X[skewedColsList].iplot(kind='histogram',subplots=True,bins=50)

# Split data

In [49]:
X_train, X_test, y_train, y_test = train_test_split(X, df["Class"], test_size=0.2, random_state=1)

# Decision tree

## Initialize object

In [50]:
tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)
tree

DecisionTreeClassifier()

## Prediction

In [51]:
y_pred = tree.predict(X_test)
y_pred

array(['R', 'R', 'M', 'R', 'R', 'R', 'R', 'R', 'M', 'R', 'M', 'M', 'M',
       'M', 'M', 'M', 'R', 'M', 'M', 'R', 'M', 'R', 'R', 'R', 'M', 'M',
       'R', 'M', 'R', 'R', 'R', 'M', 'R', 'M', 'R', 'R', 'R', 'R', 'R',
       'M', 'M', 'M'], dtype=object)

In [52]:
y_test

186    M
155    M
165    M
200    M
58     R
34     R
151    M
18     R
202    M
62     R
4      R
47     R
110    M
206    M
105    M
172    M
31     R
198    M
33     R
40     R
175    M
59     R
29     R
11     R
124    M
147    M
35     R
44     R
51     R
171    M
153    M
183    M
28     R
16     R
94     R
78     R
38     R
27     R
69     R
119    M
207    M
191    M
Name: Class, dtype: object

In [53]:
y_test.sort_index()

4      R
11     R
16     R
18     R
27     R
28     R
29     R
31     R
33     R
34     R
35     R
38     R
40     R
44     R
47     R
51     R
58     R
59     R
62     R
69     R
78     R
94     R
105    M
110    M
119    M
124    M
147    M
151    M
153    M
155    M
165    M
171    M
172    M
175    M
183    M
186    M
191    M
198    M
200    M
202    M
206    M
207    M
Name: Class, dtype: object

In [54]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           M       0.74      0.70      0.72        20
           R       0.74      0.77      0.76        22

    accuracy                           0.74        42
   macro avg       0.74      0.74      0.74        42
weighted avg       0.74      0.74      0.74        42



# Generate tree image

In [55]:
dot_data = t.export_graphviz(tree, out_file = None,
                            feature_names = X_test.columns,
                            class_names = ["Metal","Rock"],
                            filled = True, rounded = True,
                            special_characters = True
                            )

In [56]:
graph = graphviz.Source(dot_data, filename = "tree_output", format = "png")

In [57]:
graph.render()

'tree_output.png'