In [1]:
# Gets the preprocessed data set for Organics.
import casestudy_tools as tools
# preprocess() is a project helper whose internals are not visible in this notebook.
# The recorded output of this cell shows the result is a pandas DataFrame with
# 22223 rows and 37 columns, and no nulls in the columns listed.
df = tools.preprocess()
# Print the column / dtype / non-null summary as a sanity check of the preprocessing result.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22223 entries, 0 to 22222
Data columns (total 37 columns):
AGE                    22223 non-null float64
BILL                   22223 non-null float64
ORGYN                  22223 non-null int64
AFFL                   22223 non-null int64
LTIME                  22223 non-null float64
GENDER_F               22223 non-null uint8
GENDER_M               22223 non-null uint8
GENDER_U               22223 non-null uint8
TV_REG_Border          22223 non-null uint8
TV_REG_C Scotland      22223 non-null uint8
TV_REG_East            22223 non-null uint8
TV_REG_London          22223 non-null uint8
TV_REG_Midlands        22223 non-null uint8
TV_REG_N East          22223 non-null uint8
TV_REG_N Scot          22223 non-null uint8
TV_REG_N West          22223 non-null uint8
TV_REG_S & S East      22223 non-null uint8
TV_REG_S West          22223 non-null uint8
TV_REG_Ulster          22223 non-null uint8
TV_REG_Wales & West    22223 non-null uint8
TV_RE

In [14]:
# Building a decision tree using the default settings.
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score

# Target variable: the ORGYN column.
target_dataset = df['ORGYN']
# Predictors: every remaining column. ORGYN is removed so the target itself
# cannot act as a false predictor.
dataset = df.drop(['ORGYN'], axis=1)

# Seed shared by the split and the model; kept consistent throughout the case study.
random_state = 10
# Fraction of rows held out for testing (30% of the total data set).
test_size = 0.3

# Convert the predictors into a NumPy array.
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in pandas 1.0;
# .values yields the same array and is available on both old and new pandas.
dataset_matrix = dataset.values

# Split into train and test sets. stratify=target_dataset keeps the ORGYN class
# proportions the same in both partitions.
dataset_train, dataset_test, target_dataset_train, target_dataset_test = train_test_split(
    dataset_matrix,
    target_dataset,
    test_size=test_size,
    stratify=target_dataset,
    random_state=random_state,
)

# Train a decision tree with default hyper-parameters (only the seed is fixed).
decisiontree_model_def = DecisionTreeClassifier(random_state=random_state)
decisiontree_model_def.fit(dataset_train, target_dataset_train)

# Print train and test accuracy. In the recorded run the train accuracy (~0.999)
# is far above the test accuracy (~0.738), i.e. the unconstrained tree overfits.
print("Default Decision Tree Statistics:")
print("Train Acuracy:", decisiontree_model_def.score(dataset_train, target_dataset_train))
print("Test Acuracy:", decisiontree_model_def.score(dataset_test, target_dataset_test))

# Print a per-class classification report for the held-out test set.
print("")
print("Classification Report:")
target_predict = decisiontree_model_def.predict(dataset_test)
print(classification_report(target_dataset_test, target_predict))


Default Decision Tree Statistics:
Train Acuracy: 0.998585754692723
Test Acuracy: 0.7381130943452827

Classification Report:
             precision    recall  f1-score   support

          0       0.83      0.82      0.83      5015
          1       0.47      0.48      0.48      1652

avg / total       0.74      0.74      0.74      6667



In [16]:
# Evaluating the feature importance of the default_decision tree
import numpy as np

# Importance score of every predictor in the fitted default tree, plus the
# matching column labels (same order as the columns the model was trained on).
feature_importances = decisiontree_model_def.feature_importances_
feature_names = dataset.columns

# Positions of the features ordered from most to least important:
# argsort ranks ascending, the reversed slice turns that into descending.
feature_indices = np.argsort(feature_importances)[::-1]

# Report "<feature> : <importance>", highest importance first.
ranked_names = feature_names[feature_indices]
ranked_scores = feature_importances[feature_indices]
for ranked_name, ranked_score in zip(ranked_names, ranked_scores):
    print(ranked_name, ':', ranked_score)

AGE : 0.2998210956388579
AFFL : 0.14745360754933678
BILL : 0.10846261215134576
LTIME : 0.1051955950169931
GENDER_F : 0.04435764128396906
NGROUP_C : 0.020652082351299346
NGROUP_F : 0.020411633405873066
NGROUP_B : 0.018978413418318976
NGROUP_D : 0.01848490123906868
TV_REG_Midlands : 0.01803783161674561
NGROUP_E : 0.017996652764470735
TV_REG_London : 0.016778604651857223
REGION_Midlands : 0.01297957778682568
TV_REG_N West : 0.012567941951689018
TV_REG_S & S East : 0.01242535986993021
CLASS_Silver : 0.012027807687327237
NGROUP_A : 0.011691227817154596
TV_REG_East : 0.010191688076453818
REGION_South East : 0.010181270742274302
TV_REG_Yorkshire : 0.009938115826605554
REGION_North : 0.009078298539291181
TV_REG_Wales & West : 0.009063046816459567
TV_REG_C Scotland : 0.007381700152823672
CLASS_Tin : 0.006914691572999246
CLASS_Gold : 0.006853827232143927
TV_REG_N East : 0.006252132706594195
REGION_Scottish : 0.005118019631301024
TV_REG_N Scot : 0.004112832788036579
GENDER_U : 0.00410620851731903

In [17]:
# Intended to create a png-file of the default decision tree in order to visualise it.
# NOTE(review): the recorded output of this cell is
#   AttributeError: module 'casestudy_tools' has no attribute 'visualize_decision_tree'
# so the imported casestudy_tools module does not expose a helper under this name.
# TODO confirm the correct helper name (or add the helper) before re-running; as
# written this cell fails and no image is produced.
tools.visualize_decision_tree(decisiontree_model_def, feature_names, "Default Decision Tree - Task 2.png")

AttributeError: module 'casestudy_tools' has no attribute 'visualize_decision_tree'