<a href="https://colab.research.google.com/github/HuangruiChu/ECON211/blob/main/ESG_DecisionTree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step 1: Data Wrangling

In [None]:
import pandas as pd 
import numpy as np
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn import metrics 
# Placeholder estimator with default hyperparameters; `clf` is re-created with
# explicit settings (criterion, max_depth, random_state) in the training cell below.
clf = DecisionTreeClassifier()

In [None]:
# Disable pandas' display truncation: show every column and every row.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [None]:
# Load the household survey data from the CSV sitting next to the notebook.
df = pd.read_csv(filepath_or_buffer='Family Income and Expenditure.csv')

# Preview the first five rows.
df.head(n=5)

In [None]:
# Share of household income spent on food (food expenditure / household income).
df["percent"] = df['Total Food Expenditure'].div(df['Total Household Income'])

In [None]:
# Split the food-share values into three equal-sized quantile buckets coded 1, 2, 3.
tercile_labels = np.arange(1, 4)
df["Return"] = pd.qcut(df["percent"], q=3, labels=tercile_labels)

In [None]:
# Preview the frame to check the newly added "percent" and "Return" columns.
df.head()

In [None]:
def _rank_from_tercile(tercile):
    """Translate a tercile code into a label: 3 -> 'High', 2 -> 'Average', anything else -> 'Low'."""
    if tercile == 3:
        return 'High'
    if tercile == 2:
        return 'Average'
    return 'Low'


df['Rank'] = df['Return'].apply(_rank_from_tercile)

In [None]:
# Preview the frame to check the newly added "Rank" column.
df.head()


# Step 2: Build Decision Tree

In [None]:
# Model inputs: a single predictor column (household income) and the Rank label.
feature_names = ['Total Household Income']
targets = df['Rank']
features = df[feature_names]

# Distinct rank labels, in order of first appearance in the data.
targets_names = targets.unique()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
train_features, test_features, train_targets, test_targets = train_test_split(
    features,
    targets,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn import metrics
from sklearn.metrics import accuracy_score

In [None]:
##### Optimizing Decision Tree Performance
# Create the Decision Tree classifier: entropy splitting criterion, depth capped at 2,
# fixed random_state so the fitted tree is reproducible.
clf = DecisionTreeClassifier(criterion='entropy', max_depth=2, random_state=0)

# Train the classifier on the training split (fit returns the estimator itself).
clf = clf.fit(train_features, train_targets)

# Predict the rank for the held-out test rows.
y_pred = clf.predict(test_features)

# Compute test set accuracy. accuracy_score takes (y_true, y_pred) in that order;
# accuracy is symmetric, but keeping the documented order avoids silent errors
# if the metric is later swapped for an asymmetric one (precision, recall, ...).
acc = accuracy_score(test_targets, y_pred)
print("Test set accuracy: {:.2f}".format(acc))

In [None]:
from sklearn.tree import export_graphviz
# sklearn.externals.six was removed in scikit-learn 0.23; the standard library
# provides the same StringIO class.
from io import StringIO
from IPython.display import Image
import pydotplus

In [None]:
# Render the fitted tree with Graphviz, save it as ESG.png and display it inline.
# export_graphviz expects class_names in ascending order of the class labels,
# i.e. the order of clf.classes_. Taking the names from the fitted model keeps
# every node's "class = ..." caption aligned with the class it actually predicts.
class_labels = [str(label) for label in clf.classes_]
dot_data = export_graphviz(clf, out_file=None,
                filled=True, rounded=True,
                special_characters=True, feature_names=feature_names, class_names=class_labels)
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_png('ESG.png')
Image(graph.create_png())

# K-Fold cross-validation version

In [None]:
# Predict the rank for one household income value. The model was fitted on a
# DataFrame with a named column, so the query is passed in the same form to keep
# the feature name attached (a bare nested list triggers a feature-name warning
# on recent scikit-learn versions).
clf.predict(pd.DataFrame([[1777900]], columns=feature_names))

In [None]:
from sklearn.model_selection import KFold

# Compare tree depths with 10-fold cross-validation carried out entirely inside
# the training split; the held-out test split is not used for model selection.
cv = KFold(n_splits=10)            # Desired number of Cross Validation folds
accuracies = list()
depth_range = range(1, 4)

# Testing max_depths from 1 to 3
for depth in depth_range:
    fold_accuracy = []
    tree_model = DecisionTreeClassifier(criterion='entropy', max_depth=depth, random_state=0)
    for train_index, valid_index in cv.split(train_features):
        # KFold.split yields positional indices (0..n-1), while the training split
        # keeps the shuffled row labels of the original frame, so rows must be
        # selected by position (.iloc), not by label (.loc).
        f_train_X = train_features.iloc[train_index]  # fold training rows
        f_train_y = train_targets.iloc[train_index]
        f_valid_X = train_features.iloc[valid_index]  # fold validation rows
        f_valid_y = train_targets.iloc[valid_index]

        # Fit on the fold's training part and score on its validation part.
        model = tree_model.fit(X=f_train_X, y=f_train_y)
        valid_acc = model.score(X=f_valid_X, y=f_valid_y)
        fold_accuracy.append(valid_acc)

    # Mean validation accuracy over the folds for this depth.
    avg = sum(fold_accuracy) / len(fold_accuracy)
    accuracies.append(avg)

# Summarise the results in their own frame so the raw data in `df` is not overwritten
# and the earlier cells stay re-runnable.
cv_results = pd.DataFrame({"Max Depth": depth_range, "Average Accuracy": accuracies})
cv_results = cv_results[["Max Depth", "Average Accuracy"]]
print(cv_results.to_string(index=False))