# Sources

1. https://scikit-learn.org/stable/index.html
2. https://scikit-learn.org/stable/modules/tree.html
3. https://www.simplilearn.com/tutorials/scikit-learn-tutorial/sklearn-decision-trees
4. https://scikit-learn.org/stable/modules/ensemble.html#random-forests
5. https://scikit-learn.org/stable/modules/linear_model.html 

# Decision Tree

##### Importing the Dataset

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import pyodbc
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn import metrics
from sklearn.datasets import load_iris
from sklearn.tree import export_text, DecisionTreeClassifier
from sklearn.model_selection import train_test_split




# Open the go_sales_schoon Access database through the Access ODBC driver.
conn = pyodbc.connect(r'Driver={Microsoft Access Driver (*.mdb, *.accdb)};DBQ=..\..\Data\go_sales_schoon.accdb;')

# NOTE: Access SQL requires each additional JOIN to be wrapped in its own pair
# of parentheses, which is why the FROM clause opens with nested brackets.
sql = """
SELECT *
FROM (((((returned_item
INNER JOIN return_reason
ON returned_item.RETURN_REASON_CODE = return_reason.RETURN_REASON_CODE)
INNER JOIN order_details
ON returned_item.ORDER_DETAIL_CODE = order_details.ORDER_DETAIL_CODE)
INNER JOIN order_header
ON order_header.ORDER_NUMBER = order_details.ORDER_NUMBER)
INNER JOIN sales_sales_branch
ON sales_sales_branch.SALES_BRANCH_CODE = order_header.SALES_BRANCH_CODE)
INNER JOIN product
ON product.PRODUCT_NUMBER = order_details.PRODUCT_NUMBER)
INNER JOIN product_type
ON product_type.PRODUCT_TYPE_CODE = product.PRODUCT_TYPE_CODE
"""
cursor = conn.cursor()
cursor.execute(sql)
data = cursor.fetchall()

# The first item of every cursor.description entry is the column name.
columns = [description[0] for description in cursor.description]

# Transpose the fetched rows into a {column name: list of values} mapping.
# If a column name occurs more than once, the last occurrence wins.
formatted_data = {
    name: [row[position] for row in data]
    for position, name in enumerate(columns)
}

df = pd.DataFrame(data=formatted_data, columns=columns)
df

##### Extracting Datasets

In [None]:
# Target column to predict.
y = df["RETURN_DESCRIPTION_EN"]

# Features: every column of df except the ones excluded below.
# NOTE(review): the original author flagged this exclusion list as
# "a bit cheaty"; the reason is not recorded here.
x = df.drop(
    columns=[
        "RETURN_DESCRIPTION_EN",
        "RETURN_DATE",
        "ORDER_DETAIL_CODE",
        "RETURN_CODE",
        "ORDER_NUMBER",
        "ORDER_DATE",
        "ADDRESS1",
        "ADDRESS2",
        "REGION",
        "RETAILER_NAME",
        "CITY",
        "POSTAL_ZONE",
        "PRODUCT_NAME",
        "PRODUCT_IMAGE",
        "LANGUAGE",
        "PRODUCTION_COST",
        "PRODUCT_TYPE_EN",
        "DESCRIPTION",
        "INTRODUCTION_DATE",
        "RETURN_REASON_CODE",
    ]
)

labels = y.unique()
feature_names = x.columns

# Hold out 40% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.4,
    random_state=42,
)
x

The `max_depth` argument limits how deep the tree is allowed to grow. We set it to reduce the risk of overfitting and to keep the tree simple enough that we can see how the final result was obtained. The `random_state` parameter ensures that the results are reproducible in subsequent runs.

##### Fitting Algorithm to Training Data

In [None]:
# Decision tree limited to a depth of 100; the fixed seed makes repeated fits
# produce the same tree.
clf = DecisionTreeClassifier(
    random_state=42,
    max_depth=100,
)
# Train on the training split only; the bare expression also displays the
# fitted estimator as the cell output.
clf.fit(X_train, y_train)

##### Checking the Algorithms

##### 1. Tree diagram

In [None]:
# Large canvas with a black background so the whole tree can be drawn.
plt.figure(figsize=(50, 50), facecolor='k')

# plot_tree expects class names in the same (ascending) order as clf.classes_.
# y.unique() returns labels in order of first appearance, which can attach the
# wrong class name to a node, so the names are taken from the fitted model.
# Both name arguments are passed as plain lists, matching the export_text cell.
a = tree.plot_tree(
    clf,
    feature_names=list(feature_names),
    class_names=[str(class_name) for class_name in clf.classes_],
    rounded=True,
    filled=True,
    fontsize=14,
)
plt.show()

##### 2. As a Text-Based Diagram

In [None]:
# Render the fitted tree's decision rules as plain text, using the feature
# column names instead of positional feature indices.
tree_rules = export_text(
    clf,
    feature_names=feature_names.tolist(),
)
print(tree_rules)

##### Make and format a confusion matrix

In [None]:
# Predictions of the fitted decision tree on the held-out test split.
test_pred_decision_tree = clf.predict(X_test)


def generateConfusionMatrix(y_test, y_pred, title='Confusion Matrix - Decision Tree'):
    """Plot a labelled confusion-matrix heatmap for a set of predictions.

    Args:
        y_test: True class labels.
        y_pred: Predicted class labels, aligned with ``y_test``.
        title: Title shown above the heatmap. Defaults to the decision-tree
            title so existing two-argument calls keep their output.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Fix the class order once and pass it to confusion_matrix explicitly, so
    # the matrix rows/columns and the tick labels are guaranteed to agree.
    class_labels = sorted(set(y_test) | set(y_pred))
    cm = metrics.confusion_matrix(y_test, y_pred, labels=class_labels)
    matrix_df = pd.DataFrame(cm, index=class_labels, columns=class_labels)

    sns.set(font_scale=1.3)
    # Create the figure and its axes together so the heatmap is drawn on the
    # 10x7 figure instead of on axes belonging to a previously active figure.
    _, ax = plt.subplots(figsize=(10, 7))
    # xticklabels/yticklabels=True forces one tick label per class, taken from
    # the DataFrame's columns and index.
    sns.heatmap(
        matrix_df,
        annot=True,
        fmt="g",
        ax=ax,
        cmap="magma",
        xticklabels=True,
        yticklabels=True,
    )
    ax.set_title(title)
    ax.set_xlabel("Predicted label", fontsize=15)
    ax.tick_params(axis="x", labelrotation=90)
    ax.set_ylabel("True Label", fontsize=15)
    ax.tick_params(axis="y", labelrotation=0)

    plt.show()


generateConfusionMatrix(y_test, test_pred_decision_tree)

# Random Forest

##### Import additional libraries and required data

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [None]:
# Target column to predict.
y = df["RETURN_DESCRIPTION_EN"]

# Features: every column of df except the ones excluded below.
# NOTE(review): the original author flagged this exclusion list as
# "a bit cheaty"; the reason is not recorded here.
x = df.drop(
    columns=[
        "RETURN_DESCRIPTION_EN",
        "RETURN_DATE",
        "ORDER_DETAIL_CODE",
        "RETURN_CODE",
        "ORDER_NUMBER",
        "ORDER_DATE",
        "ADDRESS1",
        "ADDRESS2",
        "REGION",
        "RETAILER_NAME",
        "CITY",
        "POSTAL_ZONE",
        "PRODUCT_NAME",
        "PRODUCT_IMAGE",
        "LANGUAGE",
        "PRODUCTION_COST",
        "PRODUCT_TYPE_EN",
        "DESCRIPTION",
        "INTRODUCTION_DATE",
        "RETURN_REASON_CODE",
    ]
)

##### Fit the model

In [None]:
# Random forest of 10 trees. The fixed seed makes the fit reproducible,
# consistent with the decision tree trained earlier in this notebook.
clf = RandomForestClassifier(n_estimators=10, random_state=42)
# Fit on the training split only: fitting on the full x, y would include the
# rows later used as X_test and inflate the reported metrics.
clf = clf.fit(X_train, y_train)

##### Generate Statistics

In [None]:
# Random-forest predictions for the held-out test split.
y_pred = clf.predict(X_test)

# Reuse the confusion-matrix helper defined for the decision tree.
# NOTE: called this way, the helper titles the plot
# 'Confusion Matrix - Decision Tree' even though these are random-forest results.
generateConfusionMatrix(y_test=y_test, y_pred=y_pred)

In [None]:
# Overall fraction of test rows classified correctly.
accuracy = accuracy_score(y_test, y_pred)
# average="weighted": per-class scores are averaged, weighted by the number of
# true instances of each class.
recall = recall_score(y_test, y_pred, average="weighted")
precision = precision_score(y_test, y_pred, average="weighted")

# Print one "<name> <value>" line per metric, in the original order.
for metric_name, metric_value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
):
    print(metric_name, metric_value)