In [1]:
## Step 0. Import required libraries.
import mysql.connector
import os
import pandas as pd
import numpy as np
from bokeh.plotting import figure
from bokeh.charts import Bar
from bokeh.io import show, output_notebook
from bokeh.models import Legend, Range1d, HoverTool, ColumnDataSource, NumeralTickFormatter
from bokeh.palettes import Spectral5
from sklearn import tree, base
from sklearn.tree import export_graphviz
from sklearn.feature_extraction import DictVectorizer
from collections import defaultdict
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
from IPython.display import HTML

In [2]:
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
The raw code for this IPython notebook is by default hidden for easier reading.
To toggle on/off the raw code, click <a href="javascript:code_toggle()">here</a>.''')

In [3]:
## SECTION 1. Which is the distribution of grades for all product types.

## Step 1. Read the server credentials from the settings file.
filePath = os.path.join(os.getcwd(), 'Settings')
with open(os.path.join(filePath,'settingsSQL.txt'), 'r') as settingsFile:
    content = settingsFile.readlines()
content = [item.strip().split(' = ')[1] for item in content]
## Step 2. Connection to the database.
conn = mysql.connector.connect(host = content[0],
                               user = content[1],
                               password = content[2],
                               db= content[3])
cursor = conn.cursor()
## Step 3. Query to the database.
cursor.execute(r'SELECT * FROM product \
                INNER JOIN product_type ON product.product_type_idproduct_type = product_type.idproduct_type\
                INNER JOIN process_history ON product.idproduct = process_history.product_idproduct \
                INNER JOIN visual_inspection ON process_history.idprocess = visual_inspection.process_history_idprocess;')
rows = cursor.fetchall()
connectionClosed = cursor.close()

In [4]:
## Step 4. Construct a dataframe with the results.
columnNames = ['idproduct', 'br_number_idbr_number', 'manufacturer_idmanufacturer', 'product_type_idproduct_type', 
               'box_idbox', 'model_idmodel', 'serialnumber', 'decomm_id', 'customer_ref', 'asset_tag', 'processType', 
               'estimated_price', 'sold_to', 'warranty', 'finished_processing', 'value', 'pallet_id', 'idproduct_type', 
               'type', 'idprocess', 'start_time', 'end_time', 'station_idstation', 'product_idproduct', 
               'process_types_idprocess_types', 'pass_fail', 'estimated_process_cost', 'box_id', 'idvisual_inspection', 
               'process_history_idprocess', 'grade', 'comments']
rawDataFrame = pd.DataFrame(list(rows), columns = columnNames)
## Step 5. Remove unnecesary columns.
rawDataFrame.drop(labels=['product_type_idproduct_type', 'box_idbox', 'serialnumber', 'decomm_id',
                         'customer_ref', 'asset_tag', 'estimated_price', 'pallet_id', 'idproduct_type',
                         'idproduct', 'idvisual_inspection', 'process_history_idprocess',
                         'estimated_process_cost'],
                 axis = 1, inplace = True)
## Step 6. Rename remaining columns.
rawDataFrame.rename(columns={'br_number_idbr_number':'batchNumber', 'manufacturer_idmanufacturer':'manufacturerId',
                            'model_idmodel':'modelId', 'processType':'processTypePath', 'sold_to':'soldTo',
                            'warranty':'warranty', 'finished_processing':'finishProcessing','value':'value',
                            'type':'productType','idprocess':'processId', 'start_time':'startTime', 'end_time':'endTime',
                            'station_idstation':'stationId', 'process_types_idprocess_types':'processType',
                            'pass_fail':'passFail','box_id':'boxId', 'grade':'grade', 'comments':'comments',
                             'product_idproduct':'productId'},
                   inplace = True)

In [5]:
## Step 7. Change the label column to categorical data.
rawDataFrame['grade'] = rawDataFrame['grade'].astype('category')
## Step 8. Distribution of grades per product type.
def gradeDistribution(gradeColumn):
    """
    This function gets the grade distribution for each grade type.

    Parameters
    ----------
    gradeColumn : List
        The grade for each product for which the distribution is calculated.

    Returns
    -------
    resultDataFrame : Dataframe
        Dataframe with the distribution of grades as a count and also as a percentage.
    """
    ## Step 1. Define the possible grade values.
    grades = ['C', 'B', 'A', 'Z', 'D', 'A*', 'E']
    ## Step 2. For each value get the number of times the grade is seen in numbers and in percentage.
    countGrades = []
    percentageGrades = []
    for grade in grades:
        temp = len(gradeColumn[gradeColumn == grade])
        countGrades.append(temp)
    totalProducts = sum(countGrades)
    percentageGrades = [round(gradeCount / totalProducts, 2) if totalProducts > 0 else 0 for gradeCount in countGrades]
    resultDataFrame = pd.DataFrame({'countGrades':countGrades, 'percentageGrades':percentageGrades}, index = grades)
    ## Step 3. Return as a dataframe.
    return resultDataFrame
dataFrameGrouped = rawDataFrame.groupby('productType')['grade']
productTypeGrades = dataFrameGrouped.apply(func=gradeDistribution)

In [6]:
## Step 9. Histogram of product type grades.
## Step 9.1 Bar plot.
output_notebook()
plotDataFrame = productTypeGrades.reset_index()
plotDataFrame.rename(columns={'level_1':'gradeType'}, inplace=True)
## Step 9.2. Add color information for each product type.
plotDataFrame.loc[plotDataFrame[plotDataFrame['productType'] == 'base station'].index, 'colourInformation'] = 'ligthcoral'
plotDataFrame.loc[plotDataFrame[plotDataFrame['productType'] == 'laptop'].index, 'colourInformation'] = 'seagreen'
plotDataFrame.loc[plotDataFrame[plotDataFrame['productType'] == 'mobile phone'].index, 'colourInformation'] = 'royalblue'
plotDataFrame.loc[plotDataFrame[plotDataFrame['productType'] == 'monitor'].index, 'colourInformation'] = 'orange'
plotDataFrame.loc[plotDataFrame[plotDataFrame['productType'] == 'printer'].index, 'colourInformation'] = 'crimson'
plotDataFrame.loc[plotDataFrame[plotDataFrame['productType'] == 'router'].index, 'colourInformation'] = 'khaki'
plotDataFrame.loc[plotDataFrame[plotDataFrame['productType'] == 'server'].index, 'colourInformation'] = 'grey'
## Step 9.3. Change the notation for the A* to A0.
plotDataFrame.loc[plotDataFrame[plotDataFrame['gradeType'] == 'A*'].index, 'gradeType'] = 'A0'
## Step 9.4. Setup data source.
figurePercentageGrades = Bar(data=plotDataFrame, label='productType', values='percentageGrades', stack='gradeType',
                            legend='top_right', width = 700, height=600, tools='hover, save')
figurePercentageGrades.title.text = 'Grade type'
figurePercentageGrades.xaxis.axis_label = 'Product'
figurePercentageGrades.yaxis.axis_label = 'Ratio'
figurePercentageGrades.xaxis.axis_label_text_font_size = '20pt'
figurePercentageGrades.yaxis.axis_label_text_font_size = '20pt'
figurePercentageGrades.title.align = 'center'
figurePercentageGrades.title.text_font_size = '16pt'
figurePercentageGrades.xaxis.major_label_text_font_size = '12pt'
figurePercentageGrades.yaxis.major_label_text_font_size = '12pt'
figurePercentageGrades.legend.orientation='horizontal'
figurePercentageGrades.y_range = Range1d(0, 1.2)
hover = figurePercentageGrades.select(dict(type=HoverTool))
hover.tooltips = [('Cumulative ratio','@y{1.11}'), ('Grade','@gradeType')]
show(figurePercentageGrades)

In [7]:
## SECTION 2. Which are the grades that end up being scrapped.
## Step 1. Retrieve information from database for products with process type equal to strip and scrap (8) or sold for destruction.
cursor = conn.cursor()
## Step 1.1. Query to the database.
cursor.execute(r"SELECT DISTINCT(idproduct) FROM product \
                 INNER JOIN process_history ON product.idproduct = process_history.product_idproduct \
                 WHERE product.sold_to LIKE '%destruction%' OR process_types_idprocess_types = 8;")
rows = cursor.fetchall()
connectionClosed = cursor.close()
## Step 2. Name columns.
columnNames = ['productId']
tempDataFrame = pd.DataFrame(list(rows), columns = columnNames)
## Step 3. Merge dataframes by productId.
tempDataFrame = pd.merge(rawDataFrame, tempDataFrame, on='productId')

In [8]:
## Step 4. Group by product.
dataFrameGrouped = tempDataFrame.groupby('productType')['grade']
productTypeGradesScrapped = dataFrameGrouped.apply(func=gradeDistribution)

In [9]:
## Step 5. Histogram of product type grades.
## Step 5.1 Bar plot.
plotDataFrame = productTypeGradesScrapped.reset_index()
plotDataFrame.rename(columns={'level_1':'gradeType'}, inplace=True)
## Step 5.2. Remove the data from grades Z or with no percentageGrades count.
plotDataFrame = plotDataFrame[(plotDataFrame['percentageGrades'] > 0.0) & (plotDataFrame['gradeType'] != 'Z')]
## Step 5.3. Add colour information to focus only on products with more than 5%.
plotDataFrame.loc[plotDataFrame[plotDataFrame['percentageGrades'] >= 0.05].index,'colourInformation'] = 'lightcoral'
plotDataFrame.loc[plotDataFrame[plotDataFrame['percentageGrades'] < 0.05].index,'colourInformation'] = 'grey'
## Step 5.3.1. Add colour information for each grade.
plotDataFrame.loc[plotDataFrame[plotDataFrame['gradeType'] == 'A0'].index,'colourGradeInformation'] = "green"
plotDataFrame.loc[plotDataFrame[plotDataFrame['gradeType'] == 'A'].index,'colourGradeInformation'] = "blue"
plotDataFrame.loc[plotDataFrame[plotDataFrame['gradeType'] == 'B'].index,'colourGradeInformation'] = "black"
plotDataFrame.loc[plotDataFrame[plotDataFrame['gradeType'] == 'C'].index,'colourGradeInformation'] = "red"
plotDataFrame.loc[plotDataFrame[plotDataFrame['gradeType'] == 'D'].index,'colourGradeInformation'] = "grey"
plotDataFrame.loc[plotDataFrame[plotDataFrame['gradeType'] == 'E'].index,'colourGradeInformation'] = "orange"
plotDataFrame.loc[plotDataFrame[plotDataFrame['gradeType'] == 'Z'].index,'colourGradeInformation'] = "khaki"
## Step 5.4. Add tooltip values.
plotDataFrame['percentageGradesValues'] = plotDataFrame['percentageGrades'].apply(lambda x:str(round(x * 100, 1)))
## Step 5.5. Set the data source.
# sourceData = ColumnDataSource(data=dict(x=plotDataFrame['productType'],
#                                        y=plotDataFrame['percentageGrades'],
#                                        fill_color=plotDataFrame['colourInformation'],
#                                        gradeType=plotDataFrame['gradeType'],
#                                        percentageGradesValues=plotDataFrame['percentageGradesValues']))
sourceData = ColumnDataSource(data=dict(x=plotDataFrame['productType'],
                                       y=plotDataFrame['percentageGrades'],
                                       fill_color=plotDataFrame['colourGradeInformation'],
                                       gradeType=plotDataFrame['gradeType'],
                                       percentageGradesValues=plotDataFrame['percentageGradesValues']))

hoverInfo = HoverTool(tooltips=[
    ('Grade', '@gradeType'),
    ('%Grade', '@percentageGradesValues')
])

figureScrappedPercentageGrades = figure(plot_height=600, plot_width=700, title="Products scrapped", tools='reset, box_select, save, box_zoom, pan',
             x_axis_label = "Product", y_axis_label = "Percentage", x_range=list(plotDataFrame['productType'].unique()))
figureScrappedPercentageGrades.yaxis[0].formatter = NumeralTickFormatter(format="0.0%")
figureScrappedPercentageGrades.xaxis.axis_label_text_font_size = '20pt'
figureScrappedPercentageGrades.yaxis.axis_label_text_font_size = '20pt'
figureScrappedPercentageGrades.title.align = 'center'
figureScrappedPercentageGrades.title.text_font_size = '16pt'
figureScrappedPercentageGrades.xaxis.major_label_text_font_size = '12pt'
figureScrappedPercentageGrades.yaxis.major_label_text_font_size = '12pt'
# figureScrappedPercentageGrades.circle(source=sourceData, x='x', y='y', fill_color='fill_color',
#                                     size=10, fill_alpha=0.6, line_color=None)
figureScrappedPercentageGrades.circle(source=sourceData, x='x', y='y', fill_color='fill_color',
                                    size=10, fill_alpha=0.6, line_color=None, legend='gradeType')
figureScrappedPercentageGrades.add_tools(hoverInfo)
show(figureScrappedPercentageGrades)

In [10]:
## SECTION 3. Where in the process are the products being removed from processing. 
## Why are the products being removed, what are the comments noted on them.
## Step 1. Remove the entries for products which have grade Z.
tempDataFrame = tempDataFrame[tempDataFrame['grade'] != 'Z']
productIds = tempDataFrame['productId'].values
inClauseIds = ",".join(map(str, productIds))
## Step 2. Retrieve process information for products of interest.
cursor = conn.cursor()
## Step 2.1. Query to the database.
cursor.execute(r"SELECT idprocess, end_time, product_idproduct, process_types_idprocess_types FROM process_history WHERE product_idproduct IN (%s) ORDER BY end_time DESC;" %inClauseIds)
rows = cursor.fetchall()
connectionClosed = cursor.close()
## Step 2.2 Name columns.
columnNames = ['processId', 'endTime', 'productId', 'processType']
tempDataFrameDetails = pd.DataFrame(list(rows), columns = columnNames)

In [11]:
## Step 3. Retrieve the last process for each product which is not process type 8.
def lastProcess(processDetailsDataFrame):
    """
    This function gets the last process for a product which before it was scrapped.

    Parameters
    ----------
    processDetailsDataFrame : Dataframe
        Process details dataframe for a product.

    Returns
    -------
    resultDataFrame : Dataframe
        Dataframe with the last process type for the product.
    """
    processDetailsDataFrame = processDetailsDataFrame.copy()
    processDetailsDataFrame = processDetailsDataFrame[processDetailsDataFrame['processType'] != 8]
    processDetailsDataFrame.sort_values(by=['endTime'], axis=0, ascending=False, inplace=True)
    processDetailsDataFrame.drop(labels=['processId', 'endTime'], axis=1, inplace=True)
    return processDetailsDataFrame.iloc[:1, :]

def ratioLastProcess(processSeries):
    """
    This function gets the ratio for each the last processes in the provided column.

    Parameters
    ----------
    processSeries : Series
        Last process series.

    Returns
    -------
    resultDataFrame : Dataframe
        Dataframe with the last process ratio.
    """
    ## Step 1. Get the count values. 
    countSeries = processSeries.value_counts(normalize=False).sort_values()
    countSeries.rename('count', inplace=True)
    ## Step 2. Get the normalised values.
    normalisedCountSeries = value_counts(normalize=True).sort_values()
    normalisedCountSeries.rename('ratio', inplace=True)
    ## Step 3. Create the resulting dataframe.
    resultsDataFrame = pd.concat([countSeries, normalisedCountSeries], axis=1)
    return resultsDataFrame    
    

## Step 3.1. Get last process for each product.
dataFrameGrouped = tempDataFrameDetails.groupby('productId')
lastProcessScrappedProducts = dataFrameGrouped.apply(func=lastProcess)
lastProcessScrappedProducts.reset_index(drop=True, inplace=True)
lastProcessScrappedProducts.rename(columns={'processType':'lastProcess'}, inplace=True)

## Step 3.2. Join the scrapped product dataframe and the last process for each product.
tempDataFrame = pd.merge(tempDataFrame, lastProcessScrappedProducts, on='productId')

In [12]:
# Step 3.3. Generate the last process count dataframe dataframe.
lastProcessDataframe = tempDataFrame[['productType', 'lastProcess']]
lastProcessDataframe = tempDataFrame.groupby(by='productType')['lastProcess']
## Step 3.4 Get the last process count.
lastProcessDataframe = lastProcessDataframe.value_counts(normalize=False)
lastProcessDataframe = lastProcessDataframe.reset_index(level = 'productType')
lastProcessDataframe.rename(columns={'lastProcess':'count'}, inplace=True)
lastProcessDataframe.reset_index(inplace=True)
lastProcessDataframe.sort_values(by=['count'], inplace=True)
## Step 3.5. Add plot details to dataframe.
plotDataFrame = lastProcessDataframe.copy()
SIZES = list(range(8, 3 * len(set(plotDataFrame['count'])) + 8, 3))
COLORS = Spectral5
tempList = list(set(plotDataFrame['count']))
tempList.sort()
groupsCount = pd.qcut(tempList, len(COLORS))
coloursAvailable = [COLORS[xx] for xx in groupsCount.codes]

for details in zip(tempList, SIZES):
    plotDataFrame.loc[plotDataFrame[plotDataFrame['count'] == details[0]].index,'sizeInformation'] = details[1]
for details in zip(tempList, coloursAvailable):
    plotDataFrame.loc[plotDataFrame[plotDataFrame['count'] == details[0]].index,'colourInformation'] = details[1]
    
tempList = list(plotDataFrame['lastProcess'].unique())
tempList.sort()
tempList = list(map(lambda x:str(x), tempList))

plotDataFrame['lastProcess'] = plotDataFrame['lastProcess'].apply(lambda x:str(x))
    
## Step 3.56. Set the data source.
sourceData = ColumnDataSource(data=dict(x=plotDataFrame['lastProcess'],
                                       y=plotDataFrame['productType'],
                                       fill_color=plotDataFrame['colourInformation'],
                                       size=plotDataFrame['sizeInformation'],
                                       count=plotDataFrame['count']
                                       ))

hoverCount = HoverTool(tooltips=[
    ('Count', '@count')
])

figureLastProcess = figure(plot_height=600, plot_width=700, title="Last process before scrapping", tools='reset, box_select, save, box_zoom, pan',
             x_axis_label = "Last process", y_axis_label = "Product type", x_range = tempList,
                          y_range=list(plotDataFrame['productType'].unique()), background_fill_color = '#2F2F2F')
figureLastProcess.xaxis.axis_label_text_font_size = '20pt'
figureLastProcess.yaxis.axis_label_text_font_size = '20pt'
figureLastProcess.title.align = 'center'
figureLastProcess.title.text_font_size = '16pt'
figureLastProcess.xaxis.major_label_text_font_size = '12pt'
figureLastProcess.yaxis.major_label_text_font_size = '12pt'
figureLastProcess.xgrid.grid_line_dash = [6, 4]
figureLastProcess.ygrid.grid_line_dash = [6, 4]
figureLastProcess.circle(source=sourceData, x='x', y='y', fill_color='fill_color',
                            size='size', fill_alpha=0.9, line_color='white', hover_color='white', hover_alpha=0.5)
figureLastProcess.add_tools(hoverCount)
show(figureLastProcess)


In [13]:
## SECTION 3.1. Focus on comments for process 7. (NEEDS CHECKING)
# commentsDataFrame = tempDataFrame[tempDataFrame['lastProcess'] == 7][['productId', 'processId']]
# processIds = commentsDataFrame['processId'].values
# inClauseIds = ",".join(map(str, processIds))
# ## Step 3.1.1. Retrieve reason for fail for products of interest.
# cursor = conn.cursor()
# ## Step 3.1.2. Query to the database.
# cursor.execute(r"SELECT process_history_idprocess, reason_for_fail FROM functional_test WHERE process_history_idprocess IN (%s);" %inClauseIds)
# rows = cursor.fetchall()
# connectionClosed = cursor.close()
# rows

In [14]:
class DictEncoderSingleValues(base.BaseEstimator, base.TransformerMixin):
    """
    This class is used before applying the dict vectoriser for one hot encoding.
    It should be used for information with a single value on each entry otherwise
    the transform method will fail.
    """
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        # X will come in as a list.  Return a list of
        # dictionaries corresponding to observed single id. Each element in the list is an entry.
        ## Step 1. Move accross the outermost list. Use index 0 to retrieve the list with category information.
        resultsList = []
        for item in X:
            resultDictionary = defaultdict(int)
            resultDictionary[item] = 1
            resultsList.append(resultDictionary)
        return resultsList

In [15]:
## SECTION 4. Based on the model, the grade given, the product type and the comment will the product be scrapped.
## Step 1. Create the ml dataframe.
## Value 1 on final represents scrapped.
mlDataframe = rawDataFrame.copy()[['manufacturerId', 'modelId', 'productType', 'grade', 'productId']]
mlDataframe['final'] = 0
mlDataframe.loc[mlDataframe['productId'].isin(tempDataFrame['productId'].values), 'final'] = 1
results = mlDataframe['final'].values
## Step 2. Remove the output column.
mlDataframe.drop(axis=1, labels=['productId'], inplace=True)
mlDataframe.drop(axis=1, labels=['final'], inplace=True)
## Step 3. Hot one encoding.
mlDataframe['manufacturerId'] = mlDataframe['manufacturerId'].apply(lambda x:str(x))
mlDataframe['modelId'] = mlDataframe['modelId'].apply(lambda x:str(x))
tempDictEncoder = DictEncoderSingleValues()
dictVect = DictVectorizer(sparse=False)
manufacturerHotOneEncoding = tempDictEncoder.transform(X=mlDataframe['manufacturerId'])
manufacturerHotOneEncoding = dictVect.fit_transform(manufacturerHotOneEncoding)
modelIdHotOneEncoding = tempDictEncoder.transform(X=mlDataframe['modelId'])
modelIdHotOneEncoding = dictVect.fit_transform(modelIdHotOneEncoding)
productTypeHotOneEncoding = tempDictEncoder.transform(X=mlDataframe['productType'])
productTypeHotOneEncoding = dictVect.fit_transform(productTypeHotOneEncoding)
gradeHotOneEncoding = tempDictEncoder.transform(X=mlDataframe['grade'])
gradeHotOneEncoding = dictVect.fit_transform(gradeHotOneEncoding)
mlDataFrameEncoded = np.concatenate((manufacturerHotOneEncoding, modelIdHotOneEncoding, productTypeHotOneEncoding, gradeHotOneEncoding), axis = 1)

In [16]:
## Step 4. Decision tree model.
## Step 4.1. Split data for train and test subsets. Shuffle data to prevent capturing products from bad batches.
indices = np.random.permutation(range(len(mlDataFrameEncoded))) ## Random permutation of numbers to shuffle the data.
dataRandomOrder, resultsRandomOrder = mlDataFrameEncoded[indices], results[indices]
## Step 4.2. Fit model with decision tree using grid search.
X_train, X_test, y_train, y_test = train_test_split(dataRandomOrder, resultsRandomOrder, test_size = 0.3, random_state=42)
clf = tree.DecisionTreeClassifier()
parameters = dict(max_depth=range(1,10), max_features=range(1, 10))
modelScrapped = GridSearchCV(estimator=clf, param_grid=parameters, cv=5, n_jobs=2, scoring='neg_mean_squared_error')
modelScrapped.fit(X_train, y_train)
# clf.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=2,
       param_grid={'max_depth': range(1, 10), 'max_features': range(1, 10)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_squared_error', verbose=0)

In [20]:
## Step 4.3. Predict and get metrics.
print(modelScrapped.best_params_)
y_pred = modelScrapped.predict(X_test)
# y_pred = clf.predict(X_test)
targetNames = ['Not scrapped', 'Scrapped']
print(classification_report(y_true=y_test, y_pred=y_pred, target_names=targetNames))
print(round(accuracy_score(y_true=y_test, y_pred=y_pred), 2))

{'max_depth': 8, 'max_features': 4}
              precision    recall  f1-score   support

Not scrapped       0.96      1.00      0.98      9426
    Scrapped       0.00      0.00      0.00       421

 avg / total       0.92      0.96      0.94      9847

0.96


  'precision', 'predicted', average, warn_for)


In [21]:
## Step 4.4. Visualisation of the tree.
## Step 4.1. Name of columns.
featureNames = []

def featureNamesIndex(column):
    tempList = []
    for item in column:
        if item not in tempList:
            tempList.append(item)
    return tempList
        
featureNames.append(featureNamesIndex(mlDataframe['manufacturerId']))
featureNames.append(featureNamesIndex(mlDataframe['modelId']))
featureNames.append(featureNamesIndex(mlDataframe['productType']))
featureNames.append(featureNamesIndex(mlDataframe['grade']))
featureNames = [y for x in featureNames for y in x]

## Step 4.2. Graph the tree.
dot_data = export_graphviz(decision_tree=modelScrapped.best_estimator_, feature_names=featureNames, filled = True, rounded = True,
               out_file = 'tree.dot')