In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
import itertools
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import fbeta_score, accuracy_score, r2_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor

import visuals as vs
import time

In [2]:
df = pd.read_csv('Dataframe raw.csv', sep = ';')
df_final = pd.read_csv('DataFrame.csv', sep = ';', index_col = 0)

In [3]:
#separation of input, middle and output for the full model

df_input = df_final.iloc[:,1:16].drop(['ES Service'], axis= 1)
df_middle = df_final.iloc[:,16:]

added_to_second = ['DS Discover', 'DS Define','DS Develop', 'DS Deliver', 'ES Product', 'Team size min',
                   'Part Users', 'Part Experts', 'Part Service staff', 'Part Stakeholders']
df_middle_second = pd.concat([df_middle, df_final[added_to_second]], axis = 1)

mapping_names = {name['Name']: index for index, name in df.iterrows()}
df_output = df_final['Name'].map(mapping_names)

# My Model

In [4]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import time

correct_sum = 0
iterations = 100
i = 0

while i < iterations:
    try:
        #first part
        estimators = {}

        for column in df_middle:
            X_train_first, X_test_first, y_train_first, y_test_first = train_test_split(df_input, df_middle[column],
                                                                                    test_size = 0.25, random_state = int(time.time()))

            estimator = GradientBoostingRegressor(learning_rate = 0.4, n_estimators = 25, max_depth = 5)
            estimators.update({column: estimator.fit(X_train_first,y_train_first)})

        predicted_values_first = pd.DataFrame(index = X_test_first.index.values, columns = df_middle.columns.values)

        for _, row in X_test_first.iterrows():
            for feature in df_middle.columns.values:
                new_case = pd.DataFrame([np.array(row)], columns = df_input.columns.values)
                predicted_values_first[feature].loc[row.name] = estimators[feature].predict(new_case)

        #second part
        X_train_second = df_middle_second.iloc[X_train_first.index.values]
        y_train_second = df_output.iloc[y_train_first.index.values]
        X_test_second = pd.concat([predicted_values_first, X_test_first[added_to_second]], axis = 1)
        y_test_second = df_output.iloc[y_test_first.index.values]

        model = LogisticRegression(multi_class = 'multinomial', solver = 'newton-cg', C = 0.7)

        model.fit(X_train_second, y_train_second)

        predicted_values_second = pd.DataFrame(index = X_test_second.index.values, columns = df['Name'])

        for _, row in X_test_second.iterrows():
            new_case = pd.DataFrame([np.array(row)], columns = X_test_second.columns.values)
            predicted_values_second.loc[row.name] = model.predict_proba(new_case)

        '''for j, row in X_test_first.iterrows():
            design_step = row[:4].idxmax(axis=1)
            predicted_values_second.loc[j,:][df['Name'][df.loc[:,design_step]!=0].unique()] *= 1.25'''

        results = pd.DataFrame(index = X_test_second.index.values, columns = ['first', 'proba', 'second', 'proba',
                                                                              'third', 'proba'])
        for _, row in predicted_values_second.iterrows():
            best_three = np.array(sorted(zip(row, df['Name']), reverse=True)[:3])[:,1]
            results.loc[row.name] = [best_three[0], predicted_values_second.loc[row.name, best_three[0]],
                                     best_three[1], predicted_values_second.loc[row.name, best_three[1]],
                                     best_three[2], predicted_values_second.loc[row.name, best_three[2]]]

        remapping_names = {index: name['Name'] for index, name in df.iterrows()}
        y_check = y_test_second.map(remapping_names)

        results['check'] = np.nan

        for index in y_check.index.values:
            results.loc[index, 'check'] = y_check.loc[index] in np.array(results.loc[index])

        results['true'] = y_check
        correct_sum += results['check'].sum()/results.shape[0]
        i += 1
    except:
        None
    print('\r {} / {}'.format(i, iterations), end='')
        
correct_sum/iterations

 100 / 100

0.9838518518518515

In [5]:
results.iloc[100:]

Unnamed: 0,first,proba,second,proba.1,third,proba.2,check,true
161,Mind Map,0.54719,5 Whys,0.0954769,Contextual Interview,0.0586036,True,Mind Map
114,Functional Analysis,0.575557,Mind Map,0.107529,Brainwriting,0.0827025,True,Functional Analysis
48,Storyboard,0.791626,How Might We,0.0280784,Blueprint,0.0271086,True,Storyboard
524,Alpha Prototyping,0.840303,Storyboard,0.0472827,Rough Prototyping,0.0410314,True,Alpha Prototyping
107,Affinity Diagram,0.511415,Dot Voting,0.143054,How Might We,0.0621829,True,Affinity Diagram
82,Mind Map,0.585985,5 Whys,0.0828303,Impact Matrix,0.0737962,True,Mind Map
475,Functional Analysis,0.807802,How Might We,0.0470245,Traditional Brainstorming,0.0273487,True,Functional Analysis
149,Mind Map,0.369347,Reverse Brainstorming,0.126656,Rough Prototyping,0.110576,True,Rough Prototyping
503,Alpha Prototyping,0.848729,Rough Prototyping,0.0427808,Storyboard,0.0399857,True,Alpha Prototyping
46,Mind Map,0.594098,5 Whys,0.082346,Impact Matrix,0.0693876,True,Mind Map


In [6]:
#check rows that contain probas below tolerance of 0.01
results[results.iloc[:,5] < 0.01].count()[0]

2

In [7]:
#check Proba Sum Mean
total = []
total.append([row.sum() for _, row in results['proba'].iterrows()])
np.mean(total)

0.7953652684438636

# random

In [8]:
from numpy.random import choice

correct_sum = 0
iterations = 100
weight = np.array(df_final.groupby(['Name'])['Name'].count()/df_final['Name'].count())

for i in range(iterations):

    random_results = pd.DataFrame(index = y_test_second.index.values, columns = ['first', 'second', 'third'])
    for i, row in random_results.iterrows():
        random_results.loc[i] = draw = choice(df['Name'], 3, p = weight)

    random_results['check'] = np.nan

    for index in y_check.index.values:
        random_results.loc[index, 'check'] = y_check.loc[index] in np.array(random_results.loc[index])

    random_results['true'] = y_check
    correct_sum += random_results['check'].sum()/random_results.shape[0]
    
correct_sum/iterations

0.17562962962962964

# decision tree
100% correct, but without finding other 2 adequate CITs. This model would be perfect if I wanted only THE best CIT, but I want the 3 best ones

In [9]:
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier

correct_sum = 0
iterations = 100

for i in range(iterations):
    try:
    #first part
        estimators = {}

        for column in df_middle:
            X_train_first, X_test_first, y_train_first, y_test_first = train_test_split(df_input, df_middle[column],
                                                                                    test_size = 0.25, random_state = int(time.time()))

            estimator = DecisionTreeRegressor()
            estimators.update({column: estimator.fit(X_train_first,y_train_first)})

        predicted_values_first = pd.DataFrame(index = X_test_first.index.values, columns = df_middle.columns.values)

        for _, row in X_test_first.iterrows():
            for feature in df_middle.columns.values:
                new_case = pd.DataFrame([np.array(row)], columns = df_input.columns.values)
                predicted_values_first[feature].loc[row.name] = estimators[feature].predict(new_case)

        #second part
        X_train_second = df_middle_second.iloc[X_train_first.index.values]
        y_train_second = df_output.iloc[y_train_first.index.values]
        X_test_second = pd.concat([predicted_values_first, X_test_first[added_to_second]], axis = 1)
        y_test_second = df_output.iloc[y_test_first.index.values]

        model = DecisionTreeClassifier()

        model.fit(X_train_second, y_train_second)

        predicted_values_second = pd.DataFrame(index = X_test_second.index.values, columns = df['Name'])

        for _, row in X_test_second.iterrows():
            new_case = pd.DataFrame([np.array(row)], columns = X_test_second.columns.values)
            predicted_values_second.loc[row.name] = model.predict_proba(new_case)

        results_DT = pd.DataFrame(index = X_test_second.index.values, columns = ['first', 'proba', 'second', 'proba',
                                                                                             'third', 'proba'])
        for _, row in predicted_values_second.iterrows():
            best_three = np.array(sorted(zip(row, df['Name']), reverse=True)[:3])[:,1]
            results_DT.loc[row.name] = [best_three[0], predicted_values_second.loc[row.name, best_three[0]],
                                     best_three[1], predicted_values_second.loc[row.name, best_three[1]],
                                     best_three[2], predicted_values_second.loc[row.name, best_three[2]]]

        remapping_names = {index: name['Name'] for index, name in df.iterrows()}
        y_check = y_test_second.map(remapping_names)


        results_DT['check'] = np.nan

        for index in y_check.index.values:
            results_DT.loc[index, 'check'] = y_check.loc[index] in np.array(results_DT.loc[index])

        results_DT['true'] = y_check
        correct_sum += results_DT['check'].sum()/results_DT.shape[0]
    except:
        iterations -= 1
        
correct_sum/iterations

0.9220351664796115