In [613]:
import math
import matplotlib.pyplot as plt
import pandas as pd

In [614]:
# Company identifiers grouped by sector. The same strings are used as keys of the
# ground-truth dictionaries and as lookup values for the 'DisplayName' column below.
food_companies = ["veganjunkfoodbar", "pastaebasta_amsterdam", "mamakellyamsterdam", "watsonsfood",
"cannibaleroyale", "parkheuvel", "restaurantfred", "hugh_rotterdam", "oldscuola", "restaurantkite",
"wturbankitchen", "thestreetfoodclub", "rumclubutrecht", "lejardinutrecht", "broei.utrecht"]

furniture_companies = ["madedotcom", "vtwonen", "hemanederland", "loods5", "ikeanederland", "homify", "westwingnl", 
"karwei", "kwantum_nederland", "xenos_nl", "homedeco", "bol_com", "leenbakker", "wonenmetlef", "_connox_",
"interiorjunkiecom", "jysknl", "wehkamp", "fonqnl", "konforhome", "basiclabel.nl", "blokker",
"deensnl", "hastensbeds", "eijerkamp", "goossenswonenenslapen", "furn.nl", "stoermetaal", "roomednl", "misterdesignnl",
"dekbeddiscounter", "woonexpress", "zitmaxx", "pronto_wonen", "designbestseller", "barbecueshop.nl",
"flinders.design", "trendhopper", "debommelmeubelen", "otto_nl", "praxis_bouwmarkt", "gamma_nl",
"pietklerkx.nl", "swisssense", "montelwonen", "aupingnl", "hacowonenenslapen", "emma_matras", "hornbachnl",
"lampenlicht.nl", "profijtmeubel", "bianonl", "woonboulevardpoortvliet", "morreswonen", "hubo_nl", "beter_bed",
"hoogenboezem.meubelen", "villajipp_outlet", "vidaxl_nl", "mline_nl"]

sport_companies = ["plutosport.nl", "voetbalshopnl", "all4runningstore", "voetbaldirect",
"dakasport", "hockeydirect.nl", "tennisdirect", "intersportnl", "aktiesport.nl", "sport2000nederland",
"soccerfanshop", "jdsportsnl", "decathlonnederland", "gorillasportsnl", "perrysport.nl"]

# All 90 names (15 food + 60 furniture + 15 sport). obtain_research_gt uses this list
# as its default and maps matrix rows/columns to names purely by position, so the
# concatenation order (food, furniture, sport) matters.
# NOTE(review): confirm that the pickled similarity matrices use this same order.
company_names = food_companies + furniture_companies + sport_companies

# 6.1 Create/load the ground truths

## 6.1.1 Interview gold standard

In [617]:
# Interview-based ground truth: interviewed company -> {named competitor: integer score}.
# Only the competitor names (the inner keys) are used by the evaluation code in this
# notebook; the scores themselves are never read.
# NOTE(review): the meaning and scale of the scores is not defined here. Most values
# are 1-3, but 'flinders.design' gives 'fonqnl' a 4 and 'karwei' uses 4 and 5 —
# confirm whether these are intended or data-entry slips.
gt_interviews = {
            'aupingnl' : {'ikeanederland' : 3,
                       'swisssense': 3,
                       'hastensbeds' : 2,
                       'beter_bed' : 2,
                       'mline_nl' : 2,
                       'emma_matras' : 1
                        },
            'fonqnl' : {'ikeanederland' : 3,
                       'bol_com' : 3,
                       'wehkamp' : 3,
                       'leenbakker' : 2,
                       'madedotcom' : 2,
                       'kwantum_nederland' : 2,
                       'blokker' : 1,
                       'hemanederland' : 1,
                       'xenos_nl' : 1,
                       'westwingnl' : 1,
                       'flinders.design' : 1
                       },
            'swisssense' : {'aupingnl' : 3,
                       'beter_bed' : 3,
                       'goossenswonenenslapen' : 3,
                       'ikeanederland' : 2,
                       'leenbakker' : 2,
                       'jysknl' : 2,
                       'kwantum_nederland' : 1
                        },
     'flinders.design' : {'misterdesignnl' : 3,
                          'fonqnl' : 4,
                       '_connox_' : 3,
                        'deensnl' : 3,
                       'designbestseller' : 2,
                        'trendhopper' : 2,
                        'interiorjunkiecom' : 2,
                       'ikeanederland' : 2
                        },
        'leenbakker' : {'ikeanederland' : 2,
                       'kwantum_nederland' : 2,
                       'fonqnl' : 1,
                       'jysknl' : 2,
                       'wehkamp' : 1,
                       'beter_bed' : 1,
                       'bol_com' : 1,
                       'dekbeddiscounter' : 1
                       },
            'karwei' : {'gamma_nl' : 5,
                       'praxis_bouwmarkt' : 4,
                       'ikeanederland' : 4,
                       'hornbachnl' : 3,
                       'kwantum_nederland' : 3,
                       'leenbakker' : 3,
                        'jysknl': 3,
                        'hubo_nl' : 2,
                        'loods5': 1
                       },
            'loods5' : {'ikeanederland' : 1,
                       'kwantum_nederland' : 1,
                       'leenbakker' : 1,
                       'bol_com' : 1,
                       'wehkamp' : 1,
                       'fonqnl' : 1,
                       'madedotcom' : 1,
                       },
    'debommelmeubelen' : {'hacowonenenslapen' : 2,
                       'goossenswonenenslapen' : 3,
                       'zitmaxx' : 3,
                        'eijerkamp' : 2,
                        'flinders.design' : 2,
                        'pietklerkx.nl' : 2
                        },
'goossenswonenenslapen' : {'fonqnl' : 3,
                       'pietklerkx.nl' : 2,
                       'eijerkamp' : 2,
                       'vtwonen' : 1,
                       'madedotcom' : 1,
                       'loods5' : 1,
                       'ikeanederland' : 1,
                       'swisssense' : 1,
                       'bol_com' : 1,
                       'konforhome' : 1,
                       'zitmaxx' : 1,
                       'flinders.design' : 1
                        }}

# Drop the scores: keep, per interviewed company, only the list of competitor names.
gt_interviews_list = {
    interviewed: list(competitors)
    for interviewed, competitors in gt_interviews.items()
}

## 6.1.3 Create SBI ground truth.

In [6]:
# Load the semicolon-separated SBI overview and keep only its first 90 rows.
# NOTE(review): 90 equals len(company_names) (15 + 60 + 15); presumably the rows
# beyond that are not companies of this study — confirm against the CSV.
df_sbi = pd.read_csv('../remaining_info/companies_SBI_overview.csv', sep=";")[0:90]

In [7]:
def extract_sbicodes(companyName, df):
    """Return the non-missing SBI codes listed for one company.

    Parameters
    ----------
    companyName : str
        Value to look up in the ``DisplayName`` column of ``df``.
    df : pandas.DataFrame
        Table with a ``DisplayName`` column and the columns ``SBI_1`` .. ``SBI_8``.

    Returns
    -------
    list
        The values of ``SBI_1`` .. ``SBI_8`` of the matching row, in column order,
        with NaN entries removed.

    Raises
    ------
    ValueError
        If ``df`` does not contain exactly one row for ``companyName``.
    """
    sbi_columns = ['SBI_1', 'SBI_2', 'SBI_3', 'SBI_4', 'SBI_5', 'SBI_6', 'SBI_7', 'SBI_8']
    matches = df.loc[df['DisplayName'] == companyName, sbi_columns]

    if len(matches) != 1:
        # Previously this branch only printed a (misleading) message and the function
        # then crashed on an unbound variable. Report the real cause instead; this
        # also covers the "company not found" case.
        raise ValueError(
            'expected exactly one row with DisplayName == {!r}, found {}'.format(
                companyName, len(matches)))

    sbi_values = matches.values.tolist()[0]
    return [value for value in sbi_values if not math.isnan(value)]


# SBI codes per company, keyed by company name.
company_sbi_dict = {
    name: extract_sbicodes(name, df_sbi)
    for name in company_names
}

In [8]:
# SBI-based ground truth: a company's competitors are all other companies that share
# at least one SBI code with it.
#
# The competitor lists follow the key order of company_sbi_dict, so they are identical
# on every run. (Building them through set() made the order depend on string hashing,
# which changes between kernel sessions.)
gt_sbicodes = {}

for main_company in company_names:
    main_codes = set(company_sbi_dict[main_company])
    gt_sbicodes[main_company] = [
        other_company
        for other_company, other_codes in company_sbi_dict.items()
        if other_company != main_company and not main_codes.isdisjoint(other_codes)
    ]

## 6.2 Load results of one specific vector space

In [9]:
def load_distance_matrix(data_input, distance_algorithm, post_level, user_level, IDF_penalty):
    '''Load one pickled similarity/distance matrix from the results folder as a numpy array.

    The file name is built as
    ``../data/results/SS_<data_input>_<distance_algorithm>_P<post_level>_U<user_level>[-TFIDF].pkl``;
    the ``-TFIDF`` suffix is added only when ``IDF_penalty == 'yes'``.

    Returns the values of the pickled DataFrame as a 2-D numpy array (one row per
    company, holding its scores against every company). The row/column labels of the
    DataFrame are discarded, so callers must know the company order themselves.
    NOTE(review): obtain_research_gt assumes that order equals the global
    ``company_names`` list — confirm against how the pickles were written.
    '''
    idf = '-TFIDF' if IDF_penalty == 'yes' else ''
    path = '../data/results/SS_{}_{}_P{}_U{}{}.pkl'.format(
        data_input, distance_algorithm, post_level, user_level, idf)

    # NOTE(review): unpickling can execute arbitrary code; only load trusted result files.
    df_similarity_scores = pd.read_pickle(path)

    # DataFrame.as_matrix() was removed in pandas 1.0; .values works on old and new versions.
    return df_similarity_scores.values

In [10]:
# Build a nested dictionary here, so that the score between two companies can be
# looked up with: values_dict[COMPANY_1][COMPANY_2]
def obtain_research_gt(similarityValues, company_names=company_names):
    """Turn the matrix returned by 'load_distance_matrix' into a dict of dicts.

    Row ``r`` / column ``c`` of ``similarityValues`` is stored under
    ``result[company_names[r]][company_names[c]]``; names are assigned purely by
    position, so ``company_names`` must be in the same order as the matrix."""
    nested_scores = {}

    for row_position, row in enumerate(similarityValues):
        nested_scores[company_names[row_position]] = {
            company_names[column_position]: score
            for column_position, score in enumerate(row)
        }

    return nested_scores

## 6.3 Evaluate results with either SBI or Interview gold standard

### 6.3.1 Calculate precision and recall for all companies combined, for thresholds from 0.01 up to 0.99

In [11]:
def ownmethod_competitors(company, threshold, method_results, similarityMetric):
    """Return the companies that the method marks as competitors of ``company``.

    Parameters
    ----------
    company : str
        Company whose competitors are requested; it is never part of the result.
    threshold : float
        Cut-off applied to the scores in ``method_results[company]``.
    method_results : dict
        Nested dict ``{company: {other_company: score}}`` as built by obtain_research_gt.
    similarityMetric : str
        'cosine' or 'jaccard': a competitor has a score strictly above the threshold.
        'euclidean': a competitor has a score strictly below the threshold.

    Raises
    ------
    ValueError
        For any other ``similarityMetric`` (this used to surface as an
        UnboundLocalError further down).
    """
    if similarityMetric in ('cosine', 'jaccard'):
        higher_means_closer = True
    elif similarityMetric == 'euclidean':
        higher_means_closer = False
    else:
        raise ValueError(
            "unknown similarityMetric {!r}; expected 'cosine', 'jaccard' or 'euclidean'".format(
                similarityMetric))

    competitors = []
    for candidate in method_results:
        if candidate == company:
            continue
        score = method_results[company][candidate]
        selected = score > threshold if higher_means_closer else score < threshold
        if selected:
            competitors.append(candidate)
    return competitors

In [12]:
def calculate_metrics(own_results, ground_truth, distance_metric):
    '''Compare the competitors found by the tool (own_results, see ownmethod_competitors)
    with a ground truth and return precision, recall and the TP/FP/FN counts.

    distance_metric is accepted but not used.
    NOTE: a later cell defines calculate_metrics(dict_values) under the same name, which
    replaces this function once that cell has run.'''
    # Nothing was identified as a competitor: fixed result, counts all zero.
    if len(own_results) == 0:
        return {'precision' : 1.0, 'recall' : 0.0, 'TP' : 0, 'FP': 0, 'FN' : 0}

    TP = sum(1 for found in own_results if found in ground_truth)
    FP = sum(1 for found in own_results if found not in ground_truth)
    FN = sum(1 for expected in ground_truth if expected not in own_results)

    precision = TP / (TP + FP)
    # Without any missed competitor the recall is reported as 1.
    recall = 1 if FN == 0 else TP / (TP + FN)

    return {'precision' : precision, 'recall' : recall, 'TP' : TP, 'FP' : FP, 'FN' : FN}

In [13]:
def calc_f1_score(recall,precision):
    """Return the F1 score, i.e. the harmonic mean of precision and recall.

    Returns 0 when either recall or precision is 0."""
    if recall == 0 or precision == 0:
        return 0
    inverse_sum = (1/recall) + (1/precision)
    return 2 / inverse_sum

In [249]:
def confusion_matrix(own_results, ground_truth, all_companies):
    '''Count TP, FP, FN and TN over all_companies.

    Each company in all_companies is classified by whether it occurs in ground_truth
    (actual competitor) and whether it occurs in own_results (predicted competitor, see
    ownmethod_competitors). Returns the four counts as a dict.'''
    counts = {'TP' : 0, 'FP' : 0, 'FN' : 0, 'TN' : 0}

    for candidate in all_companies:
        is_actual = candidate in ground_truth
        is_predicted = candidate in own_results

        if is_actual and is_predicted:
            counts['TP'] += 1
        elif is_actual:
            counts['FN'] += 1
        elif is_predicted:
            counts['FP'] += 1
        else:
            counts['TN'] += 1

    return counts

In [16]:
def calculate_metrics(dict_values):
    '''Return precision, recall and f1 score for a dict with 'TP', 'FP' and 'FN' counts.

    Conventions for empty denominators:
    * no predictions at all (TP + FP == 0) gives precision 1.0
    * no actual positives (TP + FN == 0) gives recall 0.0
    The f1 score comes from calc_f1_score, which returns 0 when either input is 0.

    NOTE: this definition replaces the earlier three-argument calculate_metrics of the
    same name.'''
    true_positives = dict_values['TP']
    predicted_positives = true_positives + dict_values['FP']
    actual_positives = true_positives + dict_values['FN']

    # Explicit guards replace the former try/except ZeroDivisionError blocks, one of
    # which contained an else branch that could never run.
    if predicted_positives == 0:
        precision = 1.0
    else:
        precision = true_positives / predicted_positives

    if actual_positives == 0:
        recall = 0.0
    else:
        recall = true_positives / actual_positives

    f1_score = calc_f1_score(recall, precision)

    return {'precision' : precision, 'recall' : recall, 'f1_score' : f1_score}
    

In [580]:
def overall_metrics_all_companies(company_names, threshold, gt_research, ground_truth, similarityMetric, debugging):
    """Evaluate one threshold over a set of companies.

    For every company in ``company_names`` the competitors found at ``threshold``
    (ownmethod_competitors) are compared with ``ground_truth[company]`` over all
    companies present in ``gt_research``. The TP/FP/FN/TN counts of all companies are
    summed first, and precision, recall and f1 score are computed once from those
    totals (calculate_metrics) — they are not averages of per-company scores.

    Pass anything other than ``None`` as ``debugging`` to print the intermediate
    results for every company."""
    confusion_values_overall = {'TP':0, 'FP':0, 'FN':0, 'TN':0}
    # The candidate universe is the same for every company, so build it once.
    all_companies = list(gt_research)

    for company in company_names:
        # Competitors of this company according to the method at the given threshold
        own_results = ownmethod_competitors(company, threshold, gt_research, similarityMetric)
        groundtruth = list(ground_truth[company])

        if debugging is not None:
            print(threshold)
            print(own_results)
            print('')
            print(groundtruth)
            print('')

        # Confusion counts of this company, added to the running totals
        confusion_values = confusion_matrix(own_results, groundtruth, all_companies)
        for count_name in ('TP', 'FP', 'FN', 'TN'):
            confusion_values_overall[count_name] += confusion_values[count_name]

        if debugging is not None:
            print(confusion_values)
            print('')
            print('')
            print('')

    return calculate_metrics(confusion_values_overall)

        

def performance_thresholds_all_companies(company_names, gt_research, ground_truth, similarityMetric, debugging, thresholds=None):
    """Run overall_metrics_all_companies for a series of thresholds.

    Parameters
    ----------
    company_names, gt_research, ground_truth, similarityMetric, debugging
        Passed through unchanged to overall_metrics_all_companies.
    thresholds : iterable of float, optional
        Thresholds to evaluate. Defaults to 0.01, 0.02, ..., 0.99 (steps of 0.01;
        1.0 itself is not included).

    Returns
    -------
    pandas.DataFrame
        One row per threshold with the columns 'threshold', 'recall', 'precision'
        and 'f1_score'.
    """
    if thresholds is None:
        thresholds = [step / 100 for step in range(1, 100)]

    overall_statistics = []
    for threshold in thresholds:
        metrics = overall_metrics_all_companies(company_names, threshold, gt_research, ground_truth, similarityMetric, debugging)
        overall_statistics.append([threshold,
                                  metrics['recall'],
                                  metrics['precision'],
                                  metrics['f1_score']])

    return pd.DataFrame(overall_statistics, columns=['threshold', 'recall', 'precision', 'f1_score'])

In [18]:
def choose_best_threshold(model_stats):
    """Return the row of ``model_stats`` with the highest f1 score as a dict.

    ``model_stats`` is the DataFrame produced by performance_thresholds_all_companies;
    the result therefore holds the 'threshold', 'recall', 'precision' and 'f1_score'
    of the best-scoring threshold. On ties the first such row wins."""
    # Series.argmax() returned an index *label* in old pandas and a *position* in
    # recent versions. After resetting the index, label and position coincide, so
    # idxmax() gives a position that is valid for .iloc on every pandas version.
    position_best_f1score = model_stats['f1_score'].reset_index(drop=True).idxmax()
    return dict(model_stats.iloc[position_best_f1score])

# Using all code above to find best threshold & performance of given similarity matrix


In [None]:
# pandas keeps emitting FutureWarnings about a feature that is going to be replaced;
# this silences them.
# NOTE(review): the filter is global — it hides every FutureWarning from every
# library for the rest of the session, not only the pandas one.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [448]:
# Combining all code above: pick one vector space, evaluate it for every threshold
# and show the threshold with the best f1 score.

data_input = 'hashtag'
distance_algorithm = 'cosine'
post_level = 'relative'
user_level = 'absolute'
IDF_penalty = 'yes'
if IDF_penalty == 'yes': idf = '-TFIDF'
else: idf = ''

#### CHANGE THE GT_INTERVIEWS TO gt_sbicodes TO EVALUATE DISTANCE MATRIX WITH SBI GOLD STANDARD.
gt_research = obtain_research_gt(load_distance_matrix(data_input, distance_algorithm, post_level, user_level, IDF_penalty))
all_comps = performance_thresholds_all_companies(company_names, gt_research, gt_interviews, distance_algorithm, None)
# Keep this as the last expression so the notebook displays the result. The stray
# name `gt` that followed it was never defined and raised a NameError.
choose_best_threshold(all_comps)

{'threshold': 0.043,
 'recall': 0.4189189189189189,
 'precision': 0.2767857142857143,
 'f1_score': 0.3333333333333333}

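### Functions to apply cross-validation. The threshold is tuned on a training split (a random 70% of the companies for the SBI ground truth, leave-one-out for the interview ground truth) and the evaluation scores are obtained by applying that threshold to the held-out test companies.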

In [863]:
import random

def crossValidationSBI(data_input, distance_algorithm, post_level, user_level, idf, groundtruth = gt_sbicodes):
    """Repeated random-split validation of one vector space against a ground truth.

    100 times: sample 70% of the companies in ``groundtruth`` as training set, pick
    the threshold with the best f1 score on it, and score that threshold on the
    remaining companies.

    Parameters
    ----------
    data_input, distance_algorithm, post_level, user_level
        Select the similarity matrix, see load_distance_matrix.
    idf : str
        'yes' to load the TF-IDF variant of the matrix; forwarded to
        load_distance_matrix as its ``IDF_penalty`` argument. (This argument used to
        be ignored in favour of a global ``IDF_penalty`` variable.)
    groundtruth : dict
        ``{company: competitors}``; defaults to the SBI ground truth.

    Returns
    -------
    pandas.DataFrame
        One row per split with 'precision', 'recall', 'f1_score' on the test set and
        the selected 'threshold'.

    NOTE(review): ``random`` is not seeded here, so the splits — and the returned
    numbers — differ from run to run.
    """
    gt_research = obtain_research_gt(load_distance_matrix(data_input, distance_algorithm, post_level, user_level, idf))

    # Both are the same for every split, so compute them once.
    company_names = list(groundtruth)
    train_portion = 0.7
    train_size = int(len(company_names) * train_portion)

    cross_validation_performances = {}
    for i in range(100):
        trainset = random.sample(company_names, train_size)
        train_lookup = set(trainset)
        testset = [value for value in company_names if value not in train_lookup]

        # Fit: the threshold that scores best on the training companies
        performance_trainset = performance_thresholds_all_companies(trainset, gt_research, groundtruth, distance_algorithm, None)
        selected_threshold = choose_best_threshold(performance_trainset)['threshold']

        # Evaluate that threshold on the held-out companies
        final_performance = overall_metrics_all_companies(testset, selected_threshold, gt_research, groundtruth, distance_algorithm, None)
        final_performance['threshold'] = selected_threshold
        cross_validation_performances[i] = final_performance

    return pd.DataFrame(cross_validation_performances).transpose()



In [377]:
def crossValidationInterview(data_input, distance_algorithm, post_level, user_level, IDF_penalty, groundtruth = gt_interviews):
    """Leave-one-out validation of one vector space against a ground truth.

    Each company in ``groundtruth`` is held out once: the threshold with the best f1
    score on all other companies is selected and then scored on the held-out company.
    Returns the ``describe()`` summary of the per-fold precision, recall, f1 score
    and selected threshold."""
    # Scores produced by this study for the selected vector space
    gt_research = obtain_research_gt(load_distance_matrix(data_input, distance_algorithm, post_level, user_level, IDF_penalty))

    company_names = [value for value in groundtruth]
    fold_results = {}

    for fold, held_out in enumerate(company_names):
        trainset = [comp for comp in company_names if comp != held_out]
        testset = [comp for comp in company_names if comp == held_out]

        train_stats = performance_thresholds_all_companies(trainset, gt_research, groundtruth, distance_algorithm, None)
        selected_threshold = choose_best_threshold(train_stats)['threshold']

        fold_metrics = overall_metrics_all_companies(testset, selected_threshold, gt_research, groundtruth, distance_algorithm, None)
        fold_metrics['threshold'] = selected_threshold
        fold_results[fold] = fold_metrics

    return pd.DataFrame(fold_results).transpose().describe()


In [855]:
crossValidationSBI('hashtag', 'euclidean', 'relative', 'absolute', 'yes')

Unnamed: 0,f1_score,precision,recall,threshold
count,100.0,100.0,100.0,100.0
mean,0.263541,0.168675,0.741892,0.0598
std,0.028016,0.022556,0.284489,0.038586
min,0.196787,0.126404,0.271868,0.02
25%,0.243631,0.154042,0.409685,0.02
50%,0.264016,0.165861,0.819251,0.03
75%,0.284337,0.179917,1.0,0.1
max,0.330318,0.241433,1.0,0.1


### Create baseline models (SBI)

In [604]:
# * Ran evaluation of randomized model 100 times. Each run was done with 10-fold cross validation.times. 

%%time
baseline_model = pd.DataFrame()

for i in range(100):
    run_df = crossValidationSBI()
    baseline_model = pd.concat([baseline_model, run_df], ignore_index=True, sort=True)
    

SBI_baselineModel

Unnamed: 0,f1_score,precision,recall,threshold
count,1000.0,1000.0,1000.0,1000.0
mean,0.273367,0.159553,0.968327,0.0313
std,0.029505,0.020036,0.037792,0.035485
min,0.176991,0.097087,0.648276,0.01
25%,0.255519,0.147002,0.960277,0.01
50%,0.273333,0.15892,0.981503,0.02
75%,0.29232,0.172123,0.990178,0.04
max,0.365288,0.225124,1.0,0.32


### 6.3.2 Visualize how the recall and precision varies, show best threshold.

In [None]:
def plot_df_metrics(dataframe_metrics):
    """Plot precision and recall against the threshold.

    ``dataframe_metrics`` needs the columns 'threshold', 'precision' and 'recall'
    (as returned by performance_thresholds_all_companies). Draws both curves on the
    current matplotlib figure, shows it and returns None."""
    columns = dataframe_metrics.to_dict()
    thresholds = list(columns['threshold'].values())

    # One curve per metric, sharing the threshold axis
    for metric_name in ("precision", "recall"):
        metric_values = list(columns[metric_name].values())
        plt.plot(thresholds, metric_values, label = metric_name)

    # Axis labels, title and legend
    plt.xlabel('threshold')
    plt.ylabel('precision/recall')
    plt.title('Different thresholds when companies identified as competitors')
    plt.legend()

    plt.show()

In [None]:
plot_df_metrics(all_comps)

In [None]:
choose_best_threshold(all_comps)

## 8.4 Compare gt_interview with gt_SBI codes

In [619]:
def compare_ground_truths(gt_interviews, gt_sbicodes):
    """Score the SBI ground truth against the interview ground truth.

    For every company in ``gt_interviews`` the competitors in ``gt_sbicodes[company]``
    are treated as predictions and the competitors in ``gt_interviews[company]`` as
    the truth. The TP/FP/FN counts are summed over all companies before precision,
    recall and f1 score are computed.

    Empty denominators follow the same conventions as calculate_metrics instead of
    raising ZeroDivisionError: precision is 1.0 when the SBI ground truth names no
    competitor at all, recall is 0.0 when the interviews name none.

    Returns a dict with 'precision', 'recall', 'f1_score', 'TP', 'FP' and 'FN'.
    Raises KeyError if a company from ``gt_interviews`` is missing in ``gt_sbicodes``.
    """
    TP = 0
    FP = 0
    FN = 0

    for company in gt_interviews:
        ground_truth = gt_interviews[company]
        own_results = gt_sbicodes[company]

        for comp in own_results:
            if comp in ground_truth:
                TP += 1
            else:
                FP += 1
        FN += sum(1 for comp in ground_truth if comp not in own_results)

    precision = TP / (TP + FP) if (TP + FP) != 0 else 1.0
    recall = TP / (TP + FN) if (TP + FN) != 0 else 0.0
    f1_score = calc_f1_score(recall,precision)

    return {'precision' : precision, 'recall' : recall, 'f1_score' : f1_score, 'TP' : TP, 'FP' : FP, 'FN' : FN}

compare_ground_truths(gt_interviews_list, gt_sbicodes)

{'precision': 0.17763157894736842,
 'recall': 0.36486486486486486,
 'f1_score': 0.23893805309734514,
 'TP': 27,
 'FP': 125,
 'FN': 47}

## 6.5 Calculate Sector distinguishing metric

In [741]:
def industry_difference(companyName, selection, gt_research):
    """Return the mean of gt_research[companyName][other] over all companies in selection.

    Raises ZeroDivisionError when selection is empty."""
    scores = [gt_research[companyName][other] for other in selection]
    return sum(scores) / len(scores)

In [823]:
import statistics 
import numpy as np

def compute_sectorDis(data_input, distance_algorithm, post_level, user_level, IDF_penalty, company_names=company_names):
    """Compute the sector-distinguishing metric of one vector space.

    Step 1 averages, for every pair of sectors, the scores between the companies of
    the first sector and the companies of the second (via industry_difference).
    Step 2 expresses, per sector, the within-sector average as a percentage difference
    to each of its cross-sector averages. Step 3 returns the mean of those six
    percentages. The sector-by-sector averages are printed as a dict on the way.

    company_names is accepted but not used.
    NOTE(review): the within-sector averages include every company's score with
    itself — confirm this is intended."""
    gt_research = obtain_research_gt(load_distance_matrix(data_input, distance_algorithm, post_level, user_level, IDF_penalty))

    sectors = [('furniture', furniture_companies),
               ('sport', sport_companies),
               ('food', food_companies)]

    #### STEP 1 - CREATING THE DISTANCE MATRIX ####
    company_industry_matrix = {}
    for sector_name, sector_members in sectors:
        sector_row = {}
        for target_name, target_members in sectors:
            # Average score of each company of this sector towards the target sector
            per_company = [industry_difference(member, target_members, gt_research)
                           for member in sector_members]
            sector_row[target_name] = sum(per_company) / len(per_company)
        company_industry_matrix[sector_name] = sector_row

    # Fixed column order so it reads easier as a matrix.
    average_matrix_industries = pd.DataFrame(company_industry_matrix)[['food', 'furniture', 'sport']]
    dataframe_dict = average_matrix_industries.to_dict()
    print(dataframe_dict)

    #### STEP 2 -- COMPUTING THE SECTOR DIFFERENCE VALUES
    percentage_differences = []
    for sector_name in dataframe_dict:
        sector_column = dataframe_dict[sector_name]
        within_value = sector_column[sector_name]

        for other_name in sector_column:
            if other_name == sector_name:
                continue
            cross_value = sector_column[other_name]
            percentage_differences.append( (within_value - cross_value) / cross_value * 100 )

    #### STEP 3 -- COMPUTING THE AVERAGE
    return sum(percentage_differences) / len(percentage_differences)

In [854]:
compute_sectorDis('text', 'cosine', 'absolute', 'absolute', 'no')

{'food': {'food': 0.8607984195265673, 'furniture': 0.7594708843163696, 'sport': 0.7251137598281161}, 'furniture': {'food': 0.7594708843163696, 'furniture': 0.7772341752145724, 'sport': 0.7118629791726878}, 'sport': {'food': 0.7251137598281162, 'furniture': 0.7118629791726877, 'sport': 0.735548833728394}}


8.05707782148867

In [726]:
import statistics 
import numpy as np
def industry_difference(companyName, selection, gt_research):
    """Average score between companyName and every company in selection.

    Looks up gt_research[companyName][company] for each company in selection and
    returns the arithmetic mean; an empty selection raises ZeroDivisionError.
    NOTE: this repeats the industry_difference definition from an earlier cell."""
    total = 0
    count = 0
    for company in selection:
        total += gt_research[companyName][company]
        count += 1
    return total / count

In [825]:
# Stand-alone version of step 1 of compute_sectorDis: build the sector-by-sector
# matrix of average scores for one vector space and display it.
# NOTE: this cell overwrites the module-level settings (data_input, ..., IDF_penalty)
# and gt_research that earlier cells also use.
data_input = 'text'
distance_algorithm = 'cosine'
post_level = 'relative'
user_level = 'relative'
IDF_penalty = 'yes'

gt_research = obtain_research_gt(load_distance_matrix(data_input, distance_algorithm, post_level, user_level, IDF_penalty))

company_industry_matrix = {}
# companyTypes and TypeNames are parallel lists: same position, same sector.
companyTypes = [furniture_companies, sport_companies, food_companies]
TypeNames = ['furniture', 'sport', 'food']

for i in range(3): 
    companyType = companyTypes[i]
    #Looping through all the companies in one of the three categories
    # (category_averages is assigned but never read)
    category_averages = {}
    
    # Per company of the current sector: its average score towards each sector.
    furniture_values = [industry_difference(company, furniture_companies, gt_research) for company in companyType]
    sport_values = [industry_difference(company, sport_companies, gt_research) for company in companyType]
    food_values = [industry_difference(company, food_companies, gt_research) for company in companyType]
    
    
    # Average those per-company values over the current sector.
    furniture_average = sum(furniture_values) / len(furniture_values)
    sport_average = sum(sport_values) / len(sport_values)
    food_average = sum(food_values) / len(food_values)
    
    company_industry_matrix[TypeNames[i]] = {'furniture' : furniture_average,
                                            'sport' : sport_average,
                                            'food' : food_average}

average_matrix_industries = pd.DataFrame(company_industry_matrix)
# reversing order of columns so it reads easier as a matrix.
average_matrix_industries = average_matrix_industries[['food', 'furniture', 'sport']]
average_matrix_industries

Unnamed: 0,food,furniture,sport
food,0.083465,0.003718,0.002414
furniture,0.003718,0.025541,0.003103
sport,0.002414,0.003103,0.081391


In [729]:
# Percentage difference between each sector's within-sector average and its
# cross-sector averages, based on the matrix built in the previous cell.
#
# dataframe_dict has to be created here: the assignment used to be commented out, and
# the variable of the same name inside compute_sectorDis is local to that function,
# so on a fresh kernel this cell failed with a NameError.
dataframe_dict = average_matrix_industries.to_dict()
average_values = []

for industry in dataframe_dict:
    # Average score of the sector with itself
    inter_value = dataframe_dict[industry][industry]

    for other_industry in dataframe_dict[industry]:
        if other_industry != industry:
            other_value = dataframe_dict[industry][other_industry]
            average_values.append( (inter_value - other_value) / other_value * 100 )