In [82]:
import pickle
import json
import fasttext
import fasttext.util
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt


from sklearn.ensemble import GradientBoostingClassifier
from Scripts.util import stringPreprocessing
from Scripts.graphgeneration import gbm_classifier

# Testing H1 on keywords
This notebook tests our hypothesis: are the keywords associated with a concept also prerequisites of the keywords associated with its sub-concepts?

## Load Data

In [3]:
# Read the labeled keyword-pair dataset; later cells access its kw1, kw2 and label columns.
labeled_concept_pair_df = pd.read_csv("Data/labeled_concept_pair_dataset.csv")
# Show the last rows as a quick sanity check of the load.
labeled_concept_pair_df.tail()

Unnamed: 0,kw1,kw2,label
25934,software testing debugging,combinational synthesis,0
25935,combinational synthesis,power management,0
25936,software fault tolerance,pcb design layout,0
25937,pcb design layout,formal method,0
25938,documentation,pcb design layout,0


In [44]:
# Keyword table per CCS concept: later cells index it as count_ccs_kw[ccs_path]
# and use the keys of the inner mapping as that concept's keywords.
# The encoding is passed explicitly so the read does not depend on the
# platform's default text encoding.
with open('Data/count_ccs_kw.json', 'r', encoding='utf-8') as fp:
    count_ccs_kw = json.load(fp)

## Load Model

In [95]:
# Load the pickled classifier. The context manager closes the file handle,
# which the bare open() call previously left for the garbage collector.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load model files that come from a trusted source.
with open("Models/naive_gridsearch_01_classifier.pkl", "rb") as model_file:
    gbm_model1 = pickle.load(model_file)
gbm_model1

GradientBoostingClassifier(max_depth=8, max_features='sqrt',
                           min_samples_leaf=20, min_samples_split=20,
                           n_estimators=900, random_state=10, subsample=0.8)

In [66]:
# Load the fastText model from disk; its get_sentence_vector method is later
# handed to gbm_classifier to embed the keywords being compared.
ft_model = fasttext.load_model('Models/cc.en.300.bin')



## Percentage of subwords in keyword pairs

### In total

In [7]:
def subword_is_in_both_kw(kw1, kw2):
    """Return 1 if any whitespace-separated token of ``kw1`` occurs in ``kw2``, else 0.

    The check is plain substring containment on the ``kw2`` string, so a token
    also matches inside a longer word (``"art"`` is found in ``"smart card"``).
    """
    tokens = kw1.split()
    return 1 if any(token in kw2 for token in tokens) else 0

In [23]:
# Flag (1/0) every labeled pair in which some token of kw1 appears in kw2.
sub_words = labeled_concept_pair_df.apply(
    lambda row: subword_is_in_both_kw(row.kw1, row.kw2),
    axis=1,
)

# Share of flagged pairs among all rows, as a percentage.
percentage = sub_words.sum() / len(sub_words) * 100
print("Percentage of keyword pairs with subwords in common: {0:.2f}%".format(percentage))

Percentage of keyword pairs with subwords in common: 7.89%


### Just the related words

In [25]:
# Same subword flag as above, restricted to the pairs labeled 1.
sub_words = labeled_concept_pair_df.loc[labeled_concept_pair_df.label == 1].apply(
    lambda row: subword_is_in_both_kw(row.kw1, row.kw2),
    axis=1,
)

# Share of flagged pairs among the label-1 rows, as a percentage.
percentage = sub_words.sum() / len(sub_words) * 100
print("Percentage of related keyword pairs with subwords in common: {0:.2f}%".format(percentage))

Percentage of related keyword pairs with subwords in common: 27.01%


### Just unrelated words

In [86]:
# Same subword flag, restricted to the pairs labeled 0 (the unrelated pairs
# of this section).
sub_words = labeled_concept_pair_df[labeled_concept_pair_df.label == 0].apply(
    lambda x: subword_is_in_both_kw(x.kw1, x.kw2),
    axis=1,
)

# Share of flagged pairs among the label-0 rows, as a percentage.
percentage = sub_words.sum()/sub_words.shape[0]*100
# The message previously said "related", copied from the label == 1 cell.
print("Percentage of unrelated keyword pairs with subwords in common: {0:.2f}%".format( percentage))

Percentage of related keyword pairs with subwords in common: 1.76%


## Testing the Hypothesis

### Converting CCS concept strings to simple concepts

In [47]:
# All CCS concept strings, i.e. the top-level keys of the keyword table.
CCS_list = [*count_ccs_kw]

In [60]:
# Map the preprocessed last path segment (the text after the final "->") of
# each CCS string to the full CCS string. Entries without "->" are skipped;
# if two CCS strings yield the same key, the later one wins.
Concept_to_CCS = {
    stringPreprocessing(ccs_path.rsplit("->", 1)[-1]): ccs_path
    for ccs_path in CCS_list
    if "->" in ccs_path
}

### Observing number of keywords that are shared by successors

In [77]:
def get_rel_from_kw_from_concepts_inher(c1, c2):
    """Return 1 if concepts ``c1`` and ``c2`` share at least one keyword, else 0.

    Both concepts are resolved to their CCS string through ``Concept_to_CCS``
    and their keywords are the keys of the matching ``count_ccs_kw`` entry.
    If either concept is not a key of ``Concept_to_CCS`` the result is 0.
    """
    if c1 not in Concept_to_CCS or c2 not in Concept_to_CCS:
        return 0

    first_kws = count_ccs_kw[Concept_to_CCS[c1]].keys()
    second_kws = count_ccs_kw[Concept_to_CCS[c2]].keys()
    return int(any(kw in second_kws for kw in first_kws))

In [78]:
# Keep only the pairs labeled 1 and spot-check the shared-keyword test on the
# second of them.
labeled1_concept_pair_df = labeled_concept_pair_df[labeled_concept_pair_df.label == 1]
# kw1/kw2 are read by column name (as the apply() cells do) instead of by
# position, so the check does not depend on the column order of the CSV.
get_rel_from_kw_from_concepts_inher(
    labeled1_concept_pair_df.kw1.iloc[1],
    labeled1_concept_pair_df.kw2.iloc[1],
)

1

In [79]:
# Flag (1/0) every labeled pair whose two concepts share at least one keyword.
inclusion = labeled_concept_pair_df.apply(
    lambda row: get_rel_from_kw_from_concepts_inher(row.kw1, row.kw2),
    axis=1,
)

# Share of flagged pairs among all rows, as a percentage.
percentage = inclusion.sum() / len(inclusion) * 100
print("Percentage of keyword included in their successors: {0:.2f}%".format(percentage))

Percentage of keyword included in their successors: 18.85%


### Keywords that are not inherited by successors

In [87]:
def get_rel_from_kw_from_concepts_notinher(c1, c2):
    """Return 1 if ``c1`` has at least one keyword that ``c2`` lacks, else 0.

    Both concepts are resolved to their CCS string through ``Concept_to_CCS``
    and their keywords are the keys of the matching ``count_ccs_kw`` entry.
    If either concept is not a key of ``Concept_to_CCS`` the result is 0.
    """
    if c1 not in Concept_to_CCS or c2 not in Concept_to_CCS:
        return 0

    first_kws = count_ccs_kw[Concept_to_CCS[c1]].keys()
    second_kws = count_ccs_kw[Concept_to_CCS[c2]].keys()
    return int(any(kw not in second_kws for kw in first_kws))

In [88]:
# Keep only the pairs labeled 1 and spot-check the not-shared-keyword test on
# the second of them.
labeled1_concept_pair_df = labeled_concept_pair_df[labeled_concept_pair_df.label == 1]
# kw1/kw2 are read by column name (as the apply() cells do) instead of by
# position, so the check does not depend on the column order of the CSV.
get_rel_from_kw_from_concepts_notinher(
    labeled1_concept_pair_df.kw1.iloc[1],
    labeled1_concept_pair_df.kw2.iloc[1],
)

1

In [89]:
# Flag (1/0) every labeled pair where the first concept has a keyword that the
# second concept does not have.
inclusion = labeled_concept_pair_df.apply(
    lambda row: get_rel_from_kw_from_concepts_notinher(row.kw1, row.kw2),
    axis=1,
)

# Share of flagged pairs among all rows, as a percentage.
percentage = inclusion.sum() / len(inclusion) * 100
print("Percentage of keyword not included in their successors: {0:.2f}%".format(percentage))

Percentage of keyword not included in their successors: 22.86%


### The classifier test

In [98]:
def get_rel_from_kw_from_concepts_prereq(c1, c2):
    """Return the first truthy classifier result over keyword pairs of ``c1`` and ``c2``.

    Both concepts are resolved to their CCS string through ``Concept_to_CCS``
    and their keywords are the keys of the matching ``count_ccs_kw`` entry.
    Every (keyword of c1, keyword of c2) combination is passed to
    ``gbm_classifier`` together with the fastText sentence-vector function and
    the GBM ``predict`` method; the scan stops at the first truthy result and
    returns it. Returns 0 if no combination gives a truthy result or if either
    concept is not a key of ``Concept_to_CCS``.
    """
    if c1 not in Concept_to_CCS or c2 not in Concept_to_CCS:
        return 0

    first_kws = count_ccs_kw[Concept_to_CCS[c1]].keys()
    second_kws = count_ccs_kw[Concept_to_CCS[c2]].keys()

    # Iteration order (c1 keywords outer, c2 keywords inner) decides which
    # truthy result is returned first.
    for first_kw in first_kws:
        for second_kw in second_kws:
            verdict = gbm_classifier(
                ft_model.get_sentence_vector,
                gbm_model1.predict,
                (first_kw, second_kw),
            )
            if verdict:
                return verdict

    return 0

In [99]:
# Spot-check the classifier-based test on the second label-1 pair.
# kw1/kw2 are read by column name (as the apply() cells do) instead of by
# position, so the check does not depend on the column order of the CSV.
get_rel_from_kw_from_concepts_prereq(
    labeled1_concept_pair_df.kw1.iloc[1],
    labeled1_concept_pair_df.kw2.iloc[1],
)

1

In [100]:
# Run the classifier-based test on every labeled pair. Each row may trigger
# one classifier call per keyword combination of the two concepts.
prereq = labeled_concept_pair_df.apply(
    lambda row: get_rel_from_kw_from_concepts_prereq(row.kw1, row.kw2),
    axis=1,
)

# Sum of the per-row results over the number of rows, as a percentage.
percentage = prereq.sum() / len(prereq) * 100
print("Percentage of keyword that are prerequesite for their successors: {0:.2f}%".format(percentage))

Percentage of keyword that are prerequesite for their successors: 20.49%
