In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt 
plt.rc("font", size=14)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import seaborn as sns
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)

In [8]:
from spellchecker import SpellChecker
from nltk import word_tokenize

def is_abbrev_single(a):
    
    a = a.strip()
    
    if a.isupper() and len(word_tokenize(a)) == 0:
        return True
    
    return False

def is_abbrev(a, b):
        
    a_abbrev = ""
    for word in word_tokenize(a.replace("-", " ")):
        a_abbrev += word[0]
        
    if(a_abbrev.strip().lower() == b.strip().lower()):
        return True
    
    b_abbrev = ""
    for word in word_tokenize(b.replace("-", " ")):
        b_abbrev += word[0]
            
    if(b_abbrev.strip().lower() == a.strip().lower()):
        return True
    
    return False

#Returns True if both concepts, a and b refer to the same thing, False otherwise
def entity_resolution(a, b):

    a = a.strip()
    b = b.strip()
    
    #Lowercase
    if a.lower() == b.lower():
        print("Reason: Lowercase")
        return True
    
    a = a.lower()
    b = b.lower()
    
    if is_abbrev(a, b):
        print("Reason: abbrev")
        return True

    #Jumbled
    words_a = [i for i in word_tokenize(a.replace("-", " "))]
    words_b = [i for i in word_tokenize(b.replace("-", " "))]
    if set(words_a) == set(words_b):
        print("Reason: Jumbled/Hyphen")
        return True

    #Spelling mistakes
    spell = SpellChecker()
    misspelled_a = spell.unknown(words_a)
    misspelled_b = spell.unknown(words_b)
    for misspelled_word in misspelled_a:
        words_a[words_a.index(misspelled_word)] = spell.correction(misspelled_word)
        a = a.replace(misspelled_word, spell.correction(misspelled_word))
    for misspelled_word in misspelled_b:
        words_b[words_b.index(misspelled_word)] = spell.correction(misspelled_word)
        b = b.replace(misspelled_word, spell.correction(misspelled_word))

    if set(words_a) == set(words_b):
        print("Reason: Spelling Mistake/Jumbled")
        return True

    #Substring
    if a in b:
        b_rest = b.replace(a, "").replace("(", "").replace(")", "")
        if is_abbrev(a, b_rest):
            print("Reason: abbrev1")
            return True

    if b in a:
        a_rest = a.replace(b, "").replace("(", "").replace(")", "")
        if is_abbrev(b, a_rest):
            print("Reason: abbrev2")
            return True

    return False

In [11]:
data = pd.read_csv('../results/Networking_concepts_features.csv', engine='python')
data.head()

Unnamed: 0,ConceptA,ConceptB,ChapterDist,PageDist,LineDist,WordDist,Total,LineCount,WordCount,Complexity
0,abramson,norman,1873,5743,0,7,16,2,1,15
1,abramson,access control lists,-1791,-5009,0,0,16,0,0,15
2,abramson,access delay,3875,12861,0,0,48,0,0,13
3,abramson,access isp,44652,136996,0,0,448,0,0,-12
4,abramson,access networks,8674,15614,0,31,1056,7,1,-50


In [24]:
gt = pd.read_csv('../AL-CPL/features/proc_network_relation_v2.csv')
gt.head()

Unnamed: 0,ConceptA,ConceptB,Prereq
0,Computer network,End system,1
1,End system,Server (computing),0
2,End system,Router (computing),0
3,End system,Virtual circuit,0
4,Computer network,Network security,1


In [32]:
pred_concepts = list(data['ConceptA'])
pred_concepts += list(data['ConceptB'])
pred_concepts = list(set(pred_concepts))
pred_concepts = [pred_concept.lower() for pred_concept in pred_concepts]
pred_total = len(pred_concepts)
print(pred_concepts)

['best-effort networks', 'costs', 'idea', 'registrars', 'route summarization', 'selective acknowledgment', 'development', 'standards', 'block', 'source-specific congestion-control actions', 'ip addresses', 'software control plane', 'ipv4 header', '802.11i', 'time slots', 'tcp ports', 'traffic engineering', 'transport-layer protocols', 'security management', 'continued evolution of', '2g', 'instantaneous', 'known-plaintext attack', 'initial state', 'bottleneck link', 'distance vector', 'hosts', 'imap server', 'foreign address', 'packet-forwarding decisions', 'service classes', 'simon s.', 'overlay network', 'bandwidth sensitive', 'load balancing', 'query arp message', 'network protocols', 'congestion window', 'destination', 'network management data', 'congestion avoidance', 'hidden terminal problem', 'switch poisoning', 'traffic policing', 'overlapping fragments', 'client side of http', 'links', 'application-layer', 'source address', 'client-server architecture', 'connection replay atta

In [34]:
gt_concepts = list(gt['ConceptA'])
gt_concepts += list(gt['ConceptB'])
gt_concepts = list(set(gt_concepts))
gt_concepts = [gt_concept.lower() for gt_concept in gt_concepts]
actual_total = len(gt_concepts)
print(gt_concepts)

['file transfer protocol', 'bluetooth', 'asynchronous transfer mode', 'physical layer', 'network delay', 'file transfer', 'ethernet frame', 'domain name', 'stateless protocol', 'list of dns record types', 'network packet', 'traffic intensity', 'channel access method', 'attack (computing)', 'minimum spanning tree', 'virtual lan', 'ipv6', 'network interface controller', 'voice over ip', 'carrier sense multiple access with collision avoidance', 'internet protocol suite', 'list of http header fields', 'data center', 'open shortest path first', 'digital subscriber line access multiplexer', 'bit error rate', 'interference (communication)', 'threat (computer)', 'routing', '4g', 'store and forward', 'client (computing)', '2g', 'broadband', 'encapsulation (networking)', 'http cookie', 'client-server model', 'network layer', 'end system', 'lte (telecommunication)', 'data link layer', 'root name server', 'name server', 'carrier sense multiple access with collision detection', 'peer-to-peer', 'ses

In [40]:
count = 0
i = 0
for gt_concept in gt_concepts:
    for concept in pred_concepts:
        if (gt_concept in concept) or (concept in gt_concept):
            count += 1
            break
print(count/actual_total)

0.75


In [42]:
gt.values.to_list()

AttributeError: 'numpy.ndarray' object has no attribute 'to_list'