In [146]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
plt.rc("font", size=14)
import seaborn as sns
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)

from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier

In [8]:
from spellchecker import SpellChecker
from nltk import word_tokenize

def is_abbrev_single(a):
    
    a = a.strip()
    
    if a.isupper() and len(word_tokenize(a)) == 0:
        return True
    
    return False

def is_abbrev(a, b):
        
    a_abbrev = ""
    for word in word_tokenize(a.replace("-", " ")):
        a_abbrev += word[0]
        
    if(a_abbrev.strip().lower() == b.strip().lower()):
        return True
    
    b_abbrev = ""
    for word in word_tokenize(b.replace("-", " ")):
        b_abbrev += word[0]
            
    if(b_abbrev.strip().lower() == a.strip().lower()):
        return True
    
    return False

#Returns True if both concepts, a and b refer to the same thing, False otherwise
def entity_resolution(a, b):

    a = a.strip()
    b = b.strip()
    
    #Lowercase
    if a.lower() == b.lower():
        print("Reason: Lowercase")
        return True
    
    a = a.lower()
    b = b.lower()
    
    if is_abbrev(a, b):
        print("Reason: abbrev")
        return True

    #Jumbled
    words_a = [i for i in word_tokenize(a.replace("-", " "))]
    words_b = [i for i in word_tokenize(b.replace("-", " "))]
    if set(words_a) == set(words_b):
        print("Reason: Jumbled/Hyphen")
        return True

    #Substring
    if a in b:
        b_rest = b.replace(a, "").replace("(", "").replace(")", "")
        if is_abbrev(a, b_rest):
            print("Reason: abbrev1")
            return True

    if b in a:
        a_rest = a.replace(b, "").replace("(", "").replace(")", "")
        if is_abbrev(b, a_rest):
            print("Reason: abbrev2")
            return True

    return False

In [11]:
data = pd.read_csv('../results/Networking_concepts_features.csv', engine='python')
data.head()

Unnamed: 0,ConceptA,ConceptB,ChapterDist,PageDist,LineDist,WordDist,Total,LineCount,WordCount,Complexity
0,abramson,norman,1873,5743,0,7,16,2,1,15
1,abramson,access control lists,-1791,-5009,0,0,16,0,0,15
2,abramson,access delay,3875,12861,0,0,48,0,0,13
3,abramson,access isp,44652,136996,0,0,448,0,0,-12
4,abramson,access networks,8674,15614,0,31,1056,7,1,-50


In [93]:
gt = pd.read_csv('../AL-CPL/features/proc_network_relation_v2.csv')
gt['Prereq'].unique()

array([1, 0], dtype=int64)

In [32]:
pred_concepts = list(data['ConceptA'])
pred_concepts += list(data['ConceptB'])
pred_concepts = list(set(pred_concepts))
pred_concepts = [pred_concept.lower() for pred_concept in pred_concepts]
pred_total = len(pred_concepts)
print(pred_concepts)

['best-effort networks', 'costs', 'idea', 'registrars', 'route summarization', 'selective acknowledgment', 'development', 'standards', 'block', 'source-specific congestion-control actions', 'ip addresses', 'software control plane', 'ipv4 header', '802.11i', 'time slots', 'tcp ports', 'traffic engineering', 'transport-layer protocols', 'security management', 'continued evolution of', '2g', 'instantaneous', 'known-plaintext attack', 'initial state', 'bottleneck link', 'distance vector', 'hosts', 'imap server', 'foreign address', 'packet-forwarding decisions', 'service classes', 'simon s.', 'overlay network', 'bandwidth sensitive', 'load balancing', 'query arp message', 'network protocols', 'congestion window', 'destination', 'network management data', 'congestion avoidance', 'hidden terminal problem', 'switch poisoning', 'traffic policing', 'overlapping fragments', 'client side of http', 'links', 'application-layer', 'source address', 'client-server architecture', 'connection replay atta

In [95]:
gt_concepts = list(gt['ConceptA'])
gt_concepts += list(gt['ConceptB'])
gt_concepts = list(set(gt_concepts))
gt_concepts = [gt_concept.lower() for gt_concept in gt_concepts]
actual_total = len(gt_concepts)
print(gt_concepts)

['file transfer protocol', 'bluetooth', 'asynchronous transfer mode', 'physical layer', 'network delay', 'file transfer', 'ethernet frame', 'domain name', 'stateless protocol', 'list of dns record types', 'network packet', 'traffic intensity', 'channel access method', 'attack (computing)', 'minimum spanning tree', 'virtual lan', 'ipv6', 'network interface controller', 'voice over ip', 'carrier sense multiple access with collision avoidance', 'internet protocol suite', 'list of http header fields', 'data center', 'open shortest path first', 'digital subscriber line access multiplexer', 'bit error rate', 'interference (communication)', 'threat (computer)', 'routing', '4g', 'store and forward', 'client (computing)', '2g', 'broadband', 'encapsulation (networking)', 'http cookie', 'client-server model', 'network layer', 'end system', 'lte (telecommunication)', 'data link layer', 'root name server', 'name server', 'carrier sense multiple access with collision detection', 'peer-to-peer', 'ses

In [96]:
count = 0
prev_count = 0
i = 0
for gt_concept in gt_concepts:
    for concept in pred_concepts:
        #if entity_resolution(gt_concept, concept):
        if (gt_concept in concept) or (concept in gt_concept):
            count += 1
            break
    if count == prev_count:
        print(gt_concept)
    prev_count = count
#     i = i+1
#     print("Done", i)
print(count/actual_total)

network delay
domain name
network packet
channel access method
virtual lan
network interface controller
bit error rate
interference (communication)
threat (computer)
store and forward
broadband
client-server model
lte (telecommunication)
root name server
peer-to-peer
dsl modem
base transceiver station
top-level domain
simple mail transfer protocol
address resolution protocol
twisted pair
network socket
domain name system
bottleneck (network)
email
network delay
selective repeat arq
computer virus
random early detection
subnetwork
osi model
post office protocol
automatic repeat request
network scheduler
distributed hash table
computer worm
traffic contract
network congestion
slow-start
switched fabric
hypertext transfer protocol
network address translation
virtual circuit
denial-of-service attack
forward error correction
wi-fi
optical fiber
go-back-n arq
negative-acknowledge character
inter-process communication
0.75


In [99]:
dat_list = data.values.tolist()
gt_list = gt.values.tolist()

In [100]:
def check_same(a,b):
    a = a.lower()
    b = b.lower()
    
    if (a in b) or (b in a):
        return True
    else:
        return False
    

train_set = []
i = 0
for gt_val in gt_list:
    for dat_val in dat_list:
        if check_same(gt_val[0],dat_val[0]) and check_same(gt_val[1],dat_val[1]):
            train_set.append([gt_val[0],gt_val[1],*dat_val[2:],gt_val[2]])
            break
    i += 1
    print("Concept ", i, " is done")
    

Concept  1  is done
Concept  2  is done
Concept  3  is done
Concept  4  is done
Concept  5  is done
Concept  6  is done
Concept  7  is done
Concept  8  is done
Concept  9  is done
Concept  10  is done
Concept  11  is done
Concept  12  is done
Concept  13  is done
Concept  14  is done
Concept  15  is done
Concept  16  is done
Concept  17  is done
Concept  18  is done
Concept  19  is done
Concept  20  is done
Concept  21  is done
Concept  22  is done
Concept  23  is done
Concept  24  is done
Concept  25  is done
Concept  26  is done
Concept  27  is done
Concept  28  is done
Concept  29  is done
Concept  30  is done
Concept  31  is done
Concept  32  is done
Concept  33  is done
Concept  34  is done
Concept  35  is done
Concept  36  is done
Concept  37  is done
Concept  38  is done
Concept  39  is done
Concept  40  is done
Concept  41  is done
Concept  42  is done
Concept  43  is done
Concept  44  is done
Concept  45  is done
Concept  46  is done
Concept  47  is done
Concept  48  is done
C

Concept  379  is done
Concept  380  is done
Concept  381  is done
Concept  382  is done
Concept  383  is done
Concept  384  is done
Concept  385  is done
Concept  386  is done
Concept  387  is done
Concept  388  is done
Concept  389  is done
Concept  390  is done
Concept  391  is done
Concept  392  is done
Concept  393  is done
Concept  394  is done
Concept  395  is done
Concept  396  is done
Concept  397  is done
Concept  398  is done
Concept  399  is done
Concept  400  is done
Concept  401  is done
Concept  402  is done
Concept  403  is done
Concept  404  is done
Concept  405  is done
Concept  406  is done
Concept  407  is done
Concept  408  is done
Concept  409  is done
Concept  410  is done
Concept  411  is done
Concept  412  is done
Concept  413  is done
Concept  414  is done
Concept  415  is done
Concept  416  is done
Concept  417  is done
Concept  418  is done
Concept  419  is done
Concept  420  is done
Concept  421  is done
Concept  422  is done
Concept  423  is done
Concept  4

Concept  755  is done
Concept  756  is done
Concept  757  is done
Concept  758  is done
Concept  759  is done
Concept  760  is done
Concept  761  is done
Concept  762  is done
Concept  763  is done
Concept  764  is done
Concept  765  is done
Concept  766  is done
Concept  767  is done
Concept  768  is done
Concept  769  is done
Concept  770  is done
Concept  771  is done
Concept  772  is done
Concept  773  is done
Concept  774  is done
Concept  775  is done
Concept  776  is done
Concept  777  is done
Concept  778  is done
Concept  779  is done
Concept  780  is done
Concept  781  is done
Concept  782  is done
Concept  783  is done
Concept  784  is done
Concept  785  is done
Concept  786  is done
Concept  787  is done
Concept  788  is done
Concept  789  is done
Concept  790  is done
Concept  791  is done
Concept  792  is done
Concept  793  is done
Concept  794  is done
Concept  795  is done
Concept  796  is done
Concept  797  is done
Concept  798  is done
Concept  799  is done
Concept  8

Concept  1122  is done
Concept  1123  is done
Concept  1124  is done
Concept  1125  is done
Concept  1126  is done
Concept  1127  is done
Concept  1128  is done
Concept  1129  is done
Concept  1130  is done
Concept  1131  is done
Concept  1132  is done
Concept  1133  is done
Concept  1134  is done
Concept  1135  is done
Concept  1136  is done
Concept  1137  is done
Concept  1138  is done
Concept  1139  is done
Concept  1140  is done
Concept  1141  is done
Concept  1142  is done
Concept  1143  is done
Concept  1144  is done
Concept  1145  is done
Concept  1146  is done
Concept  1147  is done
Concept  1148  is done
Concept  1149  is done
Concept  1150  is done
Concept  1151  is done
Concept  1152  is done
Concept  1153  is done
Concept  1154  is done
Concept  1155  is done
Concept  1156  is done
Concept  1157  is done
Concept  1158  is done
Concept  1159  is done
Concept  1160  is done
Concept  1161  is done
Concept  1162  is done
Concept  1163  is done
Concept  1164  is done
Concept  11

Concept  1480  is done
Concept  1481  is done
Concept  1482  is done
Concept  1483  is done
Concept  1484  is done
Concept  1485  is done
Concept  1486  is done
Concept  1487  is done
Concept  1488  is done
Concept  1489  is done
Concept  1490  is done
Concept  1491  is done
Concept  1492  is done
Concept  1493  is done
Concept  1494  is done
Concept  1495  is done
Concept  1496  is done
Concept  1497  is done
Concept  1498  is done
Concept  1499  is done
Concept  1500  is done


In [101]:
import csv
with open("../results/train_data", 'w', newline="") as concept_file:
    wr = csv.writer(concept_file)
    #wr.writerows(['ConceptA',])
    wr.writerows(train_set)

In [120]:
clean_data = pd.read_csv("../results/train_data")
clean_data.head()
clean_data.shape

(871, 11)

In [103]:
def correct(x):
    if x == 0:
        return 1
    else:
        return x


clean_data['Avg_ChapterDist'] = clean_data['ChapterDist']/clean_data['Total']
clean_data['Avg_PageDist'] = clean_data['PageDist']/clean_data['Total']
clean_data['LineCount'] = clean_data['LineCount'].apply(lambda x: correct(x))
clean_data['WordCount'] = clean_data['WordCount'].apply(lambda x: correct(x))
clean_data['Avg_LineDist'] = clean_data['LineDist']/clean_data['LineCount']
clean_data['Avg_WordDist'] = clean_data['WordDist']/clean_data['WordCount']

In [104]:
clean_data.head()

Unnamed: 0,ConceptA,ConceptB,ChapterDist,PageDist,LineDist,WordDist,Total,LineCount,WordCount,Complexity,Prereq,Avg_ChapterDist,Avg_PageDist,Avg_LineDist,Avg_WordDist
0,Computer network,End system,706449,2145756,0,-306,10602,66,4,21,1,66.63356,202.391624,0.0,-76.5
1,End system,Server (computing),-386088,-1190037,0,0,4185,7,1,48,0,-92.255197,-284.357706,0.0,0.0
2,End system,Router (computing),-386088,-1190037,0,0,4185,7,1,48,0,-92.255197,-284.357706,0.0,0.0
3,Computer network,Network security,-2233404,-6659796,0,314,26676,114,43,-120,1,-83.723347,-249.654971,0.0,7.302326
4,Computer network,Link layer,7851,22500,0,0,342,1,1,111,1,22.95614,65.789474,0.0,0.0


In [105]:
train_clean_data = clean_data[['Complexity','Avg_ChapterDist','Avg_PageDist','Avg_LineDist','Avg_WordDist','Prereq']]
train_clean_data.head()

Unnamed: 0,Complexity,Avg_ChapterDist,Avg_PageDist,Avg_LineDist,Avg_WordDist,Prereq
0,21,66.63356,202.391624,0.0,-76.5,1
1,48,-92.255197,-284.357706,0.0,0.0,0
2,48,-92.255197,-284.357706,0.0,0.0,0
3,-120,-83.723347,-249.654971,0.0,7.302326,1
4,111,22.95614,65.789474,0.0,0.0,1


In [106]:
train_clean_data['Prereq'].unique()

array([1, 0], dtype=int64)

In [107]:
X = train_clean_data.loc[:, clean_data.columns != 'Prereq']
y = train_clean_data.loc[:, clean_data.columns == 'Prereq']

In [140]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=0)

In [144]:
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print("Lenght of train: ", len(X_train))
print("Length of test: ", len(X_test))

tn, fp, fn, tp  = confusion_matrix(y_test, y_pred).ravel()
print(tn, fp, fn, tp)
precision = tp / (tp+fp)
recall = tp / (tp+fn)
print("Precision : ", precision)
print("Recall : ", recall)
print("F1 Score : ", f1_score(y_test, y_pred))

Lenght of train:  696
Length of test:  175
126 0 47 2
Precision :  1.0
Recall :  0.04081632653061224
F1 Score :  0.07843137254901959


  y = column_or_1d(y, warn=True)


In [145]:
os = SMOTE(random_state=0)

columns = X_train.columns
os_data_X, os_data_y = os.fit_sample(X_train, y_train)

# we can Check the numbers of our data
print("length of oversampled data is ",len(os_data_X))
print("Number of no subscription in oversampled data",len(os_data_y[os_data_y['Prereq']==0]))
print("Number of subscription",len(os_data_y[os_data_y['Prereq']==1]))

logreg = LogisticRegression(max_iter=1000)
logreg.fit(os_data_X, os_data_y)
y_pred = logreg.predict(X_test)

tn, fp, fn, tp  = confusion_matrix(y_test, y_pred).ravel()
print(tn, fp, fn, tp)
precision = tp / (tp+fp)
recall = tp / (tp+fn)
print("Precision : ", precision)
print("Recall : ", recall)
print("F1 Score : ", f1_score(y_test, y_pred))

length of oversampled data is  1004
Number of no subscription in oversampled data 502
Number of subscription 502
78 48 21 28
Precision :  0.3684210526315789
Recall :  0.5714285714285714
F1 Score :  0.44799999999999995


  y = column_or_1d(y, warn=True)


In [174]:
# train model
import numpy as np
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=0)
rfc = RandomForestClassifier(n_estimators=200).fit(X_train, y_train)

# predict on test set
y_pred = rfc.predict(X_test)
tn, fp, fn, tp  = confusion_matrix(np.array(y_test), np.array(y_pred)).ravel()
print(tn, fp, fn, tp)
precision = tp / (tp+fp)
recall = tp / (tp+fn)
print("Precision : ", precision)
print("Recall : ", recall)
print("F1 Score : ", f1_score(y_test, y_pred))

  after removing the cwd from sys.path.


111 15 14 35
Precision :  0.7
Recall :  0.7142857142857143
F1 Score :  0.7070707070707072


In [176]:
clean_data['pred'] = rfc.predict(X)

In [177]:
clean_data[['ConceptA','ConceptB','pred']]

Unnamed: 0,ConceptA,ConceptB,pred
0,Computer network,End system,1
1,End system,Server (computing),0
2,End system,Router (computing),0
3,Computer network,Network security,1
4,Computer network,Link layer,1
...,...,...,...
866,Handshaking,HTTP persistent connection,1
867,HTTP persistent connection,Web page,0
868,HTTP persistent connection,Transmission Control Protocol,0
869,List of HTTP header fields,Port (computer networking),0


In [182]:
clean_data[['ConceptA','ConceptB','pred']].to_csv('../results/pred_data',index=False)