In [1]:
import pandas as pd
import numpy as np
import random
np.random.seed(0)
random.seed(None)
import csv

import matplotlib.pyplot as plt 
plt.rc("font", size=14)
import seaborn as sns
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)

from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier

In [2]:
filename = 'Networking_concepts_features.csv'
data = pd.read_csv('../results/'+filename, engine='python', 
                   header=None, names=['ConceptA','ConceptB','ChapterDist','PageDist',
                                       'LineDist','WordDist','Total','LineCount','WordCount','Complexity'])
data.head()

Unnamed: 0,ConceptA,ConceptB,ChapterDist,PageDist,LineDist,WordDist,Total,LineCount,WordCount,Complexity
0,ip address,spyware,1301,2153,0,0,66,0,0,31
1,ip address,internet protocol,21736,44935,0,202,1815,4,1,-22
2,ip address,end-to-end delay,37130,151586,0,0,957,0,0,4
3,ip address,lte,-69202,-283942,0,-59,4818,12,1,-113
4,ip address,active queue management,1162,4459,0,0,33,0,0,32


In [3]:
filename = 'Networking_concepts_additional'
data_addn = pd.read_csv('../results/'+filename, engine='python', 
                   header=None, names=['ConceptA','ConceptB','Chap_Con','Dad'])
data_addn.head()

Unnamed: 0,ConceptA,ConceptB,Chap_Con,Dad
0,ip address,spyware,0.6,1000.0
1,ip address,internet protocol,0.152632,1000.0
2,ip address,end-to-end delay,0.166667,1000.0
3,ip address,lte,0.122222,1000.0
4,ip address,active queue management,1.1,1000.0


In [4]:
final_data = pd.merge(data, data_addn, on=['ConceptA','ConceptB'], how='inner')
data = final_data[['ConceptA','ConceptB','ChapterDist','PageDist',
                    'LineDist','WordDist','Total','LineCount','WordCount','Complexity','Chap_Con',
                    'Dad']]

In [5]:
# get the concepts from ground truth data
filename = 'proc_network_relation_v2.csv'
gt = pd.read_csv('../AL-CPL/features/'+filename, 
                     header=None, names=['ConceptA','ConceptB','Prereq'])
gt['Prereq'].unique()

array([1, 0], dtype=int64)

In [None]:
## Error Analysis : missing concepts
## get all concepts from concept extraction module
# pred_concepts = list(data['ConceptA'])
# pred_concepts += list(data['ConceptB'])
# pred_concepts = list(set(pred_concepts))
# pred_concepts = [concept.lower() for concept in pred_concepts]
# count_pred_concepts = len(pred_concepts)
# print(count_pred_concepts)

## get all concepts from ground truth
# gt_concepts = list(gt['ConceptA'])
# gt_concepts += list(gt['ConceptB'])
# gt_concepts = list(set(gt_concepts))
# gt_concepts = [gt_concept.lower() for gt_concept in gt_concepts]
# actual_total = len(gt_concepts)
# print(actual_total)

## Identify the concepts not in ground truth
# count = 0
# prev_count = 0
# i = 0
# for gt_concept in gt_concepts:
#     for concept in pred_concepts:
#         if (gt_concept in concept) or (concept in gt_concept):
#             count += 1
#             break
#     if count == prev_count:
#         print(gt_concept) # print concepts in predicted but not in ground truth

# left_out = actual_total - count
# print("Precision : ", count/(count+ (count_pred_concepts-count)))
# print("Recall : ", count/(count + left_out))

In [6]:
dat_list = data.values.tolist()
gt_list = gt.values.tolist()

def check_same(a,b):
    a = a.lower()
    b = b.lower()
    
    if (a in b) or (b in a):
        return True
    else:
        return False
    

train_set = []
i = 0
# filter the concept pairs + features such that ground truth concepts are filtered
for gt_val in gt_list:
    for dat_val in dat_list:
        if check_same(gt_val[0],dat_val[0]) and check_same(gt_val[1],dat_val[1]):
            train_set.append([gt_val[0],gt_val[1],*dat_val[2:],gt_val[2]])
            break
    i += 1
    print("Concept ", i, " is done")

# print sample from training data
print(train_set[0])

Concept  1  is done
Concept  2  is done
Concept  3  is done
Concept  4  is done
Concept  5  is done
Concept  6  is done
Concept  7  is done
Concept  8  is done
Concept  9  is done
Concept  10  is done
Concept  11  is done
Concept  12  is done
Concept  13  is done
Concept  14  is done
Concept  15  is done
Concept  16  is done
Concept  17  is done
Concept  18  is done
Concept  19  is done
Concept  20  is done
Concept  21  is done
Concept  22  is done
Concept  23  is done
Concept  24  is done
Concept  25  is done
Concept  26  is done
Concept  27  is done
Concept  28  is done
Concept  29  is done
Concept  30  is done
Concept  31  is done
Concept  32  is done
Concept  33  is done
Concept  34  is done
Concept  35  is done
Concept  36  is done
Concept  37  is done
Concept  38  is done
Concept  39  is done
Concept  40  is done
Concept  41  is done
Concept  42  is done
Concept  43  is done
Concept  44  is done
Concept  45  is done
Concept  46  is done
Concept  47  is done
Concept  48  is done
C

Concept  393  is done
Concept  394  is done
Concept  395  is done
Concept  396  is done
Concept  397  is done
Concept  398  is done
Concept  399  is done
Concept  400  is done
Concept  401  is done
Concept  402  is done
Concept  403  is done
Concept  404  is done
Concept  405  is done
Concept  406  is done
Concept  407  is done
Concept  408  is done
Concept  409  is done
Concept  410  is done
Concept  411  is done
Concept  412  is done
Concept  413  is done
Concept  414  is done
Concept  415  is done
Concept  416  is done
Concept  417  is done
Concept  418  is done
Concept  419  is done
Concept  420  is done
Concept  421  is done
Concept  422  is done
Concept  423  is done
Concept  424  is done
Concept  425  is done
Concept  426  is done
Concept  427  is done
Concept  428  is done
Concept  429  is done
Concept  430  is done
Concept  431  is done
Concept  432  is done
Concept  433  is done
Concept  434  is done
Concept  435  is done
Concept  436  is done
Concept  437  is done
Concept  4

Concept  777  is done
Concept  778  is done
Concept  779  is done
Concept  780  is done
Concept  781  is done
Concept  782  is done
Concept  783  is done
Concept  784  is done
Concept  785  is done
Concept  786  is done
Concept  787  is done
Concept  788  is done
Concept  789  is done
Concept  790  is done
Concept  791  is done
Concept  792  is done
Concept  793  is done
Concept  794  is done
Concept  795  is done
Concept  796  is done
Concept  797  is done
Concept  798  is done
Concept  799  is done
Concept  800  is done
Concept  801  is done
Concept  802  is done
Concept  803  is done
Concept  804  is done
Concept  805  is done
Concept  806  is done
Concept  807  is done
Concept  808  is done
Concept  809  is done
Concept  810  is done
Concept  811  is done
Concept  812  is done
Concept  813  is done
Concept  814  is done
Concept  815  is done
Concept  816  is done
Concept  817  is done
Concept  818  is done
Concept  819  is done
Concept  820  is done
Concept  821  is done
Concept  8

Concept  1152  is done
Concept  1153  is done
Concept  1154  is done
Concept  1155  is done
Concept  1156  is done
Concept  1157  is done
Concept  1158  is done
Concept  1159  is done
Concept  1160  is done
Concept  1161  is done
Concept  1162  is done
Concept  1163  is done
Concept  1164  is done
Concept  1165  is done
Concept  1166  is done
Concept  1167  is done
Concept  1168  is done
Concept  1169  is done
Concept  1170  is done
Concept  1171  is done
Concept  1172  is done
Concept  1173  is done
Concept  1174  is done
Concept  1175  is done
Concept  1176  is done
Concept  1177  is done
Concept  1178  is done
Concept  1179  is done
Concept  1180  is done
Concept  1181  is done
Concept  1182  is done
Concept  1183  is done
Concept  1184  is done
Concept  1185  is done
Concept  1186  is done
Concept  1187  is done
Concept  1188  is done
Concept  1189  is done
Concept  1190  is done
Concept  1191  is done
Concept  1192  is done
Concept  1193  is done
Concept  1194  is done
Concept  11

In [7]:
def correct(x):
    '''
    If arg is zero return 1,
    else return arg
    '''
    if x == 0:
        return 1
    else:
        return x

    
# save training data
with open("../results/train_data", 'w', newline="") as concept_file:
    wr = csv.writer(concept_file)
    wr.writerows(train_set)
# load clean data
clean_data = pd.read_csv("../results/train_data", header=None, names=['ConceptA','ConceptB','ChapterDist','PageDist',
                                       'LineDist','WordDist','Total','LineCount','WordCount','Complexity','Chap_Conce',
                                        'Dad_Score', 'Prereq'])
clean_data['Avg_ChapterDist'] = clean_data['ChapterDist']/clean_data['Total']
clean_data['Avg_PageDist'] = clean_data['PageDist']/clean_data['Total']
clean_data['LineCount'] = clean_data['LineCount'].apply(lambda x: correct(x)) # threshold line count, min=1
clean_data['WordCount'] = clean_data['WordCount'].apply(lambda x: correct(x)) # threshold word count, min=1
clean_data['Avg_LineDist'] = clean_data['LineDist']/clean_data['LineCount']
clean_data['Avg_WordDist'] = clean_data['WordDist']/clean_data['WordCount']
# calcualte trainig features
X = clean_data[['Complexity','Avg_ChapterDist','Avg_PageDist','Avg_LineDist',
                               'Avg_WordDist', 'Chap_Conce', 'Dad_Score']]
y = clean_data[['Prereq']]
# train model - Random Forest
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=0)
rfc = RandomForestClassifier(n_estimators=500, random_state = random.seed(1234)).fit(X_train, y_train)
# predict on test set
y_pred = rfc.predict(X_test)
tn, fp, fn, tp  = confusion_matrix(np.array(y_test), np.array(y_pred)).ravel()
print(tn, fp, fn, tp)
precision = tp / (tp+fp)
recall = tp / (tp+fn)
print("Precision : ", precision)
print("Recall : ", recall)
print("F1 Score : ", f1_score(y_test, y_pred))



161 11 27 40
Precision :  0.7843137254901961
Recall :  0.5970149253731343
F1 Score :  0.6779661016949152


## OLI Data

In [13]:
filename = 'olidata_features.csv'
data = pd.read_csv('../results/'+filename, engine='python', 
                   header=None, names=['ConceptA','ConceptB','ChapterDist','PageDist',
                                       'LineDist','WordDist','Total','LineCount','WordCount','Complexity'])
data.head()

Unnamed: 0,ConceptA,ConceptB,ChapterDist,PageDist,LineDist,WordDist,Total,LineCount,WordCount,Complexity
0,computing,multiple,1053,1465,0,0,100,0,0,-21
1,computing,one-hot encoding,49,93,0,0,4,0,0,3
2,computing,handling,195,363,0,0,12,0,0,1
3,computing,dropping,139,279,0,0,12,0,0,1
4,computing,applications,283,423,0,0,12,0,0,1


In [14]:
filename = 'olidata_additional'
data_addn = pd.read_csv('../results/'+filename, engine='python', 
                   header=None, names=['ConceptA','ConceptB','Chap_Con','Dad'])
data_addn.head()

Unnamed: 0,ConceptA,ConceptB,Chap_Con,Dad
0,computing,multiple,0.316667,1000.0
1,computing,one-hot encoding,1.25,1000.0
2,computing,handling,1.25,1000.0
3,computing,dropping,0.75,1000.0
4,computing,applications,0.583333,1000.0


In [15]:
final_data = pd.merge(data, data_addn, on=['ConceptA','ConceptB'], how='inner')
data = final_data[['ConceptA','ConceptB','ChapterDist','PageDist',
                    'LineDist','WordDist','Total','LineCount','WordCount','Complexity','Chap_Con',
                    'Dad']]
final_data.head()

Unnamed: 0,ConceptA,ConceptB,ChapterDist,PageDist,LineDist,WordDist,Total,LineCount,WordCount,Complexity,Chap_Con,Dad
0,computing,multiple,1053,1465,0,0,100,0,0,-21,0.316667,1000.0
1,computing,one-hot encoding,49,93,0,0,4,0,0,3,1.25,1000.0
2,computing,handling,195,363,0,0,12,0,0,1,1.25,1000.0
3,computing,dropping,139,279,0,0,12,0,0,1,0.75,1000.0
4,computing,applications,283,423,0,0,12,0,0,1,0.583333,1000.0


In [18]:
clean_data = final_data
clean_data['Avg_ChapterDist'] = clean_data['ChapterDist']/clean_data['Total']
clean_data['Avg_PageDist'] = clean_data['PageDist']/clean_data['Total']
clean_data['LineCount'] = clean_data['LineCount'].apply(lambda x: correct(x)) # threshold line count, min=1
clean_data['WordCount'] = clean_data['WordCount'].apply(lambda x: correct(x)) # threshold word count, min=1
clean_data['Avg_LineDist'] = clean_data['LineDist']/clean_data['LineCount']
clean_data['Avg_WordDist'] = clean_data['WordDist']/clean_data['WordCount']
# calcualte trainig features
X = clean_data[['Complexity','Avg_ChapterDist','Avg_PageDist','Avg_LineDist',
                               'Avg_WordDist', 'Chap_Con', 'Dad']]

In [19]:
out = clean_data[['ConceptA','ConceptB']]
out['pred'] = rfc.predict(X)
out.to_csv('OLIData', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
filename = 'pythondatasciencehandbook_concepts_features.csv'
data = pd.read_csv('../results/'+filename, engine='python', 
                   header=None, names=['ConceptA','ConceptB','ChapterDist','PageDist',
                                       'LineDist','WordDist','Total','LineCount','WordCount','Complexity'])
filename = 'pythondatasciencehandbook_concepts_additional'
data_addn = pd.read_csv('../results/'+filename, engine='python', 
                   header=None, names=['ConceptA','ConceptB','Chap_Con','Dad'])

final_data = pd.merge(data, data_addn, on=['ConceptA','ConceptB'], how='inner')
data = final_data[['ConceptA','ConceptB','ChapterDist','PageDist',
                    'LineDist','WordDist','Total','LineCount','WordCount','Complexity','Chap_Con',
                    'Dad']]


clean_data = final_data
clean_data['Avg_ChapterDist'] = clean_data['ChapterDist']/clean_data['Total']
clean_data['Avg_PageDist'] = clean_data['PageDist']/clean_data['Total']
clean_data['LineCount'] = clean_data['LineCount'].apply(lambda x: correct(x)) # threshold line count, min=1
clean_data['WordCount'] = clean_data['WordCount'].apply(lambda x: correct(x)) # threshold word count, min=1
clean_data['Avg_LineDist'] = clean_data['LineDist']/clean_data['LineCount']
clean_data['Avg_WordDist'] = clean_data['WordDist']/clean_data['WordCount']
# calcualte trainig features
X = clean_data[['Complexity','Avg_ChapterDist','Avg_PageDist','Avg_LineDist',
                               'Avg_WordDist', 'Chap_Con', 'Dad']]

out = clean_data[['ConceptA','ConceptB']]
out['pred'] = rfc.predict(X)
out.to_csv('pythondatasciencehandbook.csv', index=False)

In [22]:
filename = 'pythondatasciencehandbook_concepts_features.csv'
data = pd.read_csv('../results/'+filename, engine='python', 
                   header=None, names=['ConceptA','ConceptB','ChapterDist','PageDist',
                                       'LineDist','WordDist','Total','LineCount','WordCount','Complexity'])
filename = 'pythondatasciencehandbook_concepts_additional'
data_addn = pd.read_csv('../results/'+filename, engine='python', 
                   header=None, names=['ConceptA','ConceptB','Chap_Con','Dad'])

final_data = pd.merge(data, data_addn, on=['ConceptA','ConceptB'], how='inner')
data = final_data[['ConceptA','ConceptB','ChapterDist','PageDist',
                    'LineDist','WordDist','Total','LineCount','WordCount','Complexity','Chap_Con',
                    'Dad']]


clean_data = final_data
clean_data['Avg_ChapterDist'] = clean_data['ChapterDist']/clean_data['Total']
clean_data['Avg_PageDist'] = clean_data['PageDist']/clean_data['Total']
clean_data['LineCount'] = clean_data['LineCount'].apply(lambda x: correct(x)) # threshold line count, min=1
clean_data['WordCount'] = clean_data['WordCount'].apply(lambda x: correct(x)) # threshold word count, min=1
clean_data['Avg_LineDist'] = clean_data['LineDist']/clean_data['LineCount']
clean_data['Avg_WordDist'] = clean_data['WordDist']/clean_data['WordCount']
# calcualte trainig features
X = clean_data[['Complexity','Avg_ChapterDist','Avg_PageDist','Avg_LineDist',
                               'Avg_WordDist', 'Chap_Con', 'Dad']]

out = clean_data[['ConceptA','ConceptB']]
out['pred'] = rfc.predict(X)
out.to_csv('pythondatasciencehandbook.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [23]:
filename = 'Foundations of Data Science - Cornell CS_concepts_features.csv'
data = pd.read_csv('../results/'+filename, engine='python', 
                   header=None, names=['ConceptA','ConceptB','ChapterDist','PageDist',
                                       'LineDist','WordDist','Total','LineCount','WordCount','Complexity'])
filename = 'Foundations of Data Science - Cornell CS_concepts_additional'
data_addn = pd.read_csv('../results/'+filename, engine='python', 
                   header=None, names=['ConceptA','ConceptB','Chap_Con','Dad'])

final_data = pd.merge(data, data_addn, on=['ConceptA','ConceptB'], how='inner')
data = final_data[['ConceptA','ConceptB','ChapterDist','PageDist',
                    'LineDist','WordDist','Total','LineCount','WordCount','Complexity','Chap_Con',
                    'Dad']]


clean_data = final_data
clean_data['Avg_ChapterDist'] = clean_data['ChapterDist']/clean_data['Total']
clean_data['Avg_PageDist'] = clean_data['PageDist']/clean_data['Total']
clean_data['LineCount'] = clean_data['LineCount'].apply(lambda x: correct(x)) # threshold line count, min=1
clean_data['WordCount'] = clean_data['WordCount'].apply(lambda x: correct(x)) # threshold word count, min=1
clean_data['Avg_LineDist'] = clean_data['LineDist']/clean_data['LineCount']
clean_data['Avg_WordDist'] = clean_data['WordDist']/clean_data['WordCount']
# calcualte trainig features
X = clean_data[['Complexity','Avg_ChapterDist','Avg_PageDist','Avg_LineDist',
                               'Avg_WordDist', 'Chap_Con', 'Dad']]

out = clean_data[['ConceptA','ConceptB']]
out['pred'] = rfc.predict(X)
out.to_csv('Foundations_of_Data_Science_Cornell.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [8]:
# get all the outputs
out = clean_data[['ConceptA','ConceptB']]
out['pred'] = rfc.predict(X)
out.to_csv('networking_pred_data', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
feature_names = X.columns
tree_feature_importances = rfc.feature_importances_
sorted_idx = tree_feature_importances.argsort()

y_ticks = np.arange(0, len(feature_names))
fig, ax = plt.subplots()
ax.barh(y_ticks, tree_feature_importances[sorted_idx])
ax.set_yticklabels(feature_names[sorted_idx])
ax.set_yticks(y_ticks)
ax.set_title("Random Forest Feature Importances (MDI)")
fig.tight_layout()
plt.show()

In [None]:
def bucketize(x):
    buckets = list(range(-200,500,50))
    for i in buckets:
        if x < i:
            return(i)

clean_data['Avg_Dist_Bucket'] = clean_data['Avg_ChapterDist'].apply(lambda x: bucketize(x))

In [None]:
# clean_data_incorrect = clean_data[clean_data["Prereq"]!=clean_data['pred']]
# clean_data_correct = clean_data[clean_data["Prereq"]==clean_data['pred']]
# clean_data_fp = clean_data[ (clean_data["Prereq"]==0) & (clean_data["pred"]==1) ]
# clean_data_fn = clean_data[ (clean_data["Prereq"]==1) & (clean_data["pred"]==0) ]

In [None]:
# clean_data['fp'] = clean_data['Prereq'].apply(lambda x: x==0) & clean_data['pred'].apply(lambda x:x==1)

# grp_data = clean_data.groupby(clean_data['Avg_Dist_Bucket'])[['Prereq','fp']].sum().reset_index()
# grp_data['fp_precent'] = grp_data['fp']/grp_data['Prereq'] * 100
# grp_data

In [None]:
clean_data_ones = clean_data
grp_data = clean_data_ones.groupby(clean_data_ones['Avg_Dist_Bucket'])[['Prereq','pred']].sum()#.reset_index()
#grp_data['fn_precent'] = grp_data['fn']/grp_data['Prereq'] * 100
plt.figure(figsize=[10,5])
grp_data.plot.bar()
plt.xlabel('Average Chapter Distance')
plt.ylabel('Count of True Instances')
plt.title('AvgChapDist vs Count')

In [None]:
clean_data_ones = clean_data[clean_data['Prereq']==1]
plt.figure(figsize=[10,5])
plt.hist(clean_data_ones['Avg_PageDist'], color='g', bins=20)
plt.xlabel('Average Page Distance')
plt.ylabel('Count of Instances')
plt.title('AvgPageDist vs Count')

In [None]:
clean_data_fn = clean_data[(clean_data['Prereq']==1) & (clean_data['pred']==0)]
plt.figure(figsize=[10,5])
plt.hist(clean_data_fn['Avg_ChapterDist'], color='r', bins=20)
plt.xlabel('Average Chapter Distance')
plt.ylabel('Count of Instances')
plt.title('AvgChapDist vs Count')
plt.figure(figsize=(10,15))

In [None]:
clean_data_zeros = clean_data[clean_data['Prereq']==0]
plt.figure(figsize=[10,5])
plt.hist(clean_data_zeros['Avg_PageDist'], color='g', bins=20)
plt.xlabel('Average Page Distance')
plt.ylabel('Count of Instances')
plt.title('AvgPageDist vs Count')

In [None]:
clean_data_fp = clean_data[(clean_data['Prereq']==0) & (clean_data['pred']==1)]
plt.figure(figsize=[10,5])
plt.hist(clean_data_fp['Avg_PageDist'], color='r', bins=20)
plt.xlabel('Average Page Distance')
plt.ylabel('Count of Instances')
plt.title('AvgPageDist vs Count')
plt.figure(figsize=(10,15))

In [None]:
plt.figure(figsize=[10,5])
plt.hist(clean_data_fp['Avg_ChapterDist'], color='r')
plt.xlabel('Average Chapter Distance')
plt.ylabel('Count of Instances')
plt.title('AvgChapDist vs Count')
plt.figure(figsize=(10,15))

In [None]:
plt.figure(figsize=[10,5])
plt.hist(clean_data_fn['Avg_ChapterDist'], color='r')
plt.xlabel('Average Chapter Distance')
plt.ylabel('Count of Instances')
plt.title('AvgChapDist vs Count')
plt.figure(figsize=(10,15))

In [None]:
plt.figure(figsize=[10,5])
plt.hist(clean_data_correct[clean_data['Prereq'] ==1]['Avg_ChapterDist'], color='g')
plt.xlabel('Average Chapter Distance')
plt.ylabel('Count of Instances')
plt.title('AvgChapDist vs Count')
plt.figure(figsize=(10,15))

In [None]:
# logreg = LogisticRegression(max_iter=1000)
# logreg.fit(X_train, y_train)
# y_pred = logreg.predict(X_test)
# print("Lenght of train: ", len(X_train))
# print("Length of test: ", len(X_test))

# tn, fp, fn, tp  = confusion_matrix(y_test, y_pred).ravel()
# print(tn, fp, fn, tp)
# precision = tp / (tp+fp)
# recall = tp / (tp+fn)
# print("Precision : ", precision)
# print("Recall : ", recall)
# print("F1 Score : ", f1_score(y_test, y_pred))

In [None]:
# os = SMOTE(random_state=0)

# columns = X_train.columns
# os_data_X, os_data_y = os.fit_sample(X_train, y_train)

# # we can Check the numbers of our data
# print("length of oversampled data is ",len(os_data_X))
# print("Number of no subscription in oversampled data",len(os_data_y[os_data_y['Prereq']==0]))
# print("Number of subscription",len(os_data_y[os_data_y['Prereq']==1]))

# logreg = LogisticRegression(max_iter=1000)
# logreg.fit(os_data_X, os_data_y)
# y_pred = logreg.predict(X_test)

# tn, fp, fn, tp  = confusion_matrix(y_test, y_pred).ravel()
# print(tn, fp, fn, tp)
# precision = tp / (tp+fp)
# recall = tp / (tp+fn)
# print("Precision : ", precision)
# print("Recall : ", recall)
# print("F1 Score : ", f1_score(y_test, y_pred))

In [None]:
# clean_data['pred'] = rfc.predict(X)
# clean_data[['ConceptA','ConceptB','pred']]
# clean_data[['ConceptA','ConceptB','pred']].to_csv('../results/pred_data',index=False)

In [None]:
# #Importing MLPClassifier
# from sklearn.neural_network import MLPClassifier

# #Initializing the MLPClassifier
# classifier = MLPClassifier(hidden_layer_sizes=(150,125,100,75,50,25), max_iter=1000, activation = 'relu',
#                                solver='adam',random_state=1)

# #Fitting the training data to the network
# X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=0)
# os = SMOTE(random_state=0)
# columns = X_train.columns
# os_data_X, os_data_y = os.fit_sample(X_train, y_train)
# classifier.fit(os_data_X, os_data_y)
# # classifier.fit(X_train, y_train)

# #Predicting y for X_val
# y_pred = classifier.predict(X_test)
# tn, fp, fn, tp  = confusion_matrix(np.array(y_test), np.array(y_pred)).ravel()
# print(tn, fp, fn, tp)
# precision = tp / (tp+fp)
# recall = tp / (tp+fn)
# print("Precision : ", precision)
# print("Recall : ", recall)
# print("F1 Score : ", f1_score(y_test, y_pred))

In [None]:
# #Svm model
# from sklearn import svm
# clf = svm.SVC(kernel='linear') # Linear Kernel

# X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=0)
# clf.fit(X_train, y_train)
# #Predict the response for test dataset
# y_pred = clf.predict(X_test)

# tn, fp, fn, tp  = confusion_matrix(np.array(y_test), np.array(y_pred)).ravel()
# print(tn, fp, fn, tp)
# precision = tp / (tp+fp)
# recall = tp / (tp+fn)
# print("Precision : ", precision)
# print("Recall : ", recall)
# print("F1 Score : ", f1_score(y_test, y_pred))