# Functions

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!python /content/drive/MyDrive/child-level-data.py

         Domain        area  ... child_values_2.1                                             fields
0  biochemistry    Genetics  ...               5%  ['Statistical Genetics', 'Genetic Epidemiology...
1  biochemistry    Genetics  ...               3%                           ['pancreas and biliary']
2       Medical      Cancer  ...              14%                                         ['cancer']
3  biochemistry  Enzymology  ...              18%  ['Physical Chemistry', 'Theoretical Chemistry'...
4       Medical    Medicare  ...              15%  ['Circadian Rhythms in Heart Ischemia and Acut...

[5 rows x 15 columns]
biochemistry
Medical
ECE
Psychology
Civil
MAE
CS


In [3]:
#SKLEARN LIBRARIES
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split,KFold

#NLTK LIBRARIES
import nltk
from nltk.tokenize import sent_tokenize,word_tokenize,RegexpTokenizer
nltk.download('stopwords')
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
stop_words=stopwords.words("english")
from nltk.stem import PorterStemmer

from bs4 import BeautifulSoup
from IPython.display import clear_output
import numpy as np
import pandas as pd
import string
from matplotlib import pyplot as plt
from collections import Counter

#Tokenizing
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

def Progress(string):
    global iterator
    global number_of_records
    percent=((iterator/number_of_records)*10)

    if percent%5==0:
        clear_output(wait=True)
        print(string+":"+str(percent)+"%")

    iterator=iterator+1

def remove_strip(text):
    """Return ``text`` without its leading and trailing whitespace."""
    trimmed = text.strip()
    return trimmed

def remove_noise(text):
    """Return the plain-text content of ``text`` with markup removed.

    The input is parsed by BeautifulSoup using its ``html.parser`` backend
    and the text of the parsed document is returned.
    """
    return BeautifulSoup(text,"html.parser").get_text()

def remove_punc(text):
    """Drop punctuation from ``text``.

    Only runs of word characters (regex ``\\w+``) are kept; the surviving
    tokens are re-joined with single spaces.
    """
    word_tokens = RegexpTokenizer(r'\w+').tokenize(text)
    return ' '.join(word_tokens)

def remove_stopwords(text):
    """Lower-case ``text`` and drop every token listed in ``stop_words``.

    Tokens come from ``word_tokenize``; membership is tested on the
    lower-cased token, and the kept tokens are joined with single spaces.
    """
    kept = []
    for token in word_tokenize(text):
        lowered = token.lower()
        if lowered in stop_words:
            continue
        kept.append(lowered)
    return ' '.join(kept)

def stemming(text):
    """Replace every token of ``text`` with its Porter stem.

    The text is split with ``word_tokenize``, each token is passed through a
    fresh ``PorterStemmer``, and the stems are joined with single spaces.
    """
    tokens = word_tokenize(text)
    stemmer = PorterStemmer()
    stems = map(stemmer.stem, tokens)
    return ' '.join(stems)



def Vocab(text):
  """Record each distinct token of ``text`` once in the global ``vocab``.

  ``vocab`` must already exist at module level (the notebook creates it as a
  ``Counter``). It is updated in place, so after processing many documents
  each count is the number of documents that contained the token.

  Args:
      text: A single document string; it is split with ``word_tokenize``.
  """
  # dict.fromkeys de-duplicates while keeping first-occurrence order, which is
  # all the previous FreqDist round-trip was used for. Pass the keys view, not
  # the dict itself: Counter.update would treat a mapping as ready-made counts.
  distinct_tokens=dict.fromkeys(word_tokenize(text)).keys()
  vocab.update(distinct_tokens)

def build_vocab(documents):
  """Run ``Vocab`` over every document and return the shared ``vocab``.

  Args:
      documents: Object whose ``apply`` method calls ``Vocab`` on each
          document (the notebook passes the ``fields`` column).

  Returns:
      The module-level ``vocab`` object that ``Vocab`` fills in. It is not
      reset here, so the caller decides when to start from an empty one.
  """
  # Vocab works purely by side effect, so the result of apply() is not kept.
  documents.apply(Vocab)

  return vocab

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [4]:
import numpy as np
import pandas as pd
import string
import os

# Second-level (child) datasets, one per first-level domain.
areas=["CS","Civil","ECE","MAE","Medical","Psychology","biochemistry"]
child_data={}

for area in areas:
  # Make sure the per-area directory exists on Drive. exist_ok=True makes
  # re-running the cell a no-op instead of a separate exists()/mkdir() pair.
  os.makedirs(f'/content/drive/MyDrive/{area}', exist_ok=True)

  # NOTE(review): these CSVs are read from /content, presumably written there
  # by the child-level-data.py run earlier in the notebook - confirm.
  child_data[area]=pd.read_csv(f"/content/{area}.csv")

# Full survey table holding the text ("fields") and both label levels.
data=pd.read_csv('/content/drive/MyDrive/Science Survey.csv')

In [5]:
# Display the child-level table loaded for the "CS" area.
child_data["CS"]

Unnamed: 0,index,fields,Domain
0,86,"['Statistics', 'Biostatistics', 'Human-data in...",Data structures
1,87,['Medical Image Analysis and Informatics'],Image processing
2,108,"['psychometrics', 'IRT', 'longitudinal data an...",Data structures
3,119,"['Multimedia networks and protocols', 'Securit...",Machine learning
4,125,"['wireless networking', 'algorithms', 'theory']",Algorithm design
...,...,...,...
10973,85241,"['Data mining', 'Software engineering']",Software engineering
10974,85254,"['Computer Graphics', 'Computer animation', 'V...",Computer graphics
10975,85265,"['Linguistics', 'Stylistics', 'Russian Languag...",Symbolic computation
10976,85273,"['Artificial Intelligence', 'Metaheuristic', '...",Computer programming


# Preprocessing


In [6]:
field_rule="------------------------------------------------"

# First "fields" entry as loaded, before any cleaning
for line in (field_rule, data["fields"][0], field_rule):
    print(line)

# Step 1: strip punctuation (the global counter used by Progress is reset first)
iterator=1
data['fields']=data['fields'].apply(remove_punc)

# Step 2: drop English stop words; this step also lower-cases the text
iterator=1
data['fields']=data['fields'].apply(remove_stopwords)

# Same entry after cleaning
for line in (field_rule, data["fields"][0], field_rule):
    print(line)

------------------------------------------------
['Statistical Genetics', 'Genetic Epidemiology', 'Genetics of Gene Expression', 'Variance Components Methods in Genetics', 'Multivariate L']
------------------------------------------------
------------------------------------------------
statistical genetics genetic epidemiology genetics gene expression variance components methods genetics multivariate l
------------------------------------------------


# Tokenizer

In [7]:
token_rule='--------------------------------------------------------------------------------------------------------'

# Count the distinct tokens across all cleaned "fields" strings
print(token_rule)
vocab=Counter()
freqdist=len(build_vocab(data["fields"]))
print("Number Frequency "+str(freqdist))
print(token_rule)

# Fit the Keras tokenizer and turn every document into an integer sequence
print(token_rule)
print("Tokenizing...")
# The tokenizer is capped at 200 fewer words than the counted vocabulary
tokenizer = Tokenizer(num_words=freqdist-200)
tokenizer.fit_on_texts(data['fields'])
sequences = tokenizer.texts_to_sequences(data['fields'])

# Word -> integer id mapping learned by the tokenizer
word_to_index = tokenizer.word_index

# Pad every sequence to the length of the longest one
maxlen = max(len(sequence) for sequence in sequences)
sequences_padded = pad_sequences(sequences,maxlen=maxlen)

# Report the first document as a sanity check
report = (
    ("Word index:\n", len(word_to_index)),
    ("\nSequences:", sequences[0]),
    ("\nSequences Length:", len(sequences[0])),
    ("\nPadded Sequences:", sequences_padded[0]),
    ("\nPadded Sequences Length:", len(sequences_padded[0])),
)
for label, value in report:
    print(label, value)
print(token_rule)

--------------------------------------------------------------------------------------------------------
Number Frequency 34674
--------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------
Tokenizing...
Word index:
 34629

Sequences: [178, 36, 349, 65, 36, 307, 788, 11036, 2797, 116, 36, 1414, 1071]

Sequences Length: 13

Padded Sequences: [    0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0   178    36   349    65
    36   307   788 11036  2797   116    36  1414  1071]

Padded Sequences Length: 33
--------------------------------------------------------------------------------------------------------


In [8]:
# Spot-check one record: cleaned text, the id of one of its words, and its padded sequence
sample_row=87
print(data["fields"][sample_row])
print(word_to_index["image"])
print(sequences_padded[sample_row])

medical image analysis informatics
50
[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0  55  50  15 286]


# Model

In [9]:
from sklearn.preprocessing import LabelBinarizer

# One binarizer per hierarchy level. Only the fitted `classes_` are used later
# (to map a predicted class index back to its name), so fit() is enough and the
# one-hot matrix that fit_transform() would build is never materialised.
# NOTE(review): the class order must match the one used when the saved models
# were trained - confirm the same label columns were used there.
parent_label=LabelBinarizer()
parent_label.fit(data["Domain"])

child_labels={}
for area in areas:
  # In the child tables the second-level label is stored in the "Domain" column.
  child_labels[area]=LabelBinarizer().fit(child_data[area]["Domain"])

# Prediction

In [20]:
from keras.saving import load_model
# Load the first-level model and classify a single record
parent_model=load_model("/content/drive/MyDrive/Parent_Science_Survey/parent_level_rnn_GoogleNews-vectors.keras")

# Wrap the one padded sequence in a batch of size one
test_predict=np.array([sequences_padded[87]])
prediction = parent_model.predict(test_predict)

# Highest-scoring class index -> class name of the parent binarizer
parent_label.classes_[np.argmax(prediction,axis=1)[0]]



'CS'

In [23]:
# Load the second-level model of the CS area and classify a single record
child_model=load_model("/content/drive/MyDrive/Science Survey/CS/child_level_rnn_GoogleNews-vectors.keras")

# Wrap the one padded sequence in a batch of size one
test_predict=np.array([sequences_padded[10]])
prediction = child_model.predict(test_predict)

# Highest-scoring class index -> class name of the CS binarizer
print(child_labels["CS"].classes_[np.argmax(prediction,axis=1)[0]])



Symbolic computation


# Evaluating Word Embedding

In [17]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from keras.saving import load_model
import joblib

# Evaluate every (parent model, child model, word embedding) combination:
#   1. the parent model assigns each record to a first-level domain,
#   2. the child model of that domain assigns the second-level area,
#   3. the predicted (Domain, area) path is scored against the true path.
parent_deep_learning_models=["rnn"]
child_deep_learning_models=["dnn","rnn"]
# word_embeddings=['glove.50d', 'glove.100d', 'glove.200d', 'glove.300d', 'GoogleNews-vectors', 'FastText']
word_embeddings=['GoogleNews-vectors']

result_columns=["first-level","second-level","word_embedding","tp","tn","fp","fn","accuracy","recall","precision","hP","hR"]
# One dict per evaluated combination; turned into a DataFrame once at the end.
result_rows=[]

# Label sets do not depend on the model being evaluated, so build them once
# instead of once per record.
parent_classes={label.strip() for label in parent_label.classes_}
child_classes={area:{label.strip() for label in child_labels[area].classes_} for area in areas}

# Ground truth paths, stripped once up front.
true_paths=data.loc[:,["Domain","area"]]
true_domains=[label.strip() for label in true_paths["Domain"]]
true_areas=[label.strip() for label in true_paths["area"]]

# Every path carries two labels (Domain, area). The hierarchical metrics are
# normalised by the total label count, derived from the data instead of a
# hard-coded row count (the old 46985 and 85284 constants disagreed, so the
# CSV and the printed values differed).
labels_per_path=2
total_path_labels=len(true_paths)*labels_per_path

for parent_deep_learning_model in parent_deep_learning_models:
  for word_embedding in word_embeddings:

    #parent-level predict
    #-------------------------------------------------------------------------------
    if parent_deep_learning_model=="dnn":

      parent_model=load_model("/content/drive/MyDrive/Data/parent_level_dnn.keras")
      # joblib.load unpickles the file: only load vectorizers you produced yourself.
      vocabulary = joblib.load("/content/drive/MyDrive/Data/vectorizer.pkl")
      cv=CountVectorizer(binary=False,ngram_range=(1,3),vocabulary=vocabulary)
      test_predict=cv.transform(data['fields'])

    else:

      parent_model=load_model(f"/content/drive/MyDrive/Parent_Science_Survey/parent_level_{parent_deep_learning_model}_{word_embedding}.keras")
      test_predict=np.asarray(sequences_padded)

    parent_predictions=np.argmax(parent_model.predict(test_predict),axis=1)
    parent_predictions=[parent_label.classes_[predict].strip() for predict in parent_predictions]
    #-------------------------------------------------------------------------------

    for child_deep_learning_model in child_deep_learning_models:
      predicted_paths=[]

      #child-level predict
      #-------------------------------------------------------------------------------
      for area in areas:

        # Records the parent model routed to this area.
        indexs=[index for index,predicted_domain in enumerate(parent_predictions) if predicted_domain==area]
        if not indexs:
          # Nothing was routed here; predicting on an empty batch would fail.
          continue

        if child_deep_learning_model=="dnn":
          child_model=load_model(f"/content/drive/MyDrive/Science Survey/{area}/child_level_dnn.keras")
          # joblib.load unpickles the file: only load vectorizers you produced yourself.
          vocabulary = joblib.load(f"/content/drive/MyDrive/Science Survey/{area}/vectorizer.pkl")
          cv=CountVectorizer(binary=False,ngram_range=(1,3),vocabulary=vocabulary)
          test_predict=cv.transform([data['fields'][index] for index in indexs])

        else:
          child_model=load_model(f"/content/drive/MyDrive/Science Survey/{area}/child_level_{child_deep_learning_model}_{word_embedding}.keras")
          test_predict=np.array([sequences_padded[index] for index in indexs])

        child_predictions=np.argmax(child_model.predict(test_predict),axis=1)
        child_predictions=[child_labels[area].classes_[predict].strip() for predict in child_predictions]

        predicted_paths.extend([index,area,child_prediction] for index,child_prediction in zip(indexs,child_predictions))

      #-------------------------------------------------------------------------------

      #Evaluating The Hierarchical Classification
      #-------------------------------------------------------------------------------
      predicted_paths=pd.DataFrame(predicted_paths,columns=["index","Domain","area"]).set_index("index").sort_index()

      # Line predictions up with the ground truth by record label; .loc raises
      # KeyError if some record never received a child-level prediction.
      aligned_predictions=predicted_paths.loc[true_paths.index]

      tp=0
      tn=0
      fp=0
      fn=0
      for true_domain,true_area,predicted_domain,predicted_area in zip(true_domains,true_areas,aligned_predictions["Domain"],aligned_predictions["area"]):
        #TN
        #------------------------------------------------------------------
        # First level: classes that are neither the true nor the predicted domain.
        tn=tn+len(parent_classes-{true_domain,predicted_domain})

        # Second level: areas rejected both in the true domain's label set and
        # in the predicted domain's label set.
        true_negatives=child_classes[true_domain]-{true_area}
        predicted_negatives=child_classes[predicted_domain]-{predicted_area}
        tn=tn+len(true_negatives & predicted_negatives)
        #--------------------------------------------------------------

        #TP / FN / FP over the labels of the two paths
        #------------------------------------------------------------------
        predicted_path={predicted_domain,predicted_area}
        true_path={true_domain,true_area}
        common_path=predicted_path & true_path

        tp=tp+len(common_path)
        fn=fn+len(true_path-common_path)
        fp=fp+len(predicted_path-common_path)
        #------------------------------------------------------------------
      accuracy=((tp+tn)/(tp+tn+fp+fn))*100
      recall=(tp)/(tp+fn)*100
      precision=(tp)/(tp+fp)*100

      # True and predicted paths both hold labels_per_path labels per record,
      # so hierarchical recall and precision share the same denominator.
      hR=(tp/total_path_labels)*100
      hP=(tp/total_path_labels)*100

      print("#---------------------------")
      print(parent_deep_learning_model,child_deep_learning_model,word_embedding)
      print("accuracy",accuracy)
      print("recall",recall)
      print("precision",precision)

      print("hirearchical recall",hR)
      print("hirearchical precision",hP)

      print("#---------------------------")

      result_rows.append({"first-level":parent_deep_learning_model,"second-level":child_deep_learning_model,"word_embedding":word_embedding,"tp":tp,"tn":tn,"fp":fp,"fn":fn,"accuracy":accuracy,"recall":recall,"precision":precision,"hP":hP,"hR":hR})
    #-------------------------------------------------------------------------------

# Build the results table once instead of concatenating inside the loop.
Excel_Output=pd.DataFrame(result_rows,columns=result_columns)
Excel_Output.to_csv("/content/drive/MyDrive/Evaluate_OUTPU.csv")

#---------------------------
rnn dnn GoogleNews-vectors
accuracy 98.70901732823283
recall 91.87010459171708
precision 91.87010459171708
hirearchical recall 91.87010459171708
hirearchical precision 91.87010459171708
#---------------------------
#---------------------------
rnn rnn GoogleNews-vectors
accuracy 98.27155068882885
recall 89.11519159514094
precision 89.11519159514094
hirearchical recall 89.11519159514094
hirearchical precision 89.11519159514094
#---------------------------
