<a href="https://colab.research.google.com/github/Ihsan1331/document-tagging/blob/main/Multilabel_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Load Packages

In [48]:
import tensorflow as tf #deep learning framework
import pandas as pd #load dataset
import ast #change string into list
from collections import Counter #to trace the amount of samples each class
import plotly.express as px #data visualization

# 2. Load Dataset

In [4]:
# Mount Google Drive into the Colab filesystem; the dataset CSV is read from this mount below.
from google.colab import drive
# force_remount is set so the mount is redone even when a previous mount exists in this session.
drive.mount('/content/drive/', force_remount = True)

Mounted at /content/drive/


In [117]:
# Read the arXiv CSV export from the mounted Drive folder into a DataFrame.
df = pd.read_csv(r'/content/drive/MyDrive/ML/Multilabel Classification/arxiv_data_210930-054931.csv')

# 3. Exploratory Data Analysis

In [6]:
# Preview the first rows to see which columns were loaded and what they contain.
df.head()

Unnamed: 0,terms,titles,abstracts
0,['cs.LG'],Multi-Level Attention Pooling for Graph Neural...,Graph neural networks (GNNs) have been widely ...
1,"['cs.LG', 'cs.AI']",Decision Forests vs. Deep Networks: Conceptual...,Deep networks and decision forests (such as ra...
2,"['cs.LG', 'cs.CR', 'stat.ML']",Power up! Robust Graph Convolutional Network v...,Graph convolutional networks (GCNs) are powerf...
3,"['cs.LG', 'cs.CR']",Releasing Graph Neural Networks with Different...,With the increasing popularity of Graph Neural...
4,['cs.LG'],Recurrence-Aware Long-Term Cognitive Network f...,Machine learning solutions for pattern classif...


In [7]:
# The frame has 3 columns: the class labels ('terms'), the paper titles and the abstracts.
df.columns

Index(['terms', 'titles', 'abstracts'], dtype='object')

In [8]:
# The dataset holds 56,181 samples (rows) and 3 columns.
df.shape

(56181, 3)

In [29]:
# Each 'terms' entry is stored as the string form of a list, so it has to be
# parsed back into a real list before the labels can be used.
sample_terms = df['terms'][2]
print(sample_terms)
type(sample_terms)

['cs.LG', 'cs.CR', 'stat.ML']


str

In [122]:
# Parse every 'terms' entry from its string form (e.g. "['cs.LG', 'cs.AI']") into a list.
# Entries that are no longer strings are passed through unchanged, so re-running this
# cell after the column has already been converted does not make ast.literal_eval raise.
df['terms'] = df['terms'].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

In [33]:
# Preview the frame again to confirm the 'terms' column now displays as lists.
df.head()

Unnamed: 0,terms,titles,abstracts
0,[cs.LG],Multi-Level Attention Pooling for Graph Neural...,Graph neural networks (GNNs) have been widely ...
1,"[cs.LG, cs.AI]",Decision Forests vs. Deep Networks: Conceptual...,Deep networks and decision forests (such as ra...
2,"[cs.LG, cs.CR, stat.ML]",Power up! Robust Graph Convolutional Network v...,Graph convolutional networks (GCNs) are powerf...
3,"[cs.LG, cs.CR]",Releasing Graph Neural Networks with Different...,With the increasing popularity of Graph Neural...
4,[cs.LG],Recurrence-Aware Long-Term Cognitive Network f...,Machine learning solutions for pattern classif...


In [125]:
# Collect every distinct class label in first-seen order.
# dict.fromkeys de-duplicates while preserving insertion order, which avoids
# re-scanning a growing list for every single label.
unique_class = list(dict.fromkeys(label for labels in df.terms for label in labels))

In [126]:
# The dataset contains a very large number of distinct classes: 1177 in total.
print(unique_class)
len(unique_class)

['cs.LG', 'cs.AI', 'cs.CR', 'stat.ML', 'cs.DC', 'cs.IT', 'math.IT', 'physics.data-an', 'cs.SI', 'cs.DS', '68T30', 'I.5.4', 'cs.CG', 'q-bio.QM', 'cs.CV', 'cs.CL', '68T05', 'cond-mat.dis-nn', 'cond-mat.stat-mech', 'cs.SD', 'eess.AS', 'cs.IR', 'I.2.6', 'cs.SY', 'eess.SY', '68T45 (Primary) 68T10, 68T07 (Secondary)', 'I.4.9; I.5.4; I.2.10', '68T07, 68T30, 68R99', 'I.2.0; I.2.4', 'cs.NA', 'math.NA', '68T07, 05C85, 42C40', 'I.2.4; I.2.6', 'math.ST', 'stat.TH', '62H30 (Primary) 54F45 (Secondary)', 'cs.DM', 'G.1.6; I.2.6', 'eess.SP', 'q-bio.NC', 'quant-ph', '68Txx, 81Pxx', 'I.2', 'cs.DB', 'math.AT', 'math.OC', 'eess.IV', 'cs.AR', 'cs.MM', 'physics.app-ph', 'physics.chem-ph', 'physics.optics', 'stat.CO', '62G08', 'cs.NI', 'stat.AP', 'cs.SE', 'cs.NE', 'econ.GN', 'q-fin.EC', 'cs.MS', 'math.GR', 'math.OA', 'cs.RO', '05C99, 62M45', 'G.2.2', 'stat.ME', 'hep-ex', 'cs.GR', 'math.MG', '42C40, 05C85, 11Y16', 'math.DS', 'math.PR', '62-07, 37H99', 'math.SP', 'math.FA', '37N99, 46E22, 47B32', 'astro-ph.IM',

1177

In [82]:
# Count how many samples carry each class label by iterating through df['terms'].
# The Counter keeps labels in first-seen order, which becomes the row order below.
label_counts = Counter(label for labels in df['terms'] for label in labels)
class_counter = pd.DataFrame(label_counts.items(), columns=['class', 'count'])

# Some class names are very long, so keep only their first 10 characters for plotting.
# Slicing leaves names shorter than 10 characters untouched, so no length check is needed.
class_counter['trunc'] = class_counter['class'].str[:10]

In [83]:
# Preview the per-class counts together with the truncated class names.
class_counter.head()

Unnamed: 0,class,count,trunc
0,cs.LG,30939,cs.LG
1,cs.AI,8390,cs.AI
2,cs.CR,739,cs.CR
3,stat.ML,16570,stat.ML
4,cs.DC,279,cs.DC


In [84]:
# Bar chart of the sample count per class; the x axis uses the 10-character truncated
# class names. A title and readable axis labels let the figure stand on its own.
fig = px.bar(
    class_counter,
    x='trunc',
    y='count',
    title='Number of samples per class',
    labels={'trunc': 'class (first 10 characters)', 'count': 'number of samples'},
)
fig.show()

In [88]:
# Observations noted from the bar chart: cs.CV is recorded as the largest class (~33k samples),
# while many classes have very few samples and do not even reach 1k.
# For now, keep only the classes that have at least 500 samples.
has_enough_samples = class_counter['count'] >= 500
class_cleaned = class_counter.loc[has_enough_samples]
class_cleaned.head()

Unnamed: 0,class,count,trunc
0,cs.LG,30939,cs.LG
1,cs.AI,8390,cs.AI
2,cs.CR,739,cs.CR
3,stat.ML,16570,stat.ML
8,cs.SI,684,cs.SI


In [107]:
# Only 14 classes have at least 500 samples.
class_cleaned.shape

(14, 3)

In [108]:
# The dataset will be restricted to these 14 classes, so pull their names into a plain list.
list_classes = class_cleaned['class'].tolist()
list_classes

['cs.LG',
 'cs.AI',
 'cs.CR',
 'stat.ML',
 'cs.SI',
 'cs.CV',
 'cs.CL',
 'eess.SP',
 'math.OC',
 'eess.IV',
 'cs.MM',
 'cs.NE',
 'cs.RO',
 'cs.GR']

In [132]:
# Keep only the labels that belong to list_classes.
# The lists are rebuilt instead of calling remove() while looping over them: removing an
# element during iteration shifts the remaining items, so the label right after a removed
# one is skipped and can survive the filter.
# Rows whose labels are all outside list_classes end up with an empty list.
allowed_classes = set(list_classes)
df['terms'] = df['terms'].apply(
    lambda labels: [label for label in labels if label in allowed_classes]
)

In [136]:
# After filtering, only the 14 retained classes remain in the dataset.
# Collect the distinct labels in first-seen order; dict.fromkeys de-duplicates while
# preserving insertion order and avoids re-scanning a growing list for every label.
unique_class_clean = list(dict.fromkeys(label for labels in df.terms for label in labels))
unique_class_clean

['cs.LG',
 'cs.AI',
 'cs.CR',
 'stat.ML',
 'cs.SI',
 'cs.CV',
 'cs.CL',
 'eess.SP',
 'math.OC',
 'eess.IV',
 'cs.MM',
 'cs.NE',
 'cs.RO',
 'cs.GR']

In [137]:
# Check whether removing classes introduced any missing values.
# NOTE(review): isna() only flags None/NaN cells; a row whose label list became empty
# after the filtering step is not counted here — confirm whether such rows need handling.
df.isna().sum()
# No missing values are reported.

terms        0
titles       0
abstracts    0
dtype: int64

In [146]:
# This is a multi-label problem, so the list-valued class column, e.g. ['class a', 'class b'],
# has to be expanded into one indicator column per class (similar to one-hot encoding).
# df_class is an empty DataFrame whose columns are the 14 retained classes.
df_class = pd.DataFrame([], columns = unique_class_clean)

# Concatenate column-wise so df_processed holds the original columns plus one
# (initially NaN) column per class.
df_processed = pd.concat([df,df_class], axis = 1)

df_processed.head()

Unnamed: 0,terms,titles,abstracts,cs.LG,cs.AI,cs.CR,stat.ML,cs.SI,cs.CV,cs.CL,eess.SP,math.OC,eess.IV,cs.MM,cs.NE,cs.RO,cs.GR
0,[cs.LG],Multi-Level Attention Pooling for Graph Neural...,Graph neural networks (GNNs) have been widely ...,,,,,,,,,,,,,,
1,"[cs.LG, cs.AI]",Decision Forests vs. Deep Networks: Conceptual...,Deep networks and decision forests (such as ra...,,,,,,,,,,,,,,
2,"[cs.LG, cs.CR, stat.ML]",Power up! Robust Graph Convolutional Network v...,Graph convolutional networks (GCNs) are powerf...,,,,,,,,,,,,,,
3,"[cs.LG, cs.CR]",Releasing Graph Neural Networks with Different...,With the increasing popularity of Graph Neural...,,,,,,,,,,,,,,
4,[cs.LG],Recurrence-Aware Long-Term Cognitive Network f...,Machine learning solutions for pattern classif...,,,,,,,,,,,,,,


In [147]:
# Walk through every sample and set the indicator column of each of its classes to 1.
# .at[row, column] writes straight into df_processed. The chained form
# df_processed[col][row] = 1 assigns into an intermediate Series, which raises
# SettingWithCopyWarning and has no effect when pandas Copy-on-Write is enabled.
for row_label, labels in df['terms'].items():
  for label in labels:
    df_processed.at[row_label, label] = 1

In [148]:
# Check whether the indicator columns were filled in successfully.
df_processed.head()

Unnamed: 0,terms,titles,abstracts,cs.LG,cs.AI,cs.CR,stat.ML,cs.SI,cs.CV,cs.CL,eess.SP,math.OC,eess.IV,cs.MM,cs.NE,cs.RO,cs.GR
0,[cs.LG],Multi-Level Attention Pooling for Graph Neural...,Graph neural networks (GNNs) have been widely ...,1,,,,,,,,,,,,,
1,"[cs.LG, cs.AI]",Decision Forests vs. Deep Networks: Conceptual...,Deep networks and decision forests (such as ra...,1,1.0,,,,,,,,,,,,
2,"[cs.LG, cs.CR, stat.ML]",Power up! Robust Graph Convolutional Network v...,Graph convolutional networks (GCNs) are powerf...,1,,1.0,1.0,,,,,,,,,,
3,"[cs.LG, cs.CR]",Releasing Graph Neural Networks with Different...,With the increasing popularity of Graph Neural...,1,,1.0,,,,,,,,,,,
4,[cs.LG],Recurrence-Aware Long-Term Cognitive Network f...,Machine learning solutions for pattern classif...,1,,,,,,,,,,,,,


In [149]:
# Replace the remaining NaN markers in the class columns with 0 and store them as integers.
# Only the class columns are filled, so a missing value in a text column would never be
# silently turned into 0, and the 0/1 indicators get a numeric dtype instead of object.
label_fill = {label: 0 for label in unique_class_clean}
label_dtype = {label: int for label in unique_class_clean}
df_processed = df_processed.fillna(label_fill).astype(label_dtype)
df_processed.head()

Unnamed: 0,terms,titles,abstracts,cs.LG,cs.AI,cs.CR,stat.ML,cs.SI,cs.CV,cs.CL,eess.SP,math.OC,eess.IV,cs.MM,cs.NE,cs.RO,cs.GR
0,[cs.LG],Multi-Level Attention Pooling for Graph Neural...,Graph neural networks (GNNs) have been widely ...,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,"[cs.LG, cs.AI]",Decision Forests vs. Deep Networks: Conceptual...,Deep networks and decision forests (such as ra...,1,1,0,0,0,0,0,0,0,0,0,0,0,0
2,"[cs.LG, cs.CR, stat.ML]",Power up! Robust Graph Convolutional Network v...,Graph convolutional networks (GCNs) are powerf...,1,0,1,1,0,0,0,0,0,0,0,0,0,0
3,"[cs.LG, cs.CR]",Releasing Graph Neural Networks with Different...,With the increasing popularity of Graph Neural...,1,0,1,0,0,0,0,0,0,0,0,0,0,0
4,[cs.LG],Recurrence-Aware Long-Term Cognitive Network f...,Machine learning solutions for pattern classif...,1,0,0,0,0,0,0,0,0,0,0,0,0,0
