In [1]:
!pip install -U kaleido

Collecting kaleido
  Downloading kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl (79.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.9/79.9 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: kaleido
Successfully installed kaleido-0.2.1


In [2]:
from google.colab import drive
# Mount Google Drive into the Colab runtime at /gdrive.
drive.mount('/gdrive')

# Root folder of the project on the mounted drive; the data and output
# paths defined in later cells are built from it.
project_folder = '/gdrive/MyDrive/ProjectTCNER'

Mounted at /gdrive


In [3]:
import os
import pandas as pd
import plotly.express as px

# Locations of the three DBLP text files inside the project's data folder.
data_folder = os.path.join(project_folder, "data")
trainset_file, testset_file, ground_truth_file = (
    os.path.join(data_folder, filename)
    for filename in (
        "DBLPTrainset.txt",
        "DBLPTestset.txt",
        "DBLPTestGroundTruth.txt",
    )
)

In [4]:
# Folder that receives the exported figures.
out_folder = os.path.join(project_folder, "out")
# Create it up front: the `write_image` calls in later cells write into this
# folder and fail if it does not exist yet. `exist_ok=True` makes re-running
# the cell a no-op.
os.makedirs(out_folder, exist_ok=True)

In [5]:
# Training split: tab-separated, no header row. The first field becomes the
# row index; the remaining fields are named `y` and `X`.
dataset_train = pd.read_table(
    trainset_file,
    sep="\t",
    header=None,
    names=['y', 'X'],
    index_col=0,
)
dataset_train

Unnamed: 0,y,X
0,ISCAS,Scalable Serial-parallel Multiplier over GF(2m...
1,SIGGRAPH,Plenoptic sampling.
2,ISCAS,Sensitivity and uniformity of a 0.18micrometer...
3,WWW,A survey of web archive search architectures.
4,ISCAS,Understanding dynamic behavior of mm-wave CML ...
...,...,...
21638,ISCAS,Decoding a Family of Dense Codes using the Sum...
21639,VLDB,CoHadoop: Flexible Data Placement and Its Expl...
21640,ISCAS,Full system simulation with QEMU: An approach ...
21641,INFOCOM,Localization in non-localizable sensor and ad-...


In [6]:
# Test split: tab-separated, no header row. The first field becomes the
# row index; the remaining field is named `X`.
dataset_test = pd.read_table(
    testset_file,
    sep="\t",
    header=None,
    names=['X'],
    index_col=0,
)
dataset_test

Unnamed: 0,X
0,Fast recursive adaptation for nonlinear filters.
1,High-Throughput Data Compressor Designs Using ...
2,Functional Verification of ECL Circuits Includ...
3,Efficient network generation under general pre...
4,Creating the earth as a backdrop in <i>Gravity...
...,...
3368,GEM: A Geometric Algorithm for Scheduling.
3369,On the geographic patterns of a large-scale mo...
3370,NScale: Neighborhood-centric Analytics on Larg...
3371,Sufficient Conditions for Finding Multiple Ope...


In [7]:
# Ground-truth labels for the test split: tab-separated, no header row.
# The first field becomes the row index; the remaining field is named `y`.
dataset_gt = pd.read_table(
    ground_truth_file,
    sep="\t",
    header=None,
    names=['y'],
    index_col=0,
)
dataset_gt

Unnamed: 0,y
0,ISCAS
1,ISCAS
2,ISCAS
3,WWW
4,SIGGRAPH
...,...
3368,ISCAS
3369,INFOCOM
3370,VLDB
3371,ISCAS


In [8]:
# Attach the ground-truth label to every test title by joining on the row index.
# Only the `X` column is taken from the left side so that re-running this cell
# does not merge an already-labelled frame again (which would produce
# suffixed `y_x` / `y_y` columns instead of a single `y`).
dataset_test = pd.merge(
    dataset_test[['X']],
    dataset_gt,
    left_index=True,
    right_index=True,
)
dataset_test

Unnamed: 0,X,y
0,Fast recursive adaptation for nonlinear filters.,ISCAS
1,High-Throughput Data Compressor Designs Using ...,ISCAS
2,Functional Verification of ECL Circuits Includ...,ISCAS
3,Efficient network generation under general pre...,WWW
4,Creating the earth as a backdrop in <i>Gravity...,SIGGRAPH
...,...,...
3368,GEM: A Geometric Algorithm for Scheduling.,ISCAS
3369,On the geographic patterns of a large-scale mo...,INFOCOM
3370,NScale: Neighborhood-centric Analytics on Larg...,VLDB
3371,Sufficient Conditions for Finding Multiple Ope...,ISCAS


In [None]:
# Stack the training rows on top of the labelled test rows; columns are
# aligned by name and the original row labels are kept.
dataset_whole = pd.concat(
    [dataset_train, dataset_test],
    axis=0,
)
dataset_whole

Unnamed: 0,y,X
0,ISCAS,Scalable Serial-parallel Multiplier over GF(2m...
1,SIGGRAPH,Plenoptic sampling.
2,ISCAS,Sensitivity and uniformity of a 0.18micrometer...
3,WWW,A survey of web archive search architectures.
4,ISCAS,Understanding dynamic behavior of mm-wave CML ...
...,...,...
3368,ISCAS,GEM: A Geometric Algorithm for Scheduling.
3369,INFOCOM,On the geographic patterns of a large-scale mo...
3370,VLDB,NScale: Neighborhood-centric Analytics on Larg...
3371,ISCAS,Sufficient Conditions for Finding Multiple Ope...


Check for missing values

In [None]:
# Per column: is there at least one missing value?
dataset_whole.isnull().any(axis=0)

y    False
X    False
dtype: bool

Inspect categories

In [9]:
# Histogram of the number of training titles per category (`y`).
cat_freq = px.histogram(
    dataset_train,
    x='y',
    template='simple_white',
    color_discrete_sequence=['#397d5a'],
)

# Centered upper-case title, shared font, and a fixed 1000x800 canvas.
cat_freq.update_layout(
    title=dict(
        text='Number of samples per category'.upper(),
        x=0.5,
        font=dict(size=30),
    ),
    font=dict(family='Barlow, sans-serif', size=16),
    yaxis_title='COUNT',
    xaxis_title='',
    width=1000,
    height=800,
)

# Order the categories by their total count, smallest first.
cat_freq.update_xaxes(categoryorder='total ascending')
cat_freq.show()

In [43]:
# Export the category histogram as a JPEG at twice the on-screen scale.
category_count_path = os.path.join(out_folder, 'category_count.jpg')
cat_freq.write_image(file=category_count_path, scale=2)

In [16]:
# Per-title length statistics in long format: one row per (title, statistic).
#   text_length -> number of characters in the title
#   text_words  -> number of whitespace-separated tokens in the title
# `.assign` works on a new frame, so `dataset_train` itself is left untouched
# (a plain `info_df = dataset_train` is only an alias, and adding columns
# through it would add them to the training frame as well).
info_df = dataset_train[['X', 'y']].assign(
    text_length=lambda frame: frame['X'].str.len(),
    text_words=lambda frame: frame['X'].str.split().str.len(),
)
# After melting, `type` holds the statistic name and `val` its value.
info_df = pd.melt(info_df, id_vars=['X', 'y'], var_name='type', value_name='val')

In [50]:
# Rows of the long-format table whose statistic value is at most 2,
# ordered by that value.
is_very_short = info_df['val'].le(2)
shortest_lengths = info_df[is_very_short].sort_values(by='val')
shortest_lengths

Unnamed: 0,X,y,type,val
33586,Oxygen.,SIGGRAPH,text_words,1
27975,Sponsors.,VLDB,text_words,1
27097,Wake-on-WLAN.,WWW,text_words,1
36793,Oneironaut.,SIGGRAPH,text_words,1
26996,Tengible.,SIGGRAPH,text_words,1
...,...,...,...,...
32245,Database Machines.,VLDB,text_words,2
32246,Cloud 21.,SIGGRAPH,text_words,2
32296,Schema Summarization.,VLDB,text_words,2
30878,Constraint SVG.,WWW,text_words,2


In [45]:
# Box plots of the length statistics per category, one facet row per
# statistic (`type`), with every individual point drawn.
lengths_distrib_plot = px.box(
    info_df,
    x='y',
    y='val',
    color='type',
    points='all',
    facet_row='type',
    template='gridon',
)

lengths_distrib_plot.update_layout(
    height=800,
    showlegend=False,
    font={'family': 'Barlow, sans-serif', 'size': 16},
    title={
        'text': 'Distribution of lengths per category',
        'x': 0.5,
        'font': {'size': 30},
    },
)

# Each facet gets an independent y-axis, drawn with a mirrored axis line.
lengths_distrib_plot.update_yaxes(matches=None, showline=True, mirror=True)

# Per-row axis title and visible range (the two statistics have different scales).
lengths_distrib_plot.update_yaxes(
    title='Number of characters', range=[0, 250], row=2,
)
lengths_distrib_plot.update_yaxes(
    title='Number of words', range=[0, 35], row=1,
)

# No x-axis title; same mirrored axis line as on the y-axes.
lengths_distrib_plot.update_xaxes(title='', showline=True, mirror=True)

# Blank out the facet annotations; the y-axis titles already label the rows.
lengths_distrib_plot.for_each_annotation(
    lambda annotation: annotation.update(text='')
)
lengths_distrib_plot.show()

In [46]:
# Export the length-distribution figure as a JPEG at twice the on-screen scale.
lengths_distrib_path = os.path.join(out_folder, 'lengths_distrib.jpg')
lengths_distrib_plot.write_image(file=lengths_distrib_path, scale=2)

In [None]:
# Peek at the first (index label, row) pair that `iterrows` yields.
first_label, first_row = next(info_df.iterrows())
(first_label, first_row)

(0,
 X       Scalable Serial-parallel Multiplier over GF(2m...
 y                                                   ISCAS
 type                                          text_length
 val                                                   102
 Name: 0, dtype: object)

In [None]:
from collections import Counter

# Count how often each whitespace-separated token occurs in the training titles.
# The titles are read from `dataset_train['X']`, not from `info_df`: the melted
# frame repeats every title once per `type` value, so counting from it would
# inflate every frequency by that factor.
word_freqs = Counter()
for text in dataset_train['X']:
    word_freqs.update(text.split())

# One row per token, ordered from rarest to most frequent.
word_freqs_df = (
    pd.DataFrame.from_dict(word_freqs, orient='index', columns=['count'])
    .sort_values(by='count')
)

In [None]:
# Horizontal bar chart of the 30 tokens at the low end of the sorted
# frequency table.
rarest_words = word_freqs_df.iloc[:30]
word_freq_plot_most_uncommon = px.bar(
    rarest_words,
    x='count',
    y=rarest_words.index,
    orientation='h',
)
word_freq_plot_most_uncommon.update_layout(
    margin={'l': 20, 'r': 20, 't': 20, 'b': 20},
    height=750,
)
word_freq_plot_most_uncommon.show()

In [None]:
# Horizontal bar chart of the 30 tokens at the high end of the sorted
# frequency table.
most_frequent_words = word_freqs_df.iloc[-30:]
word_freq_plot_most_common = px.bar(
    most_frequent_words,
    x='count',
    y=most_frequent_words.index,
    orientation='h',
)
word_freq_plot_most_common.update_layout(
    margin={'l': 20, 'r': 20, 't': 20, 'b': 20},
    height=750,
)
word_freq_plot_most_common.show()