In [2]:
!unzip '/content/drive/MyDrive/NLP/nips-papers.zip'

Archive:  /content/drive/MyDrive/NLP/nips-papers.zip
  inflating: authors.csv             
  inflating: database.sqlite         
  inflating: paper_authors.csv       
  inflating: papers.csv              


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing
df = pd.read_csv('/content/papers.csv')

# **Exploring the dataset**

In [6]:
df.head()

Unnamed: 0,id,year,title,event_type,pdf_name,abstract,paper_text
0,1,1987,Self-Organization of Associative Database and ...,,1-self-organization-of-associative-database-an...,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...
1,10,1987,A Mean Field Theory of Layer IV of Visual Cort...,,10-a-mean-field-theory-of-layer-iv-of-visual-c...,Abstract Missing,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...
2,100,1988,Storing Covariance by the Associative Long-Ter...,,100-storing-covariance-by-the-associative-long...,Abstract Missing,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...
3,1000,1994,Bayesian Query Construction for Neural Network...,,1000-bayesian-query-construction-for-neural-ne...,Abstract Missing,Bayesian Query Construction for Neural\nNetwor...
4,1001,1994,"Neural Network Ensembles, Cross Validation, an...",,1001-neural-network-ensembles-cross-validation...,Abstract Missing,"Neural Network Ensembles, Cross\nValidation, a..."


In [7]:
df.shape

(7241, 7)

In [8]:
df['id']

0          1
1         10
2        100
3       1000
4       1001
        ... 
7236     994
7237     996
7238     997
7239     998
7240     999
Name: id, Length: 7241, dtype: int64

In [9]:
df['id'][0]

1

In [10]:
df['paper_text'][0]

'767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABASE\nAND ITS APPLICATIONS\nHisashi Suzuki and Suguru Arimoto\nOsaka University, Toyonaka, Osaka 560, Japan\nABSTRACT\nAn efficient method of self-organizing associative databases is proposed together with\napplications to robot eyesight systems. The proposed databases can associate any input\nwith some output. In the first half part of discussion, an algorithm of self-organization is\nproposed. From an aspect of hardware, it produces a new style of neural network. In the\nlatter half part, an applicability to handwritten letter recognition and that to an autonomous\nmobile robot system are demonstrated.\n\nINTRODUCTION\nLet a mapping f : X -+ Y be given. Here, X is a finite or infinite set, and Y is another\nfinite or infinite set. A learning machine observes any set of pairs (x, y) sampled randomly\nfrom X x Y. (X x Y means the Cartesian product of X and Y.) And, it computes some\nestimate j : X -+ Y of f to make small, the estimation erro

In [11]:
df['abstract'][0]

'Abstract Missing'

In [12]:
df['title'][0]

'Self-Organization of Associative Database and Its Applications'

# **Preprocessing the textual data**

In [16]:
import re
import nltk
# from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
# Creating a list of custom stopwords
new_words = ['fig', 'figure', 'image', 'sample', 'using', 'show', 'result', 'large', 'also', 'one', 'two',
             'three', 'four', 'five', 'seven', 'eight', 'nine']
stop_words = list(stop_words.union(new_words))

def pre_process(text):
    # Lowercase
    text = text.lower()

    # Remove tags
    text = re.sub('&lt;/?.*?&gt;', ' &lt;&gt; ', text)

    # Remove special characters and digits
    text = re.sub('(\\d|\\W)+', ' ', text)

    # Convert into list
    text = text.split()

    # Remove stopwords
    text = [w for w in text if w not in stop_words]

    # Remove words less than three letters
    text = [w for w in text if len(w) >= 3]

    # Lemmatize
    lmtzr = WordNetLemmatizer()
    text = [lmtzr.lemmatize(w) for w in text]

    return ' '.join(text)
df['paper_text_preprocessed'] = df['paper_text'].apply(lambda x: pre_process(x))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
df['paper_text'].shape

(7241,)

In [18]:
df['paper_text_preprocessed'].shape

(7241,)

In [19]:
df.head()

Unnamed: 0,id,year,title,event_type,pdf_name,abstract,paper_text,paper_text_preprocessed
0,1,1987,Self-Organization of Associative Database and ...,,1-self-organization-of-associative-database-an...,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...,self organization associative database applica...
1,10,1987,A Mean Field Theory of Layer IV of Visual Cort...,,10-a-mean-field-theory-of-layer-iv-of-visual-c...,Abstract Missing,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...,mean field theory layer visual cortex applicat...
2,100,1988,Storing Covariance by the Associative Long-Ter...,,100-storing-covariance-by-the-associative-long...,Abstract Missing,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...,storing covariance associative long term poten...
3,1000,1994,Bayesian Query Construction for Neural Network...,,1000-bayesian-query-construction-for-neural-ne...,Abstract Missing,Bayesian Query Construction for Neural\nNetwor...,bayesian query construction neural network mod...
4,1001,1994,"Neural Network Ensembles, Cross Validation, an...",,1001-neural-network-ensembles-cross-validation...,Abstract Missing,"Neural Network Ensembles, Cross\nValidation, a...",neural network ensemble cross validation activ...


In [5]:
df['paper_text'][0]

'767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABASE\nAND ITS APPLICATIONS\nHisashi Suzuki and Suguru Arimoto\nOsaka University, Toyonaka, Osaka 560, Japan\nABSTRACT\nAn efficient method of self-organizing associative databases is proposed together with\napplications to robot eyesight systems. The proposed databases can associate any input\nwith some output. In the first half part of discussion, an algorithm of self-organization is\nproposed. From an aspect of hardware, it produces a new style of neural network. In the\nlatter half part, an applicability to handwritten letter recognition and that to an autonomous\nmobile robot system are demonstrated.\n\nINTRODUCTION\nLet a mapping f : X -+ Y be given. Here, X is a finite or infinite set, and Y is another\nfinite or infinite set. A learning machine observes any set of pairs (x, y) sampled randomly\nfrom X x Y. (X x Y means the Cartesian product of X and Y.) And, it computes some\nestimate j : X -+ Y of f to make small, the estimation erro

In [20]:
df['paper_text_preprocessed'][0]

'self organization associative database application hisashi suzuki suguru arimoto osaka university toyonaka osaka japan abstract efficient method self organizing associative database proposed together application robot eyesight system proposed database associate input output first half part discussion algorithm self organization proposed aspect hardware produce new style neural network latter half part applicability handwritten letter recognition autonomous mobile robot system demonstrated introduction let mapping given finite infinite set another finite infinite set learning machine observes set pair sampled randomly mean cartesian product computes estimate make small estimation error measure usually say faster decrease estimation error increase number sample better learning machine however expression performance incomplete since lack consideration candidate assumed preliminarily find good learning machine clarify conception let discus type learning machine let advance understanding s

# **generate the word count vector**

In [23]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_df=0.95,        # ignore words that appear in 95% of documents
                     max_features=10000, # the size of the vocabulary
                     ngram_range=(1,3)    # vocabulary contains single words, bigrams, trigrams
                     ) 
word_count_vector = cv.fit_transform(df['paper_text_preprocessed'])

# **calculate the reverse frequency of documents**

In [24]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer(smooth_idf = True, use_idf=True)
tfidf_transformer.fit(word_count_vector)

TfidfTransformer()

In [39]:
def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    # get the feature names and tf-idf score of the top n items
    # use only topn items from vector
    sorted_items = sorted_items[:topn]

    score_vals = []
    feature_vals = []

    for idx, score in sorted_items:
        fname = feature_names[idx]
        # keep track of feature name and its corresponding score
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])

    # create a tuple of feature,score
    # results = zip(feature_vals, score_vals)
    results = {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]] = score_vals[idx]
    return results

    # get feature names
feature_names = cv.get_feature_names_out()

def get_keywords(idx, docs):
        # generate tf-idf for the given document
    tf_idf_vector = tfidf_transformer.transform(cv.transform([df['paper_text_preprocessed'][idx]]))

        # sort the tf-idf vectors by descending order of scores
    sorted_items = sort_coo(tf_idf_vector.tocoo())

        # extract only the top n; n here is 10
    keywords = extract_topn_from_vector(feature_names, sorted_items, 10)

    return keywords

def print_results(idx, keywords, df):
    t = df['title'][idx]
    a = df['abstract'][idx]
    c = [k for k in keywords]
    print(f'\n===Title===:\n{t}\n===Abstract===:\n{a}\n===Keywords===:\n{c}')

idx = 941
keywords = get_keywords(idx, df['paper_text_preprocessed'])
print_results(idx, keywords, df)


===Title===:
Algorithms for Non-negative Matrix Factorization
===Abstract===:
Non-negative matrix factorization (NMF) has previously been shown to 
be a useful decomposition for multivariate data. Two different multi- 
plicative algorithms for NMF are analyzed. They differ only slightly in 
the multiplicative factor used in the update rules. One algorithm can be 
shown to minimize the conventional least squares error while the other 
minimizes the generalized Kullback-Leibler divergence. The monotonic 
convergence of both algorithms can be proven using an auxiliary func- 
tion analogous to that used for proving convergence of the Expectation- 
Maximization algorithm. The algorithms can also be interpreted as diag- 
onally rescaled gradient descent, where the rescaling factor is optimally 
chosen to ensure convergence. 
===Keywords===:
['update rule', 'update', 'auxiliary', 'non negative matrix', 'negative matrix', 'rule', 'nmf', 'multiplicative', 'matrix factorization', 'mat