## Data Preparation

In [1]:
import os
import pickle
from pprint import pprint
from os.path import join as JP

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict

from utils.nlp_utils import preproces
from utils.general import parse_yaml, ensure_directories

from scripts.catalog import (
    Catalog, Document, Corpus,
    load_catalog, load_corpus)

config = parse_yaml('config.yaml')
paths = config['paths']
ensure_directories(paths)

In [2]:
catalog = Catalog()

## Load Catalog

In [3]:
catalog = load_catalog(path=paths['catalog'],name='spacy_pipeline_on_US_corpus')
# catalog.documents[0].sp_doc

## Load Corpus into Catalog

In [54]:
corpus = load_corpus(path=paths['catalog'], name='only_en_countries')
catalog.load_corpus(corpus=corpus)
# catalog.save(path=paths['catalog'],name='test1_clean')

'[OK] Corpus loaded into Catalog'

### Filter down the catalog

In [4]:
OF_INTEREST = ['US'] # ['CA','AU']
filters = dict(
    topic = ['isocyanate'],
    country = OF_INTEREST,
    raw_text_len = 5000)

sub_catalog = catalog.filter_catalog(filters)
sub_catalog.documents = sub_catalog.documents[:15]
print('Catalog recuded from {} to {}'.format(
    len(catalog.documents), len(sub_catalog.documents)))

Catalog recuded from 15 to 15


### SPLIT CATALOG INTO THE TWO CATEGORIES

In [5]:
filters = dict(label='relevant')
pos_catalog = sub_catalog.filter_catalog(filters)

filters = dict(label='irrelevant')
neg_catalog = sub_catalog.filter_catalog(filters)

print('Positive documents: ',len(pos_catalog.documents))
print('Negative documents: ',len(neg_catalog.documents))

Positive documents:  15
Negative documents:  0


#### Working for the Positive Labels since this is only for testing

In [6]:
catalog = pos_catalog

## Spacy

In [7]:
import spacy
nlp = spacy.load('en_core_web_sm')
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS

In [8]:
document = catalog.documents[0]
document.clean_text[:50]

'Application published. OKAZOE, Takashi;Nagasaki, Y'

In [9]:
def spacy_cleaning(
    document,
    tags_to_keep=['JJ', 'NN', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
    entities_to_remove=['ORG,NORP,GPE,PERSON']):

    def pass_test(w, tags=tags_to_keep):
        if w.ent_type_ == 0:
                return w.tag_ in tags and not w.is_punct and not w.is_stop and w.ent_ not in entities_to_remove
        return w.tag_ in tags and not w.is_punct and not w.is_stop 

    words = [ word for word in document if pass_test(word)]
    tokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in words ]
    return ' '.join(tokens)

#### Apply intense cleaning and save Catalog instance

In [61]:
for d,doc in enumerate(catalog.documents):
    print('[INFO]: Parsing doc ',d)
    catalog.documents[d].processed_text = spacy_cleaning(nlp(doc.clean_text))
catalog.save(path=paths['catalog'],name='spacy_pipeline_on_US_corpus')

[INFO]: Parsing doc  0
[INFO]: Parsing doc  1
[INFO]: Parsing doc  2
[INFO]: Parsing doc  3
[INFO]: Parsing doc  4
[INFO]: Parsing doc  5
[INFO]: Parsing doc  6
[INFO]: Parsing doc  7
[INFO]: Parsing doc  8
[INFO]: Parsing doc  9
[INFO]: Parsing doc  10
[INFO]: Parsing doc  11
[INFO]: Parsing doc  12
[INFO]: Parsing doc  13
[INFO]: Parsing doc  14


In [10]:
catalog.documents[0].processed_text[:50]

'application publish asahi glas method produce carb'

### Collect the Corpus for Sklearn

In [11]:
corpus = catalog.collect_corpus(attr='processed_text', form=list)
document = corpus[0]
document[:500]

'application publish asahi glas method produce carbamate compound carbamate compound method produce isocyanate compound present invention relate method produce carbamate compound comprise react fluorine contain carbonic diester compound represent formula aromatic diamine compound represent formula catalyst produce carbamate compound represent formula method produce isocyanate compound represent formula carbamate compound catalyst represent fluorine contain represent divalent divalent divalent aro'

## TFIDF

In [12]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
EMBED_SIZE = 3000 # TODO: Increase

In [14]:
vectorizer = TfidfVectorizer(
    min_df=.1,
    max_df=.7,
    norm='l2',
    use_idf=True,
    smooth_idf=True,
    max_features=EMBED_SIZE,
    ngram_range=(1,3),
    lowercase=True,
    stop_words=stopwords.words('english'))

In [15]:
tfidf = pos_catalog.to_matrix(
    vectorizer=vectorizer,
    modelname='TFIDF',
    max_docs=None)
print(tfidf.representation.shape)
tfidf.representation.head()

(15, 3000)


Unnamed: 0,able,absence,absence catalyst,absolute,absolute pressure,accelerate,accompany,accord claim,accord claim compound,accord compound,...,weight step production,weight weight,wt,yield base,yield base addition,yield base example,yield base hexamethylene,yield carbamic,yield carbamic acid,zinc
0,0.0,0.147455,0.147455,0.0,0.0,0.009941,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.024851,0.0,0.0,0.0,0.0,0.0,0.004551
1,0.0,0.0,0.0,0.030103,0.003763,0.0,0.017735,0.010343,0.0,0.0,...,0.0,0.0,0.041129,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.003873,0.004278,0.002852,0.0,0.0,0.001538,0.00084,0.0,0.0,0.001273,...,0.003327,0.001538,0.000649,0.010766,0.004753,0.002852,0.004278,0.001901,0.001901,0.000352
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
def tfidf_to_dataframe(model): # Model
    return pd.DataFrame({
        "word": [ k for k,v in model.token2id.items() ],
        "idf":  [ model.mapping.idf_[v] \
                    for k,v in model.token2id.items()]
        }).sort_values("idf",ascending=False)

def get_most_relevant_terms(
    tfidf_df:pd.DataFrame,
    n_terms:int):
    ''' Return the first max_terms terms relevant by their IDF value '''
    if not isinstance(tfidf_df,pd.DataFrame):
        tfidf_df = tfidf_to_dataframe(tfidf_df)
    return tfidf_df.sort_values(
        by='idf', ascending=False).iloc[:n_terms,:]['word'].tolist()

In [17]:
idf_scores = tfidf_to_dataframe(tfidf)
idf_scores.head()

Unnamed: 0,word,idf
1500,contain carbon,2.673976
1914,inside thin,2.673976
1931,portion return,2.673976
1929,diisocyanate extract,2.673976
1928,addition solution,2.673976


In [19]:
terms = get_most_relevant_terms(idf_scores,n_terms=50)
terms[:10]

['contain carbon',
 'carbonate carbonate carbonate',
 'example reaction apparatus',
 'method present embodiment',
 'acid ester unsubstitute',
 'ester unsubstitute carbamic',
 'group example group',
 'carbon atom substitute',
 'primary amine represent',
 'stirrer place flask']

In [20]:
def kmean_clustering(
    model, #:Model
    num_clusters:int=4, 
    words_per_cluster:int=None):
    '''
    TODO: Consider MiniBatchKMeans
    
    Clusters using k-mean with k words per cluster
    ----------------------------------------------
        The k-words are the k closest to the centroid of that cluster
        Equivalently: the words are the ones most present in the 'fake'
        document represented by the centroid of the cluster

    Inputs:
    -------
        - model: Trained instance of class Model
        - num_clusters: Number of Clusters to look for
        - words_per_cluster: K parameter above

    Returns:
    -------- 
        - Dict key='cluster id', value=k_words_closest_to_centroid
    '''
    # 1. Performs K-Means algorithm to identify clusters
    km = KMeans(
        n_clusters=num_clusters,
        n_jobs=-1)
    km.fit_transform(model.representation)
    # clusters = km.labels_.tolist()

    # Bring K most similar words to centroid
    closests_words_to_centroids = km.cluster_centers_.argsort()[:, :-words_per_cluster:-1] 
    
    cluster_words = defaultdict(list)
    for i in range(num_clusters):
        for idx in closests_words_to_centroids[i, :words_per_cluster]:
            cluster_words[i].append(model.id2token[idx])
    return cluster_words

In [77]:
from sklearn.cluster import KMeans
km = KMeans(n_clusters=4)
clusters = km.fit(tfidf.representation)
clusters.cluster_centers_.shape

(4, 3000)

In [78]:
a = clusters.predict(tfidf.representation)

array([2, 1, 2, 3, 1, 0, 2, 2, 3, 2, 0, 2, 0, 1, 0], dtype=int32)

In [22]:
closests_words_to_centroids = km.cluster_centers_.argsort()[:, :-5:-1] 
closests_words_to_centroids

array([[1816, 1905,  691, 2625],
       [1574, 1483,  435,  275],
       [2098, 2964, 1751, 2088],
       [ 192, 2513, 2872, 2687]])

In [23]:
cluster_words = defaultdict(list)
for i in range(4):
    for idx in closests_words_to_centroids[i, :5]:
        cluster_words[i].append(tfidf.id2token[idx])

In [36]:
cluster_words

defaultdict(list,
            {0: ['mixer', 'nozzle', 'conduit', 'static'],
             1: ['isomer', 'include isomer', 'carbamic', 'atom'],
             2: ['polyisocyanate', 'weight', 'membrane', 'polyamide'],
             3: ['aniline', 'second', 'tube', 'stream']})

## What is the importance of each word within the cluster?

#### Only looking IDF

In [74]:
model = tfidf
scores = defaultdict(list)

'''
scores = dict(
    words = defaultdict(list),
    idf = defaultdict(list),
    maxtf_idf = defaultdict(list),
    avg_tf_idf = defaultdict(list),
    norm_tf_idf = defaultdict(list))
'''

def tf_idf_of_word(model,k,v):
    return model.representation[[k]].values, model.mapping.idf_[v]
    
for k,v in model.token2id.items():
    scores['words'].append(k)
    t,i = tf_idf_of_word(model,k,v)
    scores['idf'].append(i)
    scores['max_tf_idf'].append(np.max(t)*i)
    scores['avg_tf_idf'].append(np.max(t)*i)
    scores['norm_tf_idf'].append(np.linalg.norm(t)*i)

In [76]:
pd.DataFrame(scores).head()

Unnamed: 0,words,idf,max_tf_idf,avg_tf_idf,norm_tf_idf
0,carbamate,1.826679,0.540593,0.540593,0.868281
1,fluorine,2.673976,0.854299,0.854299,0.85437
2,carbonic,2.163151,0.272016,0.272016,0.296121
3,represent,1.470004,0.357015,0.357015,0.452279
4,formula,1.693147,0.220037,0.220037,0.356585


In [35]:
def tfidf_to_idf_dict(model):
    # lambda maxtf_idf: tf,idf = max(tf)*idf
    return {
        "word": [ k for k,v in model.token2id.items() ],
        "idf":  [ model.mapping.idf_[v] for k,v in model.token2id.items()]}


def tfidf_to_idf_scores(model): # Model
    d = tfidf_to_idf_dict(model)
    return pd.DataFrame().sort_values("idf",ascending=False)

In [34]:
idf_lookup = dict(tfidf_to_dataframe(tfidf).values)
for c,words in cluster_words.items():
    for word in words:
        print('C{}: {}:{}'.format(c,word,idf_lookup[word]))

C0: mixer:2.6739764335716716
C0: nozzle:2.386294361119891
C0: conduit:2.386294361119891
C0: static:2.386294361119891
C1: isomer:1.3746934494414107
C1: include isomer:2.6739764335716716
C1: carbamic:2.163150809805681
C1: atom:1.3746934494414107
C2: polyisocyanate:1.3746934494414107
C2: weight:1.3746934494414107
C2: membrane:2.163150809805681
C2: polyamide:2.6739764335716716
C3: aniline:1.826678573184468
C3: second:1.3746934494414107
C3: tube:1.5753641449035618
C3: stream:1.826678573184468


In [None]:
### SUBSAMBPLE BY CLSUTERS!!