# Reproduce results of trained CNN & CNN-Attention models 

In [2]:
import json
from collections import Counter
from sklearn.model_selection import train_test_split
import kashgari
from kashgari.tasks.classification import CNN_Attention_Model, CNN_Model

# Load the mixed-topic dataset annotated as opinion / non-opinion.
# Order matters: it matches the unpacking into store_x, store_y, test_x, test_y.
json_paths = (
    'data/3_in_1/cnn_x.json',
    'data/3_in_1/cnn_y.json',
    'data/3_in_1/testcnn_x.json',
    'data/3_in_1/testcnn_y.json',
)
loaded = []
for json_path in json_paths:
    with open(json_path, 'r', encoding='utf-8') as f:
        loaded.append(json.load(f))
store_x, store_y, test_x, test_y = loaded

# Split train & dev set.
# train_test_split returns (train, test) pairs; the names are deliberately
# swapped so that the large 95% "test" portion becomes the training set and
# the remaining 5% becomes the dev set.
valid_x, train_x, valid_y, train_y = train_test_split(
    store_x,
    store_y,
    test_size=0.95,
    random_state=42,
)

# Report the size of every split, then its label distribution.
for split_name, split_x in (('train', train_x), ('dev', valid_x), ('test', test_x)):
    print(f"{split_name} sample: {len(split_x)}")
for split_y in (train_y, valid_y, test_y):
    print(Counter(split_y))

# will spend about 2 minutes
load_CNN = CNN_Model.load_model('data/CNN_94_95_93')
# Last expression of the cell: the returned report is displayed as the cell output.
load_CNN.evaluate(test_x, test_y)

train sample: 19952
dev sample: 1050
test sample: 1105
Counter({'non-opinion': 11746, 'opinion': 8206})
Counter({'non-opinion': 612, 'opinion': 438})
Counter({'non-opinion': 642, 'opinion': 463})


2023-08-12 01:17:57,358 [DEBUG] kashgari - ------------------------------------------------
2023-08-12 01:17:57,358 [DEBUG] kashgari - Loaded gensim word2vec model's vocab
2023-08-12 01:17:57,359 [DEBUG] kashgari - model        : mat_embedding3/vectors.txt
2023-08-12 01:17:57,359 [DEBUG] kashgari - word count   : 529690
2023-08-12 01:17:57,360 [DEBUG] kashgari - Top 50 words : ['the', 'of', '.', ',', 'and', '<nUm>', 'in', 'a', 'to', ')', '(', 'with', '-', 'for', 'is', 'by', 'on', 'was', 'at', 'were', 'that', '–', 'as', 'are', 'from', '/', 'an', 'temperature', 'surface', 'using', 'high', 'which', 'C', '°', 'this', '%', 'In', 'it', 'A', '=', 'structure', 'properties', ':', 'phase', 'results', 'effect', 'these', 'than', 'based', 'different']
2023-08-12 01:17:57,361 [DEBUG] kashgari - ------------------------------------------------


              precision    recall  f1-score   support

 non-opinion     0.9446    0.9564    0.9505       642
     opinion     0.9385    0.9222    0.9303       463

    accuracy                         0.9421      1105
   macro avg     0.9415    0.9393    0.9404      1105
weighted avg     0.9420    0.9421    0.9420      1105



{'detail': {'non-opinion': {'precision': 0.9446153846153846,
   'recall': 0.956386292834891,
   'f1-score': 0.9504643962848297,
   'support': 642},
  'opinion': {'precision': 0.9384615384615385,
   'recall': 0.9222462203023758,
   'f1-score': 0.9302832244008714,
   'support': 463},
  'accuracy': 0.9420814479638009,
  'macro avg': {'precision': 0.9415384615384615,
   'recall': 0.9393162565686334,
   'f1-score': 0.9403738103428505,
   'support': 1105},
  'weighted avg': {'precision': 0.9420368952314654,
   'recall': 0.9420814479638009,
   'f1-score': 0.9420083939479313,
   'support': 1105}},
 'precision': 0.9420368952314654,
 'recall': 0.9420814479638009,
 'f1-score': 0.9420083939479313,
 'support': 1105}

In [3]:
# load mixed-topic dataset with opportunity(driver)/challenges(barrier) annotation
with open('data/3_in_1/cnnatt_x.json', 'r', encoding='utf-8') as f:
    x_data = json.load(f)
with open('data/3_in_1/cnnatt_y.json', 'r', encoding='utf-8') as f:
    y_data = json.load(f)
with open('data/3_in_1/testcnnatt_x.json', 'r', encoding='utf-8') as f:
    test_x = json.load(f)
with open('data/3_in_1/testcnnatt_y.json', 'r', encoding='utf-8') as f:
    test_y = json.load(f)

# add augmented data by SMOTE
# Read with explicit UTF-8 like the other JSON files of this cell; without it
# open() falls back to the platform's locale encoding.
with open('data/3_in_1/augmented_barrier.json', 'r', encoding='utf-8') as file_obj:
    barrier = json.load(file_obj)
# Every augmented sample is labelled 'barrier'; x_data and y_data stay aligned.
x_data.extend(barrier)
y_data.extend(['barrier'] * len(barrier))

# split train & dev set
# The names are deliberately swapped: the 91% "test" portion returned by
# train_test_split is used as the training set, the remaining 9% as dev set.
# NOTE(review): the augmented samples are added before the split, so some of
# them can end up in the dev set; the test set is loaded separately above.
valid_x, train_x, valid_y, train_y = train_test_split(x_data, y_data, stratify=y_data, test_size=0.91, random_state=42)

print(f"train sample: {len(train_x)}")
print(f"dev sample: {len(valid_x)}")
print(f"test sample: {len(test_x)}")
print(Counter(train_y))
print(Counter(valid_y))
print(Counter(test_y))

load_CNN_Attention = CNN_Attention_Model.load_model('data/CNN_Attention_91')
# Last expression of the cell: the returned report is displayed as the cell output.
load_CNN_Attention.evaluate(test_x, test_y)

train sample: 9224
dev sample: 912
test sample: 950
Counter({'driver': 6337, 'barrier': 2887})
Counter({'driver': 627, 'barrier': 285})
Counter({'driver': 774, 'barrier': 176})


2023-08-12 01:19:43,796 [DEBUG] kashgari - ------------------------------------------------
2023-08-12 01:19:43,797 [DEBUG] kashgari - Loaded gensim word2vec model's vocab
2023-08-12 01:19:43,797 [DEBUG] kashgari - model        : mat_embedding3/vectors.txt
2023-08-12 01:19:43,798 [DEBUG] kashgari - word count   : 529690
2023-08-12 01:19:43,799 [DEBUG] kashgari - Top 50 words : ['the', 'of', '.', ',', 'and', '<nUm>', 'in', 'a', 'to', ')', '(', 'with', '-', 'for', 'is', 'by', 'on', 'was', 'at', 'were', 'that', '–', 'as', 'are', 'from', '/', 'an', 'temperature', 'surface', 'using', 'high', 'which', 'C', '°', 'this', '%', 'In', 'it', 'A', '=', 'structure', 'properties', ':', 'phase', 'results', 'effect', 'these', 'than', 'based', 'different']
2023-08-12 01:19:43,799 [DEBUG] kashgari - ------------------------------------------------


              precision    recall  f1-score   support

     barrier     0.7727    0.7727    0.7727       176
      driver     0.9483    0.9483    0.9483       774

    accuracy                         0.9158       950
   macro avg     0.8605    0.8605    0.8605       950
weighted avg     0.9158    0.9158    0.9158       950



{'detail': {'barrier': {'precision': 0.7727272727272727,
   'recall': 0.7727272727272727,
   'f1-score': 0.7727272727272727,
   'support': 176},
  'driver': {'precision': 0.9483204134366925,
   'recall': 0.9483204134366925,
   'f1-score': 0.9483204134366925,
   'support': 774},
  'accuracy': 0.9157894736842105,
  'macro avg': {'precision': 0.8605238430819826,
   'recall': 0.8605238430819826,
   'f1-score': 0.8605238430819826,
   'support': 950},
  'weighted avg': {'precision': 0.9157894736842105,
   'recall': 0.9157894736842105,
   'f1-score': 0.9157894736842105,
   'support': 950}},
 'precision': 0.9157894736842105,
 'recall': 0.9157894736842105,
 'f1-score': 0.9157894736842105,
 'support': 950}

# Apply opinion mining to plain text

In [4]:
# load a tiny text sample with opinions
# Explicit UTF-8 (as used for the JSON data in this notebook) instead of the
# platform's locale encoding.
# NOTE(review): assumes data/text.txt is stored as UTF-8 - confirm.
with open('data/text.txt', 'r', encoding='utf-8') as f:
    text = f.read()
# Join all lines into a single line so that line breaks do not split sentences.
text = text.replace('\n', ' ')

In [5]:
import nltk # if no nltk, use split('. ')

# 1. segmentation + tokenization
def sen_seg(data):
    """Split raw text into sentences and tokenize each sentence.

    Abbreviations that end in a period (e.g. 'et al. ', 'Fig. ') are masked
    with a placeholder before sentence splitting and restored afterwards, so
    that their period followed by a space is not presented to the sentence
    splitter.

    Parameters
    ----------
    data : str
        The text to segment.

    Returns
    -------
    list of list of str
        One token list per sentence, in the order the sentences appear.

    Notes
    -----
    Without nltk, `data.split('. ')` and `sentence.split(' ')` are cruder
    substitutes for the two nltk calls.
    """
    abbreviations = ['et al. ', 'Fig. ', 'e.g. ', 'i.e. ', 'Ref. ', 'Figs. ', ' ca. ', 'approx. ', '(ca. ', 'etc.) ']
    marker = '####@'

    # Mask: the last two characters of each abbreviation are swapped for the marker.
    masked = data
    for abbr in abbreviations:
        masked = masked.replace(abbr, abbr[:-2] + marker)

    def _unmask(sentence):
        # Undo the masking, visiting the abbreviations in the same order.
        for abbr in abbreviations:
            sentence = sentence.replace(abbr[:-2] + marker, abbr)
        return sentence

    restored = [_unmask(sentence) for sentence in nltk.sent_tokenize(masked)]
    return [nltk.word_tokenize(sentence) for sentence in restored]

# Segment the loaded text; `sens` holds one token list per sentence.
sens = sen_seg(text)
# Show each tokenized sentence on its own line.
for sentence_tokens in sens:
    print(sentence_tokens)

['Owing', 'to', 'the', 'importance', 'of', 'surface', 'passivation', 'to', 'BSi', ',', 'major', 'passivation', 'techniques', 'using', 'SiNx', ',', 'thermal', 'oxide', ',', 'Al2O3', 'and', 'a-Si', 'have', 'been', 'critically', 'examined', '.']
['It', 'is', 'found', 'that', 'atomic', 'layer', 'deposited', 'Al2O3', 'offers', 'excellent', 'surface', 'conformality', 'and', 'passivation', 'to', 'the', 'silicon', 'surface', ',', 'especially', 'on', 'p+-emitters', '.']
['With', 'ALD', 'Al2O3', 'passivation', ',', 'a', 'record', 'high', '18.7', '%', 'efficient', 'BSi', 'solar', 'cell', 'has', 'been', 'successfully', 'fabricated', '.']
['As', 'the', 'market', 'share', 'of', 'n-type', 'solar', 'cells', '(', 'with', 'p+-emitters', ')', 'is', 'expected', 'to', 'rise', 'in', 'the', 'near', 'future', ',', 'this', 'passivation', 'technique', 'is', 'particularly', 'attractive', 'and', 'may', 'become', 'a', 'new', 'industry', 'standard', '.']
['Unfortunately', ',', 'the', 'poor', 'capacity', 'stability'

In [7]:
# 2. apply opinion extraction
result = load_CNN.predict(sens)
# Keep the sentences labelled 'opinion'; result[idx] is read as the label
# predicted for sens[idx].
find_opinion = [tokens for idx, tokens in enumerate(sens) if result[idx] == 'opinion']

print(len(find_opinion), 'opinions are found by CNN')
print(find_opinion)

7 opinions are found by CNN
[['It', 'is', 'found', 'that', 'atomic', 'layer', 'deposited', 'Al2O3', 'offers', 'excellent', 'surface', 'conformality', 'and', 'passivation', 'to', 'the', 'silicon', 'surface', ',', 'especially', 'on', 'p+-emitters', '.'], ['As', 'the', 'market', 'share', 'of', 'n-type', 'solar', 'cells', '(', 'with', 'p+-emitters', ')', 'is', 'expected', 'to', 'rise', 'in', 'the', 'near', 'future', ',', 'this', 'passivation', 'technique', 'is', 'particularly', 'attractive', 'and', 'may', 'become', 'a', 'new', 'industry', 'standard', '.'], ['Unfortunately', ',', 'the', 'poor', 'capacity', 'stability', ',', 'especially', 'at', 'elevated', 'temperature', ',', 'hinders', 'its', 'practical', 'utilization', '.'], ['However', ',', 'the', 'shuttling', 'effect', 'caused', 'by', 'the', 'dissolution', 'of', 'polysulfides', 'seriously', 'degrades', 'their', 'electrochemical', 'performance', '.'], ['Numerous', 'Li-ion', 'conducting', 'solids', 'are', 'known', 'today', ',', 'however', 

## Optional corpus comparison method to add/remove opinions to reach a certain ratio
### Note: If you don't need this part, just skip it and run the next cell

In [42]:
# optional function: corpus comparison to adjust number of opinions
def rank_opinion_by_lexicon(sens, lexicon_file):
    """Rank tokenized sentences by the number of opinion cues they contain.

    Each token adds 1 to its sentence's score when it (or its lowercase form)
    is listed in the lexicon, and 2 more when its lowercase form is a modal
    verb. Sentences are returned from highest to lowest score; sentences with
    equal scores keep their original relative order.

    Parameters
    ----------
    sens : list of list of str
        Tokenized sentences.
    lexicon_file : str
        Path to a UTF-8 JSON file holding the lexicon words.

    Returns
    -------
    list of list of str
        The same sentence objects as in `sens`, reordered by descending score.

    Raises
    ------
    OSError
        If `lexicon_file` cannot be opened.
    json.JSONDecodeError
        If `lexicon_file` is not valid JSON.
    """
    modal_verbs = frozenset(['could', 'may', 'would', 'must', 'might', 'shall', 'ought', 'can'])
    # Explicit UTF-8, consistent with the other JSON files read in this
    # notebook, instead of the platform's locale encoding.
    with open(lexicon_file, 'r', encoding='utf-8') as file_obj:
        pri_list = json.load(file_obj)
    # Tokens are strings, so only string entries can ever match; a set gives
    # constant-time lookups instead of scanning the whole lexicon per token.
    lexicon = {entry for entry in pri_list if isinstance(entry, str)}

    def _score(tokens):
        # Score of one sentence: +1 per lexicon hit, +2 per modal verb.
        total = 0
        for token in tokens:
            lowered = token.lower()
            if lowered in lexicon or token in lexicon:
                total += 1
            if lowered in modal_verbs:
                total += 2
        return total

    scores = [_score(tokens) for tokens in sens]
    # sorted() is stable, also with reverse=True, so ties keep input order.
    order = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)
    return [sens[idx] for idx in order]


RATE = 0.25 # a threshold which can be set by user, if whole page, 0.1-0.2 is rather suitable
support_opinion = rank_opinion_by_lexicon(sens, 'data/final_200.json')
# Target number of opinions: RATE of all sentences, rounded down (may be 0).
need_length = int(RATE * len(sens))
if len(find_opinion) >= need_length:
    print('Too many opinions, so rank & remove low score opinion')
    # Keep the need_length best-ranked sentences among those found by the CNN.
    # Slicing also honours need_length == 0, where a "stop once the list has
    # need_length items" check would never trigger and nothing would be removed.
    adjusted_opinion = [so for so in support_opinion if so in find_opinion][:need_length]
else:
    print('Too few opinions, add high score candidate')
    # Top up with the best-ranked sentences that the CNN did not select.
    rest = [x for x in support_opinion if x not in find_opinion]
    adjusted_opinion = find_opinion + rest[:need_length-len(find_opinion)]

# NOTE: this overwrites find_opinion, so running this cell again works on the
# already adjusted list; re-run the opinion extraction cell first to start over.
find_opinion = adjusted_opinion

print(len(find_opinion), 'opinions found in total by', RATE, '*', str(len(sens)))
print(find_opinion)

Too many opinions, so rank & remove low score opinion
1 opinions found in total by 0.25 * 4
[['As', 'the', 'market', 'share', 'of', 'n-type', 'solar', 'cells', '(', 'with', 'p+-emitters', ')', 'is', 'expected', 'to', 'rise', 'in', 'the', 'near', 'future', ',', 'this', 'passivation', 'technique', 'is', 'particularly', 'attractive', 'and', 'may', 'become', 'a', 'new', 'industry', 'standard', '.']]


In [8]:
# 3. apply opinion classification
find_opps = []
find_chas = []
# Skip the model call when there is nothing to classify; how predict() treats
# an empty input is not shown in this notebook, so it is not relied upon.
result = load_CNN_Attention.predict(find_opinion) if find_opinion else []
# result[i] is read as the label of find_opinion[i]: 'driver' means
# opportunity, any other label is counted as a challenge.
for i, tx in enumerate(find_opinion):
    if result[i] == 'driver':
        find_opps.append(tx)
    else:
        find_chas.append(tx)

print(len(find_opps), 'opportunities are found by CNN-Attention.')
for fo in find_opps:
    print(' '.join(fo))
print('\n')
print(len(find_chas), 'challenges are found by CNN-Attention.')
for fc in find_chas:
    print(' '.join(fc))

3 opportunities are found by CNN-Attention.
It is found that atomic layer deposited Al2O3 offers excellent surface conformality and passivation to the silicon surface , especially on p+-emitters .
As the market share of n-type solar cells ( with p+-emitters ) is expected to rise in the near future , this passivation technique is particularly attractive and may become a new industry standard .
This synergistic control of nano-/macro-structures is a promising concept for enhancing battery performance and its cycle life .


4 challenges are found by CNN-Attention.
Unfortunately , the poor capacity stability , especially at elevated temperature , hinders its practical utilization .
However , the shuttling effect caused by the dissolution of polysulfides seriously degrades their electrochemical performance .
Numerous Li-ion conducting solids are known today , however the stability of most of these is too low to engender widespread usage .
Lithium transition metal oxides are prevalent cathod

## In the real ALD case, these sentences were found in the SSNet results. We identified Li in each sentence and traced its information in the metadata records, as shown below:

"Unfortunately , the poor capacity stability , especially at elevated temperature , hinders its practical utilization .": {
        "time": "2018",
        "id": "LITHIUM-ION BATTERIES; CATHODE MATERIALS; SURFACE MODIFICATION; CYCLING",
        "mat": [
            "lithium"
        ],
        "attitude": "barrier",
        "doi": "10.1021/acssuschemeng.8b01081"
    },

"However , the shuttling effect caused by the dissolution of polysulfides seriously degrades their electrochemical performance .": {
        "time": "2018",
        "id": "ATOMIC LAYER DEPOSITION; LI-S BATTERIES; CARBON NANOTUBES;",
        "mat": [
            "Li"
        ],
        "attitude": "barrier",
        "doi": "10.1007/s10008-017-3818-6"
    },
    
"Numerous Li-ion conducting solids are known today , however the stability of most of these is too low to engender widespread usage .": {
        "time": "2018",
        "id": "ATOMIC LAYER DEPOSITION; SOLID-STATE BATTERIES; IONIC-CONDUCTIVITY;",
        "mat": [
            "Li"
        ],
        "attitude": "barrier",
        "doi": "10.1039/c7ta07928a"
    },
    
"Lithium transition metal oxides are prevalent cathode materials currently , but they face great challenges due to unsatisfactory energy density , chemical/electrochemical instability , and elemental scarcity concerns .": {
        "time": "2018",
        "id": "ATOMIC LAYER DEPOSITION; LI-ION BATTERIES; HIGH-PERFORMANCE CATHODE;",
        "mat": [
            "Lithium transition metal oxides"
        ],
        "attitude": "barrier",
        "doi": "10.1002/aenm.201802057"
    },
    
"This synergistic control of nano-/macro-structures is a promising concept for enhancing battery performance and its cycle life .": {
        "time": "2019",
        "id": "CATHODE MATERIAL; CAPACITY; LINI0.5MN1.5O4; MECHANISM; SHAPE",
        "mat": [
            "LiMn1.5Ni0.5O4"
        ],
        "attitude": "driver",
        "doi": "10.1002/batt.201800091"
    }