In [1]:
import numpy as np
import pandas as pd
from groq import Groq
import os
import time

In [2]:
df = pd.read_excel('../../data/OMIEC_07_08_24.xls')

In [3]:
df['Abstract']

0      Thin films of organic mixed ionic electronic c...
1      n-Type organic electrochemical transistors (OE...
2      Mixed ionic-electronic conductors, such as pol...
3      The field of organic mixed ionic-electronic co...
4      The conversion of electrochemical processes in...
                             ...                        
870                                                  NaN
871                                                  NaN
872                                                  NaN
873    Three cases of familial benign chronic pemphig...
874    An in vitro technique was developed for the is...
Name: Abstract, Length: 875, dtype: object

In [11]:
GROQ_API_KEY = 'GROQ_API_KEY'

In [12]:
df.columns

Index(['Publication Type', 'Authors', 'Book Authors', 'Book Editors',
       'Book Group Authors', 'Author Full Names', 'Book Author Full Names',
       'Group Authors', 'Article Title', 'Source Title', 'Book Series Title',
       'Book Series Subtitle', 'Language', 'Document Type', 'Conference Title',
       'Conference Date', 'Conference Location', 'Conference Sponsor',
       'Conference Host', 'Author Keywords', 'Keywords Plus', 'Abstract',
       'Addresses', 'Affiliations', 'Reprint Addresses', 'Email Addresses',
       'Researcher Ids', 'ORCIDs', 'Funding Orgs', 'Funding Name Preferred',
       'Funding Text', 'Cited References', 'Cited Reference Count',
       'Times Cited, WoS Core', 'Times Cited, All Databases',
       '180 Day Usage Count', 'Since 2013 Usage Count', 'Publisher',
       'Publisher City', 'Publisher Address', 'ISSN', 'eISSN', 'ISBN',
       'Journal Abbreviation', 'Journal ISO Abbreviation', 'Publication Date',
       'Publication Year', 'Volume', 'Issue', 'Pa

In [13]:
# groq
client = Groq(
    api_key=GROQ_API_KEY,
)

# respostas
responses_list = []

# todos os abstracts
total = len(df['Abstract'])
total_time = 0


for index, row in df.iterrows():
    start_time = time.time()

    # tempo
    print('\n', index, 'of', total, 'remaining estimated time', (total-index)*(total_time/(index+1)))

    # chat request
    completion = client.chat.completions.create(
        model="llama3-8b-8192",
        messages=[
            {
                "role": "system",
                "content": "RolePlay as a bot seeking for polymers of the type OMIEC, also known as Organic Mixed Ionic Electronic Conductors."
            },
            {
                "role": "user",
                "content": "For the text I will input next, output only the name of polymers of the type OMIEC, also known as Organic Mixed Ionic Electronic Conductors, \
                that are present in the text. If any polymer that satisfies the condition is found in the text, the output must be only the \
                polymer material names separated by semicolons. If no polymer that satisfies the condition is found in the text, the output \
                should be 'None'. Do not repeat the polymer name more than once. The output should not include the type of material. \
                I want to use the output for future data analysis, so do not output anything else besides the exact names of the polymers as they appear in the text." 
            },
            {
                "role": "assistant",
                "content": "Hello, I will tell you the names of OMIEC-type polymers mentioned in any text you input."
            },
            {
                "role": "user",
                "content": f"{row['Abstract']}"  # Use f-string to format the abstract
            }
        ],
        temperature=1,
        max_tokens=1024,
        top_p=1,
        stream=True,
        stop=None,
    )

    # coletar respostas
    response = ""

    # Stream the completion result and print the content
    for chunk in completion:
        response += chunk.choices[0].delta.content or ""
        print(chunk.choices[0].delta.content or "", end="")

    print()
    
    # listas de respostas
    responses_list.append({ 'index': index, 'response': response })

    # tempo total
    total_time += time.time() - start_time
    


 0 of 875 remaining estimated time 0.0
PEDOT

 1 of 875 remaining estimated time 291.65010237693787
None

 2 of 875 remaining estimated time 310.82285141944885
PEDOT:PolyDADMA; PEDOT:PolyDADMA FSI; PEDOT:PolyDADMA TFSI

 3 of 875 remaining estimated time 327.06704807281494
conducting/semiconducting conjugated polymers

 4 of 875 remaining estimated time 322.3496427536011
PEDOT:PSS

 5 of 875 remaining estimated time 326.36190533638
organic mixed ionic-electronic conducting polymer;

 6 of 875 remaining estimated time 322.7430026871818
PEDOT;PSS

 7 of 875 remaining estimated time 318.8837904930115
Polymer names: Poly(3,4-ethylenedioxythiophene) (PEDOT); Poly(3-hexylthiophene) (P3HT)

 8 of 875 remaining estimated time 321.70966355005896
PEO-PEDOT; PEDOT

 9 of 875 remaining estimated time 324.87000064849855
None

 10 of 875 remaining estimated time 321.00769899108195
PEDOT:PSS

 11 of 875 remaining estimated time 318.8261432647705
Polymer; OMIECs

 12 of 875 remaining estimated time 3

In [14]:
# DataFrame
df_with_responses = pd.DataFrame(responses_list)

# salvar para .csv
df_with_responses.to_excel('../../dataframes/1_OMIEC_RESPONSES.xlsx', index=False)

#### Playground

In [15]:
df_with_responses

Unnamed: 0,index,response
0,0,PEDOT
1,1,
2,2,PEDOT:PolyDADMA; PEDOT:PolyDADMA FSI; PEDOT:Po...
3,3,conducting/semiconducting conjugated polymers
4,4,PEDOT:PSS
...,...,...
870,870,
871,871,
872,872,
873,873,


In [16]:
result = pd.concat([df_with_responses, df], axis=1)
result.drop('index', axis=1, inplace=True)
result = result[result['response'] != 'None']
result = result.reset_index(drop=True)

In [220]:
result.to_excel('1_OMIEC_RESPONSES_output.xlsx', index=False)

In [214]:
polímeros = []
for response in result['response']:
    polímeros = polímeros + response.split(';')

In [215]:
polímeros_processado = [s.strip() for s in polímeros]

In [226]:
serie_polímeros = pd.Series(polímeros_processado)
polímeros_filtrados = serie_polímeros[~serie_polímeros.str.contains(r'OMIEC.*', regex=True)]
polímeros_filtrados = polímeros_filtrados[polímeros_filtrados != '']
polímeros_filtrados = polímeros_filtrados[~polímeros_filtrados.str.contains(r'(?i)\bnone\b', regex=True)]
polímeros_filtrados = polímeros_filtrados[~polímeros_filtrados.str.contains('conduct')]
polímeros_filtrados = polímeros_filtrados[~polímeros_filtrados.str.contains('conjug')]
polímeros_filtrados = polímeros_filtrados[~polímeros_filtrados.str.contains('cellul')]
polímeros_filtrados = polímeros_filtrados[~polímeros_filtrados.str.contains(r'(?i)\bpolymer\b', regex=True)]
polímeros_filtrados = polímeros_filtrados[~polímeros_filtrados.str.contains(r'(?i)\bmixed\b', regex=True)]

In [248]:
replacements = {
    r'.*\((PEDOT:PSS)\)': r'\1',
    r'.*\((PEDOT)\)': r'\1',
    r'.*\((PEDOT-P)\)': r'\1',
    r'.*\((PEDOT-T)\)': r'\1',
    r'poly\(3,4-ethylenedioxythiophene\)': 'PEDOT',
    r'.*\((P3HT)\)': 'PEDOT',
    r'.*\((PVDF)\)': 'PEDOT',
    r'PEDOT:polystyrenesulfonate': 'PEDOT:PSS',
    r'PEDOT:polystyrene sulfonate': 'PEDOT:PSS',
    r'PEDOT\s*:\s*PSS': 'PEDOT:PSS',
    r'PEDOT\/PSS': 'PEDOT:PSS',
    r'PEDOT\/?poly\(4-styrenesulfonate\)': 'PEDOT:PSS',
    r'PEDOT\s*[-]\s*PSS': 'PEDOT:PSS'


}

In [249]:
for pattern, replacement in replacements.items():
    polímeros_filtrados= polímeros_filtrados.str.replace(pattern, replacement, regex=True, case=False)

In [250]:
len(serie_polímeros)

683

In [251]:
len(polímeros_filtrados.unique())

432

In [252]:
polímeros_filtrados.unique()

array(['PEDOT', 'PEDOT:PolyDADMA', 'PEDOT:PolyDADMA FSI',
       'PEDOT:PolyDADMA TFSI', 'PEDOT:PSS', 'PSS', 'PEO-PEDOT',
       'PANI/PSS', 'poly( vinylferrocene) (PVF)', 'P(3-MT-TTF)', 'PTFE',
       'P(VDF-TrFE)', 'PBTTT', 'PVDF', 'P3HT', 'F4TCNQ', 'BPEEPTF', 'TTF',
       'Bipolar polymers.', 'PEDOT-T', 'PEDOT-P',
       "Poly(3,3'-dialkylimidazolium triflate)",
       'Poly(tetraphenylphosphonium tetracyanometate)',
       'Poly(3-methylthiophene)', 'Poly(o-phenylenediamine)',
       'Poly(3-phenyl-phenylene sulfide)', 'EDOTS', 'EDOTCOOH', 'PProDOT',
       'Polymerized ionic liquid', 'PMDT-TTF', 'TEDFE', 'PT-CSA-O-PTH',
       'PT-TSPT', 'HPTS-PDBT.', 'EG/GOPS-PEDOT:PSS', 'pg2T-TT',
       'f-BTI2g-TVTF', 'f-BTI2g-TVTCl', 'Polyparaphenylene',
       'Ni-3(HITP)(2)', 'PEDOT-Cl', '[C(2)mpyr][FSI]', 'poly-FADS',
       'gNR-Bu', 'PPSS', 'PPTFSI', 'f-BTI2g-SVSCN', 'f-BSeI2g-SVSCN',
       'PyQt', 'PBTTT-C8', 'OMEC-5', '(C16H33CH2CH2CO2)2Sn', 'DTPT',
       'PTAA-DMB', 'OMEM', 'OMEC-3

In [257]:
df_polímeros = pd.DataFrame(polímeros_filtrados, columns=['Polímeros'])

In [258]:
contagem = df_polímeros['Polímeros'].value_counts()

In [259]:
contagem.head(30)

Polímeros
PEDOT:PSS               65
PEDOT                   34
P3HT                    18
TTF-CA                   6
PEO                      6
polythiophene            4
PANI                     4
MEH-PPV                  4
BBL                      3
PSS                      3
PTAA                     3
Polyaniline              3
PPV                      2
Polypyrrole              2
P3MEEET                  2
LiTFSI                   2
CH3NH3PbI3               2
poly(ethylene oxide)     2
Homo-gDPP                2
NDI-T2                   2
PSSNa                    2
Spiro-OMeTAD             2
PPy                      2
g4T2-T2                  2
(BEDT-TTF)               2
PTEO                     2
Polythiophene            2
PTh                      2
p(g2T-T)                 2
P3T                      2
Name: count, dtype: int64

In [260]:
np.unique(df_polímeros[df_polímeros['Polímeros'].str.contains(r'PEDOT', regex=True)].values.tolist())


array(['EG/GOPS-PEDOT:PSS', 'PEDOT', 'PEDOT-Cl', 'PEDOT-MoS2', 'PEDOT-P',
       'PEDOT-PBA', 'PEDOT-T', 'PEDOT-b-PEG',
       'PEDOT-cobalt(II) hexylridge', 'PEDOT-silver perchlorate',
       'PEDOT:PSS', 'PEDOT:PSS.', 'PEDOT:PSS/SPEEK', 'PEDOT:PolyDADMA',
       'PEDOT:PolyDADMA FSI', 'PEDOT:PolyDADMA TFSI', 'PEDOT:Tos',
       'PEO-PEDOT', 'МIP-PEDOT-PBA'], dtype='<U27')

In [262]:
df_polímeros.to_excel('1_OMIEC_RESPONSES_output_preprocessed.xlsx', index=False)

In [None]:
df