In [1]:
import pandas as pd
from tqdm import tqdm, tqdm_notebook
from tqdm._utils import _term_move_up
from copy import copy
import tempfile

# Silence pandas' chained-assignment warning for the whole session
pd.set_option("mode.chained_assignment", None)

In [2]:
#### File ####

# Path of the input CSV file
file_directory = "../sample-dataset/sample.csv"

# Name of the input file: the text after the last "/" of the path
file_name = file_directory.rsplit("/", 1)[-1]

In [3]:
#### Global variables ####

# Source data, read once from the configured CSV path
df_dataset = pd.read_csv(file_directory)

# Working copy whose first column gets overwritten with the encoded strings
temp_set = copy(df_dataset)

# The first column is the one the work functions read from and write to
column_comp = temp_set.columns[0]

# Reserve a temporary file path.
# tempfile.mktemp() is deprecated: it only returns a name, so another process
# can create that path first. NamedTemporaryFile(delete=False) creates the file
# atomically; it is closed right away so it can be reopened by path later.
with tempfile.NamedTemporaryFile(delete=False) as _reserved_temp:
    file_temp = _reserved_temp.name

In [4]:
#### 분리를 위한 함수들 ####

# is_int(var) - check whether a value (here: one character of a formula) converts to an integer
def is_int(var):
    """Return True when ``int(var)`` succeeds, False when the conversion fails.

    Parameters:
        var: the value to test; callers in this file pass single characters.

    Returns:
        bool: True if ``int(var)`` did not raise, False if it raised
        TypeError, ValueError or OverflowError.

    Only conversion failures are caught, so KeyboardInterrupt and SystemExit
    propagate instead of being reported as "not an integer".
    """
    try:
        int(var)  # attempt the conversion; the converted value itself is not needed
    except (TypeError, ValueError, OverflowError):
        return False
    return True

# element_position_finder(compound) - locate the non-numeric characters of a formula string
def element_position_finder(compound):
    """Return the indices of the characters that are neither '.' nor integers.

    Parameters:
        compound: the formula string to scan.

    Returns:
        list of int: ascending indices ``i`` for which ``compound[i]`` is not
        '.' and ``is_int(compound[i])`` is not True.
    """
    return [
        index
        for index, char in enumerate(compound)
        if char != '.' and not is_int(char)
    ]
            
# constant_split(comp, pos) - extract the numeric constants from a formula string
def constant_split(comp, pos):
    """Return one constant string per element symbol found in ``comp``.

    Parameters:
        comp: the formula string.
        pos: ascending indices of the non-numeric characters of ``comp``
            (the list produced by element_position_finder).

    Returns:
        list of str: for each symbol, the text between its last character and
        the next symbol (or the end of the string). The entry is an empty
        string when nothing follows the symbol, e.g. ``["", ""]`` for "CuO".

    Symbols are grouped the same way element_split groups them: two adjacent
    positions form one two-character symbol, anything else is a one-character
    symbol. Keeping both functions on the same grouping guarantees that the
    result has exactly one entry per element, so joiner can pair them by index.
    Previously only one constant was emitted per run of adjacent letters, which
    left joiner short of constants for formulas such as "CuO" or "NaCl".
    """
    constant = []
    last = len(pos) - 1
    i = 0
    while i <= last:
        # Adjacent positions belong to one two-character symbol: move to its
        # second character so the constant starts after the whole symbol.
        if i < last and pos[i + 1] - pos[i] == 1:
            i += 1
        start = pos[i] + 1
        if i < last:
            constant.append(comp[start:pos[i + 1]])
        else:
            # Last symbol: its constant is whatever remains of the string.
            constant.append(comp[start:])
        i += 1
    return constant

        
# element_split(comp, pos) - extract the element symbols from a formula string
def element_split(comp, pos):
    """Return the element symbols of ``comp`` in order of appearance.

    Parameters:
        comp: the formula string.
        pos: ascending indices of the non-numeric characters of ``comp``.

    Returns:
        list of str: a two-character slice when the current position is
        directly followed by the next one (both positions are consumed),
        otherwise the single character at the current position.
    """
    symbols = []
    total = len(pos)
    cursor = 0
    while cursor < total:
        start = pos[cursor]
        has_adjacent_next = cursor + 1 < total and pos[cursor + 1] - start == 1
        if has_adjacent_next:
            # Two neighbouring letters are read as one two-letter symbol.
            symbols.append(comp[start:start + 2])
            cursor += 2
        else:
            symbols.append(comp[start])
            cursor += 1
    return symbols


# joiner(elements, constant) - interleave the element list and the constant list
# Final output: element1,const1,element2,const2,...
def joiner(elements, constant):
    """Return ``elements`` and ``constant`` interleaved as one comma-separated string.

    Parameters:
        elements: list of element symbol strings.
        constant: list of constant strings, indexed in step with ``elements``.

    Returns:
        str: "element1,const1,element2,const2,...".

    Raises:
        IndexError: if ``constant`` has fewer entries than ``elements``.
    """
    pieces = []
    for index, symbol in enumerate(elements):
        pieces.extend((symbol, constant[index]))
    return ','.join(pieces)

def encoder(comp):
    """Encode one formula string as "element1,const1,element2,const2,...".

    Parameters:
        comp: the formula string to encode.

    Returns:
        str: the result of joiner applied to the symbols and constants that
        element_split and constant_split extract from ``comp``.
    """
    # Both splitters work from the same list of non-numeric character positions.
    letter_positions = element_position_finder(comp)
    constants = constant_split(comp, letter_positions)
    symbols = element_split(comp, letter_positions)
    return joiner(symbols, constants)

In [5]:
#### String clean-up function ####
def revise_func(f_temp):
    """Rewrite the temporary CSV as the final text file under ../output/.

    Parameters:
        f_temp: path of the temporary CSV file to read.

    Side effects:
        Writes "../output/<file_name with .csv replaced by .txt>" and prints
        the output location.

    Transformations:
        * header line: every "<column_comp>," becomes "<column_comp> ";
        * every other line: each '",' becomes a space and every remaining
          double quote is removed.

    Every line read from the file is processed, instead of counting rows of
    the global data frame, so the output can neither be truncated nor index
    past the end of the file. Both files are opened as UTF-8 because the work
    functions in this notebook create the temporary file with
    DataFrame.to_csv, whose default encoding is UTF-8; the platform default
    encoding may differ.
    """
    file_output = file_name.replace(".csv", ".txt")

    # The with-blocks close the files; no explicit close() is needed.
    with open(f_temp, "rt", encoding="utf-8") as file_in:
        lines = file_in.readlines()

    with open("../output/" + file_output, "wt", encoding="utf-8") as file_out:
        if lines:
            file_out.write(lines[0].replace(column_comp + ",", column_comp + " "))
        for line in lines[1:]:
            line = line.replace("\",", " ")
            line = line.replace("\"", "")
            file_out.write(line)

    print("\noutput/%s" %file_output)

In [6]:
#### 총 작업 함수 ####

# original - plain console output, no progress bar
def work_func():
    """Encode every row of the formula column, then write the output file.

    For each row the source value and a running "current/total" counter are
    printed on one carriage-returned line, the encoded string is stored in
    ``temp_set``, and the line is reprinted with the result. Afterwards
    ``temp_set`` is written to ``file_temp`` and passed to revise_func.

    The result is stored with ``temp_set.at[row, column]``. The chained form
    ``temp_set[column][row] = value`` assigns into an intermediate Series,
    which is not guaranteed to write through to ``temp_set`` (and never does
    under pandas Copy-on-Write).
    """
    total = len(temp_set.index)
    print('')
    for i in range(total):
        source = df_dataset.at[i, column_comp]
        print("%30s   =====>   %-60s   %6d/%-6d" %(source, "", i+1, total), end="\r")
        encoded = encoder(source)
        temp_set.at[i, column_comp] = encoded
        print("%30s   =====>   %-60s" %(source, encoded), end="\r")
    temp_set.to_csv(file_temp, index=False)
    revise_func(file_temp)
    

#### 진행률 표시가 포함된 총 작업 함수 ####

# jupyter notebook - with the tqdm notebook progress bar
def work_func_jn():
    """Encode every row of the formula column under a tqdm_notebook progress bar.

    For each row the source value is printed on a carriage-returned line, the
    encoded string is stored in ``temp_set``, and the line is reprinted with
    the result. Afterwards ``temp_set`` is written to ``file_temp`` and passed
    to revise_func.

    The result is stored with ``temp_set.at[row, column]``. The chained form
    ``temp_set[column][row] = value`` assigns into an intermediate Series,
    which is not guaranteed to write through to ``temp_set`` (and never does
    under pandas Copy-on-Write).
    """
    for i in tqdm_notebook(range(len(temp_set.index))):
        source = df_dataset.at[i, column_comp]
        print("%30s   =====>   %-60s" %(source, ""), end="\r")
        encoded = encoder(source)
        temp_set.at[i, column_comp] = encoded
        print("%30s   =====>   %-60s" %(source, encoded), end="\r")
    temp_set.to_csv(file_temp, index=False)
    revise_func(file_temp)

# cli - with tqdm progressbar
def work_func_cli():
    """Encode every row of the formula column under a terminal tqdm progress bar.

    Each status line is emitted through ``tqdm.write`` and prefixed with
    ``_term_move_up() + '\\r'``. For each row the source value is written, the
    encoded string is stored in ``temp_set``, and the line is written again
    with the result. Afterwards ``temp_set`` is written to ``file_temp`` and
    passed to revise_func.

    The result is stored with ``temp_set.at[row, column]``. The chained form
    ``temp_set[column][row] = value`` assigns into an intermediate Series,
    which is not guaranteed to write through to ``temp_set`` (and never does
    under pandas Copy-on-Write).
    """
    prefix = _term_move_up() + '\r'
    print('')
    for i in tqdm(range(len(temp_set.index))):
        source = df_dataset.at[i, column_comp]
        tqdm.write(prefix + "%30s   =====>   %-60s" %(source, ""))
        encoded = encoder(source)
        temp_set.at[i, column_comp] = encoded
        tqdm.write(prefix + "%30s   =====>   %-60s" %(source, encoded))
    temp_set.to_csv(file_temp, index=False)
    revise_func(file_temp)

In [7]:
# Encode the whole dataset with the plain-console variant (no progress bar).
work_func()


  La1.85Sr0.15Cu0.985Fe0.015O4   =====>   La,1.85,Sr,0.15,Cu,0.985,Fe,0.015,O,4                             231/231       

In [8]:
# Encode the whole dataset again, this time with the notebook progress bar.
work_func_jn()

HBox(children=(IntProgress(value=0, max=231), HTML(value='')))

  La1.85Sr0.15Cu0.985Fe0.015O4   =====>   La,1.85,Sr,0.15,Cu,0.985,Fe,0.015,O,4                           
