# <u>Convert GComp.csv file to composition table as is to use by Pandas dataframe (mol%)</u>

### Some notes here:
### (1) SciGlass mdb files are publised by EPAM under ODC Open Database License (ODbL). Use the information and codes below following the EPAM's license and also at your own risk. Understand that there is no guarantee for any problems that may occur.
### (2) The following code is written to work with csv files. Access 2.0 mdb files published by EPAM should be converted to csv files and then executed. Please convert the mdb files to csv files first and use them.
### (3) File names and contents (presumed)
    - Gcomp:      composition data
    - SciGK:      property data together with typical compositions (wt% and mol%)
    - Reference:  source data such as journals or patents
    - Kod2Ref:    connection keys from "kod" to "Refer_ID" to quote source data in the "Reference" file

### (4) You need only "GComp.csv" file in this notebook.
<br>

In [1]:
# Import libralies

import pandas as pd
import numpy as np

In [2]:
# Load "GComp.csv" file

df = pd.read_csv('data_SciGlass/GComp.csv')
print(df.shape)
df.head()

(422879, 3)


Unnamed: 0,Kod,GlasNo,Composition
0,204,20000,SiO260.08100.100.
1,205,20001,SiO260.0835.945.35P2O5141.9421.2311.35...
2,205,20002,SiO260.0830.1739.99P2O5141.9420.1711.3...
3,205,20003,SiO260.0839.48.64P2O5141.9416.998.97Z...
4,205,20004,SiO260.0838.3348.24P2O5141.9416.088.57...


In [3]:
# Check the data in the 2nd row and the 3rd column as an example

comp_example_1 = df.iloc[1, 2]
comp_example_1

'\x7fSiO2\x7f60.08\x7f35.9\x7f45.35\x7fP2O5\x7f141.94\x7f21.23\x7f11.35\x7fZrO2\x7f123.22\x7f15.12\x7f9.31\x7fNa2O\x7f61.98\x7f27.76\x7f33.99\x7f'

## The example above shows that element, molecular mass, wt%, and mol% are lined up in one set between the separater "\x7f". Then split them by the separater.

In [4]:
# split by "\x7f"

comp_0 = df['Composition'].str.split('\x7f')
comp_0

0                             [, SiO2, 60.08, 100., 100., ]
1         [, SiO2, 60.08, 35.9, 45.35, P2O5, 141.94, 21....
2         [, SiO2, 60.08, 30.17, 39.99, P2O5, 141.94, 20...
3         [, SiO2, 60.08, 39., 48.64, P2O5, 141.94, 16.9...
4         [, SiO2, 60.08, 38.33, 48.24, P2O5, 141.94, 16...
                                ...                        
422874    [, SiO2, 60.08, 55.26, 62.01, Al2O3, 101.96, 1...
422875    [, SiO2, 60.08, 64., 72.95, Al2O3, 101.96, 16....
422876    [, SiO2, 60.08, 61.9, 71.78, Al2O3, 101.96, 15...
422877    [, SiO2, 60.08, 58.2, 66.83, Al2O3, 101.96, 29...
422878    [, SiO2, 60.08, 62.2, 68.28, Al2O3, 101.96, 20...
Name: Composition, Length: 422879, dtype: object

In [5]:
# Check the data in the 2nd row and the 3rd column as an example

comp_ex = list(comp_0[1])
print(comp_ex)
print(len(comp_ex))

['', 'SiO2', '60.08', '35.9', '45.35', 'P2O5', '141.94', '21.23', '11.35', 'ZrO2', '123.22', '15.12', '9.31', 'Na2O', '61.98', '27.76', '33.99', '']
18


In [6]:
# Extract composition and mol% data utilizing the remainder divided by 4

print('Composition,  mol%')
for i in range(len(comp_ex)//4):
    print(comp_ex[i * 4 + 1], comp_ex[i * 4 + 4])

Composition,  mol%
SiO2 45.35
P2O5 11.35
ZrO2 9.31
Na2O 33.99


## Now element, molecular mass, wt%, and mol% can be extraced.

In [7]:
# Check the number of data in the longest row.

max_compostion_num = 0
line_length = []

for j in list(comp_0):
    if len(j) > max_compostion_num:
        max_compostion_num = len(j)
    line_length.append(len(j))

print(max_compostion_num)

162


## The longest row in the "GComp" contains 40 compositions (162 devided by 4) and more than the number of composition in "SciGK" (that is only 17). "GComp" should be used as the composition data, so make the matrix that composition forms a column, and glass forms a row with "GlasNo" (glass ID) and "Kod" (Reference_ID).

In [8]:
# Get the number of the lomgest row

max_idx = np.argmax(line_length)
max_idx

275302

In [9]:
# Check the comosition and mol% in the longest row and its total

comp_longest = comp_0[max_idx]
comp_total = 0
print('Composition,  mol%\n')

for i in range(len(comp_longest)//4):
    print(comp_longest[i * 4 + 1], comp_longest[i * 4 + 4])
    comp_total = comp_total + float(comp_longest[i * 4 + 4])
    
print('\nTotal mol% = ', comp_total)

Composition,  mol%

SiO2 51.33
B2O3 13.01
Al2O3 1.57
Li2O 5.94
Na2O 14.49
MgO 2.92
CaO 5.01
TiO2 0.77
Se 0.013
Rb2O 0.018
SrO 0.058
Y2O3 0.035
ZrO2 0.32
MoO3 0.38
RuO2 0.24
Rh2O3 0.031
PdO 0.13
Ag2O 0.006
CdO 0.008
SnO 0.006
Sb2O3 0.
TeO2 0.049
Cs2O 0.13
BaO 0.18
La2O3 0.08
CeO2 0.12
Pr2O3 0.055
Nd2O3 0.24
Sm2O3 0.044
Eu2O3 0.005
Gd2O3 0.011
UO3 0.27
Cr2O3 0.2
MnO 0.3
FeO 1.59
NiO 0.24
CuO 0.002
ZnO 0.001
K2O 0.03
P2O5 0.16

Total mol% =  99.99199999999998


## Make a list of element contained in "GComp" without any omission and duplication

In [10]:
composition_items = []

for k in range(len(df)):
    composition_k = list(comp_0[k])
    for l in range(len(composition_k)//4):
        composition_item_kl = composition_k[l * 4 + 1]
        composition_items.append(composition_item_kl)             # Add composition name to the list
        composition_items_update = set(composition_items)         # Delete the duplication
        composition_items = list(composition_items_update)        # Convert back to list
    
    if k%50000 == 0:                                              # Show the transaction status
        print('row number in transaction :  ', k)

#composition_items_update

print('\ncomplete!')

row number in transaction :   0
row number in transaction :   50000
row number in transaction :   100000
row number in transaction :   150000
row number in transaction :   200000
row number in transaction :   250000
row number in transaction :   300000
row number in transaction :   350000
row number in transaction :   400000

complete!


In [11]:
# Check the element names in the list made above

composition_items_sorted = sorted(composition_items)
print('Number of elements (all) =  ', len(composition_items_sorted), '\n')
print(*composition_items_sorted)

Number of elements (all) =   726 

 (NH4)2SO4 (NH4)3PO4 Ag Ag2CO3 Ag2MoO4 Ag2O Ag2S Ag2SO4 Ag2Se Ag2Se5 Ag2Te Ag4SSe AgAsS2 AgBr AgCl AgF AgGaS2 AgI AgNO3 AgPO3 Al Al(PO3)3 Al2(SO4)3 Al2N3 Al2O3 Al2O3+Fe2O3 Al2S3 Al2Y2O6 AlCl3 AlF3 AlN AlPO4 Am Am2O3 AmO2 Ar As As2O3 As2O5 As2S3 As2S5 As2Se3 As2Se5 As2Te As2Te3 As2Te5 AsBr3 AsF5 AsI3 AsS AsS2 AsSBr AsSI AsSe AsSe2 AsSeI AsTe AsTe3 Au Au2O Au2O3 AuCl AuCl3 B B2O3 B2S3 B2Se3 BF3 BN BOF BPO4 BS2 Ba Ba(H2PO4)2 Ba(PO3)2 Ba3N2 BaB2O4 BaBr2 BaCl2 BaF2 BaGeO3 BaHPO4 BaI2 BaO BaO2 BaPO3F BaS BaSO4 BaSe Be BeF2 BeO BeSO4 Bi Bi2O3 Bi2O5 Bi2S3 Bi2Se3 Bi2Te3 BiBr3 BiCl3 BiF3 BiI3 BiNbO4 BiOBr BiOCl BiOF BiPO4 BiTe Br C C2H5OH C6H12O6 CO2 Ca Ca(NO3)2 Ca3N2 CaBr2 CaC2 CaCO3 CaCl2 CaF2 CaI2 CaO CaO+MgO CaS CaSO4 Cd Cd(NO3)2 CdAs2 CdBr2 CdCl2 CdF2 CdGeO3 CdI2 CdO CdS CdSO4 CdSe CdTe Ce Ce2O3 Ce2S3 CeCl3 CeF3 CeF4 CeO CeO2 CeSe Cl Cl2 Co Co2O3 Co3O4 CoBr2 CoCl2 CoF2 CoO CoS CoSO4 Cr Cr2O3 Cr2Se3 Cr3O4 CrCl3 CrF3 CrO CrO3 Cs Cs2O Cs2S Cs2SO4 CsBr CsCl Cs

## 726 element are contained, but it includes symbols like + or R2O.

In [12]:
# Check elements that include the symbol "+" 

[s for s in composition_items_sorted if '+' in s]

['Al2O3+Fe2O3',
 'CaO+MgO',
 'FeO+Fe2O3',
 'HF+H2O',
 'Li2O+Na2O+K2O',
 'MoO3+WO3',
 'Na2O+K2O']

In [13]:
# Check elements that include the symbol "R"

[s for s in composition_items_sorted if 'R' in s]

['R2O',
 'R2O3',
 'RO',
 'Ra',
 'Rb',
 'Rb2O',
 'Rb2S',
 'Rb2SO4',
 'RbBr',
 'RbCl',
 'RbF',
 'RbI',
 'RbNO3',
 'RbV2O5',
 'Re2O7',
 'ReO3',
 'Rh',
 'Rh2O3',
 'RhO2',
 'RmOn',
 'Ru',
 'RuO2']

## The column list that contains all composition is made, so make the matrix with "GlasNo" and "Kod".

In [14]:
# Extract Kod and GlasNo columns from the original dataframe

df_rebuild_0 = df.iloc[:, :2]
df_rebuild_0 = df_rebuild_0.astype(np.int32)
df_rebuild_0

Unnamed: 0,Kod,GlasNo
0,204,20000
1,205,20001
2,205,20002
3,205,20003
4,205,20004
...,...,...
422874,44933,611694
422875,44933,611695
422876,44933,611696
422877,44933,611697


## Create a matrix (422879 x 726) with the number of rows in the "GComp", the columns of all elements, and zeros. (The zeroth column was blank and now zeros, but ignore because of no impact.)

In [15]:
df_composition_mol = pd.DataFrame(np.zeros(len(df) * len(composition_items_sorted)).reshape(len(df), len(composition_items_sorted)),
                                 columns = composition_items_sorted)
print(df_composition_mol.shape)
df_composition_mol.head()

(422879, 726)


Unnamed: 0,Unnamed: 1,(NH4)2SO4,(NH4)3PO4,Ag,Ag2CO3,Ag2MoO4,Ag2O,Ag2S,Ag2SO4,Ag2Se,...,ZnS,ZnSO4,ZnSe,ZnTe,Zr,ZrF4,ZrO2,ZrS2,ZrSe2,ZrSiO4
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Place the mol% data in the appropriate places in the table (it takes some time)

In [16]:
import time
t1 = time.time()

# Place the numbers

for i in range (len(df)):
    for j in range(len(comp_0[i])//4):
        composition = comp_0[i][j * 4 + 1]
        value = comp_0[i][j * 4 + 4]                               # value of mol%
        df_composition_mol.at[i, composition] = float(value)       # Place them to the right places
       
t2 = time.time() 

print('Elapsed time = ', t2 - t1, '\n')
print(df_composition_mol.shape)
df_composition_mol.head()

Elapsed time =  24.025533199310303 

(422879, 726)


Unnamed: 0,Unnamed: 1,(NH4)2SO4,(NH4)3PO4,Ag,Ag2CO3,Ag2MoO4,Ag2O,Ag2S,Ag2SO4,Ag2Se,...,ZnS,ZnSO4,ZnSe,ZnTe,Zr,ZrF4,ZrO2,ZrS2,ZrSe2,ZrSiO4
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,9.31,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,15.3,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,10.95,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,12.59,0.0,0.0,0.0


## Combine the composition data horizontally with the "Kod" and "GlasNo" tables created earlier.

In [17]:
df_SciGlass_mol = df_rebuild_0.join(df_composition_mol)
print(df_SciGlass_mol.shape)
df_SciGlass_mol

(422879, 728)


Unnamed: 0,Kod,GlasNo,Unnamed: 3,(NH4)2SO4,(NH4)3PO4,Ag,Ag2CO3,Ag2MoO4,Ag2O,Ag2S,...,ZnS,ZnSO4,ZnSe,ZnTe,Zr,ZrF4,ZrO2,ZrS2,ZrSe2,ZrSiO4
0,204,20000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.0,0.0
1,205,20001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,9.310,0.0,0.0,0.0
2,205,20002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,15.300,0.0,0.0,0.0
3,205,20003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,10.950,0.0,0.0,0.0
4,205,20004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,12.590,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
422874,44933,611694,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.0,0.0
422875,44933,611695,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.056,0.0,0.0,0.0
422876,44933,611696,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.057,0.0,0.0,0.0
422877,44933,611697,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.0,0.0


## Check if the process was done correctly

In [18]:
# Check duplicates ->　No duplicate

df_overlap_all = df_SciGlass_mol[df_SciGlass_mol.duplicated(keep = False)]
print(df_overlap_all.shape)
df_overlap_all.head(40)

(0, 728)


Unnamed: 0,Kod,GlasNo,Unnamed: 3,(NH4)2SO4,(NH4)3PO4,Ag,Ag2CO3,Ag2MoO4,Ag2O,Ag2S,...,ZnS,ZnSO4,ZnSe,ZnTe,Zr,ZrF4,ZrO2,ZrS2,ZrSe2,ZrSiO4


In [19]:
# Compare the unique number of "GlasNo" and the number of rows

print("Number of rows in total   = ", len(df_SciGlass_mol))
print("Unique number of 'GlasNo' = ", df_SciGlass_mol['GlasNo'].nunique())
print("The gap                   = ", len(df_SciGlass_mol) - df_SciGlass_mol['GlasNo'].nunique())

Number of rows in total   =  422879
Unique number of 'GlasNo' =  420731
The gap                   =  2148


In [20]:
# Check the place where "GlasNo"  are overlapped

df_overlap = df_SciGlass_mol[df_SciGlass_mol.duplicated(subset = ['GlasNo'], keep = False)]
print(df_overlap.shape)
df_overlap.head()

(4225, 728)


Unnamed: 0,Kod,GlasNo,Unnamed: 3,(NH4)2SO4,(NH4)3PO4,Ag,Ag2CO3,Ag2MoO4,Ag2O,Ag2S,...,ZnS,ZnSO4,ZnSe,ZnTe,Zr,ZrF4,ZrO2,ZrS2,ZrSe2,ZrSiO4
2209,375,21951,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2210,375,21952,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2211,375,21953,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2212,375,21954,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2213,375,21955,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## This is not enough to understand, so check the overlaps in the original table as well. (Sorted by "GlasNo")

In [21]:
df_overlap_s = df_overlap.sort_values('GlasNo')
overlap_idx_list = df_overlap_s.index
df_overlap = df.loc[overlap_idx_list]

pd.set_option("display.max_colwidth", 300)
print(df_overlap.shape)
df_overlap[0:40]

(4225, 3)


Unnamed: 0,Kod,GlasNo,Composition
21872,3203,510,SiO260.0875.64.72CaO56.0810.9.25Li2O29.8815.26.03
21863,3202,510,SiO260.0875.64.72CaO56.0810.9.25Li2O29.8815.26.03
21881,3204,518,SiO260.0875.74.81CaO56.0810.10.69Na2O61.9815.14.5
21871,3202,518,SiO260.0875.74.81CaO56.0810.10.69Na2O61.9815.14.5
21880,3203,526,SiO260.0875.78.71CaO56.0810.11.25K2O94.215.10.04
21888,3204,526,SiO260.0875.78.71CaO56.0810.11.25K2O94.215.10.04
21889,3205,526,SiO260.0875.78.71CaO56.0810.11.25K2O94.215.10.04
9764,1887,667,SiO260.0865.6875.K2O94.234.3225.
86226,9853,667,K2O94.234.3225.SiO260.0865.6875.
86254,9854,667,K2O94.234.3225.SiO260.0865.6875.


## They were classified by the combination of "Kod" and "GlasNo", so no problems with the allocation

## Check if the numbers are applied correctly

In [22]:
# Typical compositions found in the "SciGK" file

main_element = ['Kod', 'GlasNo','SiO2', 'Al2O3', 'B2O3', 'CaO', 'K2O', 'Na2O', 'PbO', 'Li2O', 'MgO', 'SrO', 'BaO', 'ZnO']

In [23]:
# The same compositions from the dataframe made from the "GComp" file

df_SciGlass_mol_major = df_SciGlass_mol[main_element]
print(df_SciGlass_mol_major.shape)
df_SciGlass_mol_major.head()

(422879, 14)


Unnamed: 0,Kod,GlasNo,SiO2,Al2O3,B2O3,CaO,K2O,Na2O,PbO,Li2O,MgO,SrO,BaO,ZnO
0,204,20000,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,205,20001,45.35,0.0,0.0,0.0,0.0,33.99,0.0,0.0,0.0,0.0,0.0,0.0
2,205,20002,39.99,0.0,0.0,0.0,0.0,33.4,0.0,0.0,0.0,0.0,0.0,0.0
3,205,20003,48.64,0.0,0.0,0.0,0.0,31.43,0.0,0.0,0.0,0.0,0.0,0.0
4,205,20004,48.24,0.0,0.0,0.0,0.0,30.6,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
# Load the "SciGK" file

df_scigk = pd.read_csv('data_SciGlass/SciGK.csv', encoding = 'latin1')
print(df_scigk.shape)
df_scigk.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


(422949, 186)


Unnamed: 0,KOD,GLASNO,A_WT,M_WT,SIO2,AL2O3,B2O3,CAO,K2O,NA2O,...,Prop_Code,GForm,any_n,Trademark,Tmax,Vmax,Tn,Io,tcr,tx
0,204,20000,20.02,60.08,100.0,0.0,0.0,0.0,0.0,0.0,...,400 400,1,,,,,,,,
1,205,20001,21.77,71.76,45.35,0.0,0.0,0.0,0.0,33.99,...,130 420 700,1,,,,,,,,
2,205,20002,23.06,79.64,39.98,0.0,0.0,0.0,0.0,33.39,...,130 420 700,1,,,,,,,,
3,205,20003,22.31,74.93,48.64,0.0,0.0,0.0,0.0,31.43,...,130 420 700,1,,,,,,,,
4,205,20004,22.62,75.62,48.24,0.0,0.0,0.0,0.0,30.6,...,130 420 700,1,,,,,,,,


In [25]:
# Determine the column number to extract the same compositions from the "SciGK" file

slice_1 = df_scigk.columns.get_loc('SIO2')
slice_2 = df_scigk.columns.get_loc('ZNO')
print(slice_1, slice_2)

4 15


In [26]:
# Extract the composition range from the "SciGK" file determined above and change the column to the same name as the new table

scigk_columns_to_check = ['KOD', 'GLASNO']

for i in range(slice_1, slice_2 + 1):
    scigk_columns_to_check.append(df_scigk.columns[i])
print(scigk_columns_to_check)
df_scigk_check = df_scigk[scigk_columns_to_check]
df_scigk_check.columns = main_element

df_scigk_check.iloc[:, :2] = df_scigk_check.iloc[:, :2].astype(np.int32)
df_scigk_check.iloc[:, 2:] = df_scigk_check.iloc[:, 2:].astype(np.float64)

df_scigk_check.head()

['KOD', 'GLASNO', 'SIO2', 'AL2O3', 'B2O3', 'CAO', 'K2O', 'NA2O', 'PBO', 'Li2O', 'MgO', 'SRO', 'BAO', 'ZNO']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, val, pi)


Unnamed: 0,Kod,GlasNo,SiO2,Al2O3,B2O3,CaO,K2O,Na2O,PbO,Li2O,MgO,SrO,BaO,ZnO
0,204,20000,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,205,20001,45.35,0.0,0.0,0.0,0.0,33.99,0.0,0.0,0.0,0.0,0.0,0.0
2,205,20002,39.98,0.0,0.0,0.0,0.0,33.39,0.0,0.0,0.0,0.0,0.0,0.0
3,205,20003,48.64,0.0,0.0,0.0,0.0,31.43,0.0,0.0,0.0,0.0,0.0,0.0
4,205,20004,48.24,0.0,0.0,0.0,0.0,30.6,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
# Determine the row number to compare

rows_to_see = 500

In [28]:
# 5 rows in the "GComp"

print('5 examples from Gcomp mol%')
df_SciGlass_mol_major.loc[int(rows_to_see):int(rows_to_see)+4]

5 examples from Gcomp mol%


Unnamed: 0,Kod,GlasNo,SiO2,Al2O3,B2O3,CaO,K2O,Na2O,PbO,Li2O,MgO,SrO,BaO,ZnO
500,233,123778,74.37,0.0,0.0,9.03,0.63,15.14,0.0,0.0,0.0,0.0,0.0,0.0
501,233,123779,74.06,0.0,0.0,8.99,0.63,15.08,0.0,0.0,0.0,0.0,0.0,0.0
502,233,123780,73.75,0.0,0.0,8.96,0.63,15.01,0.0,0.0,0.0,0.0,0.0,0.0
503,233,123781,73.45,0.0,0.0,8.92,0.62,14.95,0.0,0.0,0.0,0.0,0.0,0.0
504,233,123782,74.43,0.0,0.0,9.04,0.63,15.15,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
# "5 rows in the SciGK"

print('5 examples from SciGK mol%')
df_scigk_check.loc[int(rows_to_see):int(rows_to_see)+4]

5 examples from SciGK mol%


Unnamed: 0,Kod,GlasNo,SiO2,Al2O3,B2O3,CaO,K2O,Na2O,PbO,Li2O,MgO,SrO,BaO,ZnO
500,233,123778,74.36,0.0,0.0,9.03,0.63,15.13,0.0,0.0,0.0,0.0,0.0,0.0
501,233,123779,74.05,0.0,0.0,8.99,0.62,15.07,0.0,0.0,0.0,0.0,0.0,0.0
502,233,123780,73.75,0.0,0.0,8.95,0.62,15.01,0.0,0.0,0.0,0.0,0.0,0.0
503,233,123781,73.44,0.0,0.0,8.91,0.62,14.95,0.0,0.0,0.0,0.0,0.0,0.0
504,233,123782,74.42,0.0,0.0,9.03,0.63,15.15,0.0,0.0,0.0,0.0,0.0,0.0


## Some disparities are found in the second decimal, but composition data look retrieved correctly from the "GComp" file, so save the file (it takes some time)

In [30]:
df_SciGlass_mol.to_csv('data_SciGlass/SciGlass_comp_mol.csv', index = None)