## Data Processing
This notebook process the OQMD data containing delta_e, volume_pa and bandgap containing 620K entries, 
after removing outliers [delta_e out outside (-20,5) and outside 5 std], we end up with 307K unique entries. 
We compute the physical attributes and elemental fractions, the whole processed data is saved at oqmd_all.csv. 
The train and test are available at respective csv files containing all, only physical or only fractions.

In [42]:
import re, numpy as np, os, sys, pandas
from pymatgen import Composition
import data_utils
import magpie
from pymatgen import Composition
from matminer.featurizers.base import MultipleFeaturizer
from matminer.featurizers import composition as cf
from matminer.utils.conversions import str_to_composition
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

In [2]:
oqmd_data_path = '../training-data/oqmd_all-22Mar18.csv'
oqmd_data = pandas.read_csv(oqmd_data_path, sep=r'\s*', na_values= 'None')
oqmd_data.info()




split() requires a non-empty pattern match.


split() requires a non-empty pattern match.



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 620196 entries, 0 to 620195
Data columns (total 7 columns):
comp         620196 non-null object
energy_pa    620189 non-null float64
volume_pa    620189 non-null float64
magmom_pa    455663 non-null float64
bandgap      619819 non-null float64
delta_e      620196 non-null float64
stability    581531 non-null float64
dtypes: float64(6), object(1)
memory usage: 33.1+ MB


In [3]:
oqmd_data.describe()

Unnamed: 0,energy_pa,volume_pa,magmom_pa,bandgap,delta_e,stability
count,620189.0,620189.0,455663.0,619819.0,620196.0,581531.0
mean,-5.434664,20.86548,0.406732,0.130374,0.007116,0.58325
std,2.615643,7.870184,0.556038,0.644729,1.996298,2.905439
min,-203.629754,2.72938,-4.248135,0.0,-198.69561,-198.683327
25%,-6.731937,15.3211,0.000346,0.0,-0.254396,0.207708
50%,-5.382519,19.0389,0.175463,0.0,0.110805,0.421494
75%,-4.076177,24.8237,0.717187,0.0,0.467793,0.713214
max,1122.552855,203.685,5.923581,18.437,1126.321181,1126.858389


In [4]:
oqmd_data.rename(columns={'comp':'composition'}, inplace=True)
oqmd_data[:2]

Unnamed: 0,composition,energy_pa,volume_pa,magmom_pa,bandgap,delta_e,stability
0,Cs1Ho1S4Si1,-5.353489,27.1652,6.9e-05,3.024,-1.60894,-0.064029
1,Lu1,-4.511592,28.7838,0.046445,0.0,0.01259,0.01259


In [5]:
oqmd_data.query('delta_e > -20 and delta_e < 5', inplace=True)
oqmd_data = oqmd_data[~oqmd_data['delta_e'].isnull()]
oqmd_data.shape

(619991, 7)

In [6]:
%%time
oqmd_data['comp_obj'] = oqmd_data['composition'].apply(lambda x: Composition(x))

CPU times: user 18.3 s, sys: 67.5 ms, total: 18.4 s
Wall time: 18.4 s


In [7]:
%%time
oqmd_data['pretty_comp'] = oqmd_data['comp_obj'].apply(lambda x: x.reduced_formula)


No electronegativity for Ne. Setting to NaN. This has no physical meaning, and is mainly done to avoid errors caused by the code expecting a float.


No electronegativity for He. Setting to NaN. This has no physical meaning, and is mainly done to avoid errors caused by the code expecting a float.


No electronegativity for Ar. Setting to NaN. This has no physical meaning, and is mainly done to avoid errors caused by the code expecting a float.



CPU times: user 41.9 s, sys: 30.3 ms, total: 41.9 s
Wall time: 41.9 s


In [8]:
%%time
oqmd_data.sort_values('delta_e', ascending=True, inplace=True)
oqmd_data.drop_duplicates('pretty_comp', keep='first', inplace=True)
print('Reduced dataset to %d entries'%len(oqmd_data))

Reduced dataset to 341708 entries
CPU times: user 703 ms, sys: 15.9 ms, total: 719 ms
Wall time: 717 ms


In [9]:
oqmd_data['nelems'] = oqmd_data['comp_obj'].apply(lambda x: len(x))

In [10]:
oqmd_data = oqmd_data[np.abs(oqmd_data.delta_e - oqmd_data.delta_e.mean()) <= (5*oqmd_data.delta_e.std())]
oqmd_data.shape

(341688, 10)

In [11]:
oqmd_data.groupby('nelems').count()

Unnamed: 0_level_0,composition,energy_pa,volume_pa,magmom_pa,bandgap,delta_e,stability,comp_obj,pretty_comp
nelems,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,88,88,88,41,88,88,88,88,88
2,16250,16250,16250,9669,16219,16250,16240,16250,16250
3,263175,263174,263174,166354,263069,263175,249351,263175,263175
4,60519,60518,60518,54599,60507,60519,55179,60519,60519
5,1429,1429,1429,744,1428,1429,1428,1429,1429
6,202,202,202,79,202,202,202,202,202
7,25,25,25,10,25,25,25,25,25


In [12]:
oqmd_data.query('nelems > 1', inplace=True)
print (oqmd_data.groupby('nelems').count())
oqmd_data.shape

        composition  energy_pa  volume_pa  magmom_pa  bandgap  delta_e  \
nelems                                                                   
2             16250      16250      16250       9669    16219    16250   
3            263175     263174     263174     166354   263069   263175   
4             60519      60518      60518      54599    60507    60519   
5              1429       1429       1429        744     1428     1429   
6               202        202        202         79      202      202   
7                25         25         25         10       25       25   

        stability  comp_obj  pretty_comp  
nelems                                    
2           16240     16250        16250  
3          249351    263175       263175  
4           55179     60519        60519  
5            1428      1429         1429  
6             202       202          202  
7              25        25           25  


(341600, 10)

In [13]:
%%time
oqmd_data['comp_dict'] = oqmd_data['pretty_comp'].apply(lambda x: data_utils.parse_formula(x))

CPU times: user 4.78 s, sys: 7.98 ms, total: 4.79 s
Wall time: 4.79 s


In [14]:
%%time
elements_present = set()
for i, row in oqmd_data.iterrows():
    elements_present |= set(row['comp_dict'].keys())
print (elements_present)

{'U', 'Sm', 'As', 'Ta', 'Gd', 'Sr', 'H', 'Pd', 'Ag', 'Rh', 'Si', 'Au', 'P', 'Ni', 'Mo', 'Hf', 'B', 'C', 'Sn', 'Nd', 'Cu', 'Ca', 'Ru', 'Zr', 'Ho', 'Th', 'Xe', 'S', 'Ti', 'La', 'Tm', 'Eu', 'Ba', 'Kr', 'In', 'Dy', 'Pr', 'Rb', 'Cs', 'Er', 'Nb', 'I', 'Se', 'Zn', 'Sc', 'Bi', 'Pu', 'F', 'Mn', 'Ce', 'Sb', 'Pb', 'Pt', 'Al', 'Lu', 'O', 'Ge', 'Tb', 'Na', 'K', 'Cd', 'N', 'Hg', 'Np', 'Pm', 'Ga', 'Be', 'Re', 'Ir', 'Li', 'Mg', 'Cl', 'Co', 'Br', 'Os', 'W', 'Cr', 'Ac', 'Te', 'Tc', 'Y', 'V', 'Yb', 'Pa', 'Tl', 'Fe'}
CPU times: user 23.6 s, sys: 15.6 ms, total: 23.6 s
Wall time: 23.6 s


In [15]:
elements = ['H', 'He', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Ne', 'Na', 'Mg', 'Al', 'Si', 'P', 'S', 'Cl', 'Ar', 'K', 
            'Ca', 'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn', 'Ga', 'Ge', 'As', 'Se', 'Br', 'Kr', 'Rb',
            'Sr', 'Y', 'Zr', 'Nb', 'Mo', 'Tc', 'Ru', 'Rh', 'Pd', 'Ag', 'Cd', 'In', 'Sn', 'Sb', 'Te', 'I', 'Xe', 'Cs', 
            'Ba', 'La', 'Ce', 'Pr', 'Nd', 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb', 'Lu', 'Hf', 'Ta',
            'W', 'Re', 'Os', 'Ir', 'Pt', 'Au', 'Hg', 'Tl', 'Pb', 'Bi', 'Po', 'At', 'Rn', 'Fr', 'Ra', 'Ac', 'Th', 'Pa',
            'U', 'Np', 'Pu', 'Am', 'Cm', 'Bk', 'Cf', 'Es', 'Fm', 'Md', 'No', 'Lr', 'Rf', 'Db', 'Sg', 'Bh', 'Hs', 'Mt',
            'Ds', 'Rg', 'Cn']


elements_tl = ['H', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Na', 'Mg', 'Al', 'Si', 'P', 'S', 'Cl', 'K',
 'Ca', 'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn', 'Ga', 'Ge', 'As', 'Se',
 'Br', 'Kr', 'Rb', 'Sr', 'Y', 'Zr', 'Nb', 'Mo', 'Tc', 'Ru', 'Rh', 'Pd', 'Ag', 'Cd', 'In',
 'Sn', 'Sb', 'Te', 'I', 'Xe', 'Cs', 'Ba', 'La', 'Ce', 'Pr', 'Nd', 'Pm', 'Sm', 'Eu', 'Gd',
 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb', 'Lu', 'Hf', 'Ta', 'W', 'Re', 'Os', 'Ir', 'Pt', 'Au',
 'Hg', 'Tl', 'Pb', 'Bi', 'Ac','Th', 'Pa', 'U', 'Np', 'Pu']

elem_pos = dict()
i=0
for el in elements:
   elem_pos[el] = i
   i+=1

In [16]:
print (elements_present, len(elements_present))
print ([e for e in elements_present if e not in elements_tl])
print ([e for e in elements_tl if e not in elements_present])

{'U', 'Sm', 'As', 'Ta', 'Gd', 'Sr', 'H', 'Pd', 'Ag', 'Rh', 'Si', 'Au', 'P', 'Ni', 'Mo', 'Hf', 'B', 'C', 'Sn', 'Nd', 'Cu', 'Ca', 'Ru', 'Zr', 'Ho', 'Th', 'Xe', 'S', 'Ti', 'La', 'Tm', 'Eu', 'Ba', 'Kr', 'In', 'Dy', 'Pr', 'Rb', 'Cs', 'Er', 'Nb', 'I', 'Se', 'Zn', 'Sc', 'Bi', 'Pu', 'F', 'Mn', 'Ce', 'Sb', 'Pb', 'Pt', 'Al', 'Lu', 'O', 'Ge', 'Tb', 'Na', 'K', 'Cd', 'N', 'Hg', 'Np', 'Pm', 'Ga', 'Be', 'Re', 'Ir', 'Li', 'Mg', 'Cl', 'Co', 'Br', 'Os', 'W', 'Cr', 'Ac', 'Te', 'Tc', 'Y', 'V', 'Yb', 'Pa', 'Tl', 'Fe'} 86
[]
[]


In [17]:
oqmd_data['comp_fractions'] = oqmd_data['comp_dict'].apply(lambda x: data_utils.get_fractions(x))

In [18]:
print (oqmd_data[:1])
#oqmd_data['solution_term'] = oqmd_data['comp_obj'].apply(lambda x: compute_mixing_term(x))
print (oqmd_data.shape)
oqmd_data = oqmd_data[~oqmd_data['comp_fractions'].isnull()]
print (oqmd_data.shape)

       composition  energy_pa  volume_pa  magmom_pa  bandgap   delta_e  \
251840   Cs1F10Lu3  -6.356016    15.3841  -0.000026    7.611 -4.284843   

        stability     comp_obj pretty_comp  nelems  \
251840  -0.411923  (Cs, F, Lu)    CsLu3F10       3   

                                                comp_dict  \
251840  {'Cs': 0.07142857142857142, 'Lu': 0.2142857142...   

                                           comp_fractions  
251840  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.71428573...  
(341600, 12)
(341600, 12)


In [19]:
for i,e in enumerate(elements_tl):
    oqmd_data[e] = [ x[i] for x in oqmd_data['comp_fractions']]
oqmd_data.shape

(341600, 98)

In [20]:
oqmd_data.columns

Index(['composition', 'energy_pa', 'volume_pa', 'magmom_pa', 'bandgap',
       'delta_e', 'stability', 'comp_obj', 'pretty_comp', 'nelems',
       'comp_dict', 'comp_fractions', 'H', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F',
       'Na', 'Mg', 'Al', 'Si', 'P', 'S', 'Cl', 'K', 'Ca', 'Sc', 'Ti', 'V',
       'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn', 'Ga', 'Ge', 'As', 'Se', 'Br',
       'Kr', 'Rb', 'Sr', 'Y', 'Zr', 'Nb', 'Mo', 'Tc', 'Ru', 'Rh', 'Pd', 'Ag',
       'Cd', 'In', 'Sn', 'Sb', 'Te', 'I', 'Xe', 'Cs', 'Ba', 'La', 'Ce', 'Pr',
       'Nd', 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb', 'Lu',
       'Hf', 'Ta', 'W', 'Re', 'Os', 'Ir', 'Pt', 'Au', 'Hg', 'Tl', 'Pb', 'Bi',
       'Ac', 'Th', 'Pa', 'U', 'Np', 'Pu'],
      dtype='object')

In [21]:
feature_calculators = MultipleFeaturizer([cf.Stoichiometry(), cf.ElementProperty.from_preset("magpie"),
                                          cf.ValenceOrbital(props=['avg']), cf.IonProperty(fast=True)])

In [22]:
feature_labels = feature_calculators.feature_labels()

In [23]:
print (feature_labels, len(feature_labels))

['0-norm', '2-norm', '3-norm', '5-norm', '7-norm', '10-norm', 'minimum Number', 'maximum Number', 'range Number', 'mean Number', 'avg_dev Number', 'mode Number', 'minimum MendeleevNumber', 'maximum MendeleevNumber', 'range MendeleevNumber', 'mean MendeleevNumber', 'avg_dev MendeleevNumber', 'mode MendeleevNumber', 'minimum AtomicWeight', 'maximum AtomicWeight', 'range AtomicWeight', 'mean AtomicWeight', 'avg_dev AtomicWeight', 'mode AtomicWeight', 'minimum MeltingT', 'maximum MeltingT', 'range MeltingT', 'mean MeltingT', 'avg_dev MeltingT', 'mode MeltingT', 'minimum Column', 'maximum Column', 'range Column', 'mean Column', 'avg_dev Column', 'mode Column', 'minimum Row', 'maximum Row', 'range Row', 'mean Row', 'avg_dev Row', 'mode Row', 'minimum CovalentRadius', 'maximum CovalentRadius', 'range CovalentRadius', 'mean CovalentRadius', 'avg_dev CovalentRadius', 'mode CovalentRadius', 'minimum Electronegativity', 'maximum Electronegativity', 'range Electronegativity', 'mean Electronegativi

In [24]:
%%time
feature_calculators.featurize_dataframe(oqmd_data, col_id='comp_obj');

HBox(children=(IntProgress(value=0, description='MultipleFeaturizer', max=341600, style=ProgressStyle(descript…


CPU times: user 29.6 s, sys: 1.9 s, total: 31.5 s
Wall time: 4min 16s


Unnamed: 0,composition,energy_pa,volume_pa,magmom_pa,bandgap,delta_e,stability,comp_obj,pretty_comp,nelems,...,mean SpaceGroupNumber,avg_dev SpaceGroupNumber,mode SpaceGroupNumber,avg s valence electrons,avg p valence electrons,avg d valence electrons,avg f valence electrons,compound possible,max ionic char,avg ionic char
251840,Cs1F10Lu3,-6.356016,15.3841,-0.000026,7.611,-4.284843,-0.411923,"(Cs, F, Lu)",CsLu3F10,3,...,68.642857,76.632653,15.0,1.928571,3.571429,0.214286,3.000000,True,0.921450,0.176525
240464,F3Tb1,-6.695208,11.7933,0.000322,8.317,-4.280576,-3.306395,"(F, Tb)",TbF3,2,...,59.750000,67.125000,15.0,2.000000,3.750000,0.000000,2.250000,True,0.874268,0.163925
270658,Er3F10K1,-6.367075,12.6559,0.000001,7.353,-4.270220,-0.027914,"(Er, F, K)",KEr3F10,3,...,68.642857,76.632653,15.0,1.928571,3.571429,0.000000,2.571429,True,0.917619,0.177111
249661,F3Sc1,-6.944619,15.7811,0.000013,6.597,-4.269834,-3.171397,"(F, Sc)",ScF3,2,...,59.750000,67.125000,15.0,2.000000,3.750000,0.250000,0.000000,True,0.820234,0.153794
527724,F10Lu3Rb1,-6.346997,14.1302,-0.000014,7.371,-4.268079,-0.391680,"(F, Lu, Rb)",RbLu3F10,3,...,68.642857,76.632653,15.0,1.928571,3.571429,0.214286,3.000000,True,0.917619,0.176228
483610,F9Rb1Th2,-6.670073,15.0591,-0.000028,6.783,-4.261621,-0.022216,"(F, Rb, Th)",RbTh2F9,3,...,67.833333,79.250000,15.0,1.916667,3.750000,0.333333,0.000000,True,0.917619,0.162375
250859,O2Th1,-9.367937,14.9368,-0.000087,4.867,-4.236914,-1.191793,"(O, Th)",ThO2,2,...,83.000000,94.666667,12.0,2.000000,2.666667,0.666667,0.000000,True,0.681744,0.151499
270266,F10K1Y3,-6.739374,13.2561,-0.000034,7.122,-4.235145,-0.177026,"(F, K, Y)",KY3F10,3,...,68.642857,76.632653,15.0,1.928571,3.571429,0.214286,0.000000,True,0.917619,0.177686
265231,F9Na1Th2,-6.656020,13.2539,0.000015,6.719,-4.227890,-0.005018,"(F, Na, Th)",NaTh2F9,3,...,67.833333,79.250000,15.0,1.916667,3.750000,0.333333,0.000000,True,0.902278,0.161106
1028,Ca1F2,-5.849962,12.9555,,7.800,-4.219407,-0.945717,"(Ca, F)",CaF2,2,...,85.000000,93.333333,15.0,2.000000,3.333333,0.000000,0.000000,True,0.891402,0.198089


In [25]:
oqmd_data[feature_labels].shape

(341600, 145)

In [26]:
oqmd_data = oqmd_data[~oqmd_data[feature_labels].isnull().any(axis=1)]
print (oqmd_data.shape)

(341600, 243)


In [27]:
oqmd_data.describe()

Unnamed: 0,energy_pa,volume_pa,magmom_pa,bandgap,delta_e,stability,nelems,H,Li,Be,...,range SpaceGroupNumber,mean SpaceGroupNumber,avg_dev SpaceGroupNumber,mode SpaceGroupNumber,avg s valence electrons,avg p valence electrons,avg d valence electrons,avg f valence electrons,max ionic char,avg ionic char
count,341598.0,341598.0,231455.0,341450.0,341600.0,322425.0,341600.0,341600.0,341600.0,341600.0,...,341600.0,341600.0,341600.0,341600.0,341600.0,341600.0,341600.0,341600.0,341600.0,341600.0
mean,-5.499689,22.038643,0.418654,0.140866,0.004932,0.485025,3.140026,0.003202,0.015583,0.013491,...,80.237114,184.003058,31.59719,164.049081,1.781546,0.625483,3.993334,3.594312,0.215172,0.039228
std,1.94036,7.951881,0.604427,0.676178,0.857872,1.110661,0.477863,0.038269,0.071415,0.067315,...,70.570815,40.193282,28.964286,72.217995,0.25957,0.837636,2.547032,3.351187,0.200724,0.039583
min,-13.575205,4.14911,-4.248135,0.0,-4.284843,-3.306395,2.0,0.0,0.0,0.0,...,0.0,4.25,0.0,2.0,0.235294,0.0,0.0,0.0,0.0,0.0
25%,-6.824986,16.3004,0.000264,0.0,-0.262579,0.182297,3.0,0.0,0.0,0.0,...,31.0,169.75,12.625,141.0,1.666667,0.0,2.0,0.0,0.072836,0.0121
50%,-5.421248,20.6516,0.161911,0.0,0.114121,0.39575,3.0,0.0,0.0,0.0,...,53.0,197.75,17.5,194.0,1.777778,0.25,3.75,3.5,0.15821,0.027507
75%,-4.091805,26.2049,0.711843,0.0,0.473441,0.670759,3.0,0.0,0.0,0.0,...,132.0,211.25,49.5,225.0,2.0,1.0,6.0,6.0,0.2774,0.050073
max,0.67297,203.685,5.035564,10.34,4.061899,198.836034,7.0,0.888889,0.833333,0.928571,...,227.0,229.0,113.5,229.0,2.0,5.333333,10.0,14.0,0.92145,0.230363


In [28]:
oqmd_data = oqmd_data[~oqmd_data['delta_e'].isnull()]
oqmd_data.shape

(341600, 243)

In [29]:
oqmd_data = oqmd_data[~oqmd_data['volume_pa'].isnull()]
oqmd_data.shape

(341598, 243)

In [30]:
oqmd_data = oqmd_data[~oqmd_data['bandgap'].isnull()]
oqmd_data.shape

(341450, 243)

In [31]:
properties = set(oqmd_data.columns.tolist()) - (set(feature_labels) | set(elements_present))
print (properties)

{'nelems', 'comp_fractions', 'bandgap', 'volume_pa', 'comp_obj', 'composition', 'magmom_pa', 'energy_pa', 'stability', 'delta_e', 'comp_dict', 'pretty_comp'}


In [32]:
for col in ['composition', 'comp_fractions', 'comp_dict', 'nelems', 'comp_obj']: oqmd_data.drop(col, axis=1, inplace=True)
oqmd_data.shape

(341450, 238)

In [33]:
properties = set(oqmd_data.columns.tolist()) - (set(feature_labels) | set(elements_present) |{'pretty_comp'})
print (properties)

{'bandgap', 'volume_pa', 'magmom_pa', 'energy_pa', 'stability', 'delta_e'}


In [34]:
oqmd_data = oqmd_data[['pretty_comp']+elements_tl+list(feature_labels)+list(properties)]

In [35]:
print (oqmd_data.columns)
print (oqmd_data.shape)

Index(['pretty_comp', 'H', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Na',
       ...
       'avg f valence electrons', 'compound possible', 'max ionic char',
       'avg ionic char', 'bandgap', 'volume_pa', 'magmom_pa', 'energy_pa',
       'stability', 'delta_e'],
      dtype='object', length=238)
(341450, 238)


In [36]:
train_data, test_data = train_test_split(oqmd_data, test_size=0.1, random_state=1234567)

In [37]:
train_data.shape, test_data.shape

((307305, 238), (34145, 238))

In [39]:
train_data_fract = train_data[['pretty_comp']+elements_tl+list(properties)]
test_data_fract = test_data[['pretty_comp']+elements_tl+list(properties)]
train_data_phys = train_data[['pretty_comp']+list(feature_labels)+list(properties)]
test_data_phys = test_data[['pretty_comp']+list(feature_labels)+list(properties)]
print (train_data_fract.shape, test_data_fract.shape, train_data_phys.shape, test_data_phys.shape)

(307305, 93) (34145, 93) (307305, 152) (34145, 152)


In [40]:
train_data.to_csv('../training-data/train_set.csv', index=False)
train_data_fract.to_csv('../training-data/train_fract_set.csv', index=False)
train_data_phys.to_csv('../training-data/train_phys_set.csv', index=False)
test_data.to_csv('../training-data//test_set.csv', index=False)
test_data_fract.to_csv('../training-data/test_fract_set.csv', index=False)
test_data_phys.to_csv('../training-data/test_phys_set.csv', index=False)