In [None]:
'''Use minor manipulations to create synthetic data for 
developing the models. Manipulations are:

DS2 - Downsample the original spectra by a factor of 2
DS4 - Downsample the original spectra by a factor of 4
RAN - Add Gaussian random noise (mean 0, std 5) to all features
DS2RAN - RAN applied to DS2
DS4RAN - RAN applied to DS4
'''
#import pd, np, signal for data processing
import pandas as pd
import numpy as np
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:

def trim_df(df):
  """Strip every column whose label is not a number, modifying ``df`` in place.

  The index is first moved into the columns with ``reset_index``. Each column
  label is then passed to ``float()``; labels that raise ``ValueError`` are
  dropped. The resulting shape and the first rows are printed.

  Args:
    df: pandas DataFrame to trim. It is mutated by this call.

  Returns:
    The same DataFrame object that was passed in, after trimming.
  """
  def _parses_as_float(label):
    #True when float() accepts the label, False when it raises ValueError
    try:
      float(label)
    except ValueError:
      return False
    return True

  #move the current index into the columns so it is screened like the rest
  df.reset_index(inplace=True)

  #collect the labels float() rejects, then remove them in a single call
  non_numeric=[label for label in df.columns.values if not _parses_as_float(label)]
  df.drop(columns=non_numeric,inplace=True)

  for summary in (df.shape,df.head()):
    print(summary)
  return df

In [None]:
def create_ds_files(df,path,f_name):
  """Write 2x and 4x column-downsampled copies of ``df`` to CSV files.

  Every 2nd column (starting with the first) forms the DS2 frame and every
  4th column forms the DS4 frame. Each is saved as
  ``path + <stem> + '_ds2.csv'`` / ``'_ds4.csv'``, where ``<stem>`` is
  ``f_name`` with only its final extension removed.

  Args:
    df: pandas DataFrame whose columns are downsampled.
    path: output location prefix; it is concatenated directly with the file
      name, so it must end with a path separator.
    f_name: name of the source file the outputs are named after.

  Returns:
    Tuple ``(ds2_df, ds4_df)`` of the two downsampled DataFrames.
  """
  from os.path import splitext

  #Downsample by factors of 2 and 4 on original data. Slicing by position
  #(not by label) keeps exactly every 2nd/4th column even if labels repeat;
  #copy() hands back independent frames rather than views of df.
  ds2_df=df.iloc[:,::2].copy()
  ds4_df=df.iloc[:,::4].copy()

  #strip only the last extension so names containing extra dots keep their
  #full stem and do not collide with each other
  stem=splitext(f_name)[0]

  #output to files
  ds2_new_file=path+stem+'_ds2'+'.csv'
  ds4_new_file=path+stem+'_ds4'+'.csv'
  ds2_df.to_csv(ds2_new_file)
  ds4_df.to_csv(ds4_new_file)
  print(ds2_new_file,'created')
  print(ds4_new_file,'created')
  return ds2_df,ds4_df

In [None]:
def create_r_files(df,ds2_df,ds4_df,path,f_name,seed=None):
  """Add Gaussian noise to the three input frames and write each to CSV.

  Noise is drawn from a normal distribution with mean 0 and standard
  deviation 5, independently for every cell, in the order ``df``,
  ``ds2_df``, ``ds4_df``. The results are saved as
  ``path + <stem> + '_r.csv'``, ``'_r_ds2.csv'`` and ``'_r_ds4.csv'``,
  where ``<stem>`` is ``f_name`` with only its final extension removed.

  Args:
    df: full-resolution pandas DataFrame.
    ds2_df: DataFrame downsampled by 2.
    ds4_df: DataFrame downsampled by 4.
    path: output location prefix; it is concatenated directly with the file
      name, so it must end with a path separator.
    f_name: name of the source file the outputs are named after.
    seed: optional seed for reproducible noise. When None (the default) the
      global NumPy random state is used, exactly as before.

  Returns:
    None. The noisy frames are only written to disk.
  """
  from os.path import splitext

  #np.random and RandomState expose the same normal() call, so the default
  #path keeps drawing from the global state while a seed gives repeatability
  rng=np.random if seed is None else np.random.RandomState(seed)

  #Random noise added to data (random values on a gaussian distribution, mu=0, std=5)
  ran_df=df+rng.normal(0,5,df.shape)
  ran_ds2_df=ds2_df+rng.normal(0,5,ds2_df.shape)
  ran_ds4_df=ds4_df+rng.normal(0,5,ds4_df.shape)

  #strip only the last extension so names containing extra dots keep their
  #full stem and do not collide with each other
  stem=splitext(f_name)[0]

  #output to files
  ran_new_file=path+stem+'_r'+'.csv'
  rds2_new_file=path+stem+'_r_ds2'+'.csv'
  rds4_new_file=path+stem+'_r_ds4'+'.csv'
  ran_df.to_csv(ran_new_file)
  ran_ds2_df.to_csv(rds2_new_file)
  ran_ds4_df.to_csv(rds4_new_file)
  print(ran_new_file,'created')
  print(rds2_new_file,'created')
  print(rds4_new_file,'created\n\n\n')

In [None]:
def remove_repeat(file_ls,svd_fil):
  """Filter out file names that are already listed in the history file.

  The history file is read as a header-less CSV whose first column is an
  index; every remaining cell is treated as a previously processed name.

  Args:
    file_ls: list of candidate file names.
    svd_fil: path of the history CSV.

  Returns:
    Tuple ``(names, history_found)``. ``names`` holds the entries of
    ``file_ls`` not present in the history, in their original order.
    ``history_found`` is False when the history file is missing or empty
    (``file_ls`` is then returned unchanged) and True otherwise.
  """
  #try to read in the saved data
  try:
    #read_csv no longer accepts squeeze=True (removed in pandas 2.0), so the
    #cells are flattened explicitly instead
    saved_vals=pd.read_csv(svd_fil,index_col=0,header=None).values.ravel()
  except FileNotFoundError:
    print('History file not found, all data files being processed')
    return file_ls,False
  except pd.errors.EmptyDataError:
    #a zero-byte history file carries no entries, treat it like a missing one
    print('History file is empty, all data files being processed')
    return file_ls,False

  #keep only the names that do not appear in the history; dtype=object keeps
  #an empty candidate list from being inferred as a numeric Series
  temp_s=pd.Series(file_ls,dtype=object)
  return temp_s[~temp_s.isin(saved_vals)].tolist(),True

In [None]:
def main(fin_path=r'/content/drive/My Drive/ML Spectroscopy/Data/Raw Data/',
         out_path=r'/content/drive/My Drive/ML Spectroscopy/Data/Synthetic Data/Basic Synthetic Data/',
         saved_data=r'/content/drive/My Drive/ML Spectroscopy/Programs/Data Processing/Saved Lists/Synth_created.csv'):
  """Create the synthetic variants for every raw file not yet processed.

  Lists the regular files in ``fin_path``, removes those already recorded in
  the ``saved_data`` history via ``remove_repeat``, and for each remaining
  file: reads it (whitespace-delimited for ``.txt``, comma-delimited
  otherwise), trims non-numeric columns with ``trim_df``, and writes the
  downsampled and noisy variants with ``create_ds_files`` and
  ``create_r_files``. The processed names are then written to the history
  file (appended when a history was found, otherwise a new file is written).

  Args:
    fin_path: directory holding the raw input files.
    out_path: output location prefix handed to the file-writing helpers.
    saved_data: path of the CSV listing already processed file names.

  Returns:
    None.
  """
  from os import listdir
  from os.path import isfile, join

  #Find all files in data directory, then drop the ones handled earlier
  fname_ls = [f for f in listdir(fin_path) if isfile(join(fin_path, f))]
  fname_ls,fflag=remove_repeat(file_ls=fname_ls,svd_fil=saved_data)

  if not fname_ls:
    print('No new files')
    return

  for fil in fname_ls:
    #import data from CSV files; .txt inputs are whitespace separated.
    #sep=r'\s+' is the documented replacement for the deprecated
    #delim_whitespace=True flag.
    if fil.split('.')[-1]=='txt':
      df=pd.read_csv(join(fin_path,fil),sep=r'\s+')
    else:
      df=pd.read_csv(join(fin_path,fil))

    df=trim_df(df=df)

    ds2_df,ds4_df=create_ds_files(df=df,path=out_path,f_name=fil)

    create_r_files(df=df,ds2_df=ds2_df,ds4_df=ds4_df,path=out_path,f_name=fil)

  #header=False keeps the history file header-less, matching the header=None
  #read in remove_repeat, so no stray header row is stored on each run
  pd.Series(fname_ls).to_csv(saved_data,mode='a' if fflag else 'w',header=False)

main()

(4839, 2183)
   100.473  101.341  102.207  103.075  ...  1798.18  1798.88  1799.59  1800.29
0  1127.22  1136.45  1151.61  1163.44  ...  1156.03  1152.88  1146.21  1139.40
1  1143.00  1106.99  1138.98  1171.41  ...  1116.88  1118.86  1120.04  1117.67
2  1115.56  1125.27  1132.21  1139.89  ...  1139.65  1140.61  1140.97  1140.66
3  1121.00  1121.99  1113.99  1146.99  ...  1107.61  1107.06  1107.59  1112.55
4  1499.90  1495.82  1514.32  1533.66  ...  2304.71  2301.29  2279.83  2278.02

[5 rows x 2183 columns]
/content/drive/My Drive/ML Spectroscopy/Data/Synthetic Data/Basic Synthetic Data/granite0dust_test_015s_4839_ds2.csv created
/content/drive/My Drive/ML Spectroscopy/Data/Synthetic Data/Basic Synthetic Data/granite0dust_test_015s_4839_ds4.csv created
/content/drive/My Drive/ML Spectroscopy/Data/Synthetic Data/Basic Synthetic Data/granite0dust_test_015s_4839_r.csv created
/content/drive/My Drive/ML Spectroscopy/Data/Synthetic Data/Basic Synthetic Data/granite0dust_test_015s_4839_r_ds2.