# In [1]:
"""
Required libraries:
 - numpy
 - pandas
 - warnings

Usage:
 - Fill in the file paths in the script.
 - Run the script.

!!! WARNING !!!
For the GNSS and CO2 data, select the whole content of the files
on WebObs, then paste it into a txt file.
No modification is needed!

For the seismicity, ONLY rename the deep-seismicity file
by appending '_deep' to its name!

Gaps are filled by linear interpolation!

There is no extrapolation!

"""

import numpy as np
import pandas as pd
import warnings

# Silence every warning raised while the script runs.
warnings.filterwarnings("ignore")

# Input files, grouped by kind of observation.

# Deformation (GNSS baseline) text exports.
deformation = [
	'./GNSS/BASELINES_GAMIT_BOMG_all.txt',
	'./GNSS/BASELINES_GAMIT_FOAG_all.txt',
	'./GNSS/BASELINES_GAMIT_PRAG_all.txt',
	'./GNSS/BASELINES_GIPSYX_SNEG_all.txt',
]

# Daily seismicity counts; a path containing 'deep' is treated as the
# deep-seismicity series by the import loop.
sismicite = [
	'./SISMO/MC3_dump_daily_total.csv',
	'./SISMO/MC3_dump_daily_total_deep.csv',
]

# Geochemistry (CO2) text exports.
geochimie = [
	'./CO2/RCSPCRN_all.txt',
]

# Import and format the deformation files.
#
# Accumulators shared with the import loops and the merge step further down:
#   n_cols      -- running count of data columns (everything except 'date')
#   files       -- one formatted DataFrame per input file
#   start, stop -- first / last date found in each input file
n_cols = 0
files = []
start, stop = [], []
for path in deformation:
	# The context manager closes the file even if reading raises.
	with open(path, 'r') as handle:
		# The first 12 lines are skipped; the next one holds the column names.
		lines = handle.readlines()[12:]

	# Header: drop the first character (presumably a '#' comment marker --
	# confirm against the export), then split on whitespace.
	header = lines[0][1:].split()

	# Whitespace splitting also copes with a final line that has no trailing
	# newline (slicing off the last character would truncate its last value);
	# blank lines are ignored.
	rows = [line.split() for line in lines[1:] if line.strip()]

	df = pd.DataFrame(data=np.array(rows, dtype=float), columns=header)

	# Build the calendar date from the year / month / day columns; the
	# HH / MM / SS columns are not used.
	df['date'] = pd.to_datetime({'year': df['yyyy'],
								 'month': df['mm'],
								 'day': df['dd']})
	start.append(df['date'].min())
	stop.append(df['date'].max())

	# Only the measurements and the assembled date are kept.
	df = df.drop(columns=['yyyy', 'mm', 'dd', 'HH', 'MM', 'SS'])
	n_cols += len(df.columns) - 1

	files.append(df)

# Import and format the seismicity files.
raw_date_col = '#YYYY-mm-dd Daily_Total(#)'
for path in sismicite:
	df = pd.read_csv(path, skiprows=2, sep=';')

	# Rewrite each date string as 'YYYY/mm/dd' (characters 4 and 7 are
	# replaced by '/'), then parse it.
	df[raw_date_col] = [
		text[:4] + '/' + text[5:7] + '/' + text[8:]
		for text in df[raw_date_col]
	]
	df['date'] = pd.to_datetime(df[raw_date_col], format='%Y/%m/%d')
	start.append(np.min(df['date']))
	stop.append(np.max(df['date']))

	# The daily count is stored under a name that depends on whether the
	# path contains 'deep'.
	count_name = 'sismo_daily_deep' if 'deep' in path else 'sismo_daily'
	df[count_name] = df['Daily_Count']

	# Drop the raw columns; daily moment and energy are not kept.
	df = df.drop(columns=[raw_date_col,
						  'Daily_Count',
						  'Daily_Moment(N.m)',
						  'Daily_Energy(J)'])
	n_cols += len(df.columns) - 1

	files.append(df)

# Import and format the geochemistry files.
for path in geochimie:
	# The context manager closes the file even if reading raises.
	with open(path, 'r') as handle:
		# The first 12 lines are skipped; the next one holds the column names.
		lines = handle.readlines()[12:]

	# Several column names contain spaces, so the header cannot be split on
	# whitespace: it is sliced at fixed character offsets instead.
	# NOTE(review): these offsets are tied to the exact header layout of the
	# export -- re-check them if the export format changes.
	head = lines[0].rstrip('\n')
	bounds = [(1, 5), (6, 8), (9, 11), (12, 14), (15, 17), (18, 20),
			  (21, 35), (36, 51), (52, 70), (71, 90), (91, 110), (111, 129)]
	header = [head[lo:hi] for lo, hi in bounds] + [head[130:]]

	# Data lines are space-separated numbers. Whitespace splitting also copes
	# with a final line that has no trailing newline (slicing off the last
	# character would truncate its last value); blank lines are ignored.
	rows = [line.split() for line in lines[1:] if line.strip()]

	df = pd.DataFrame(data=np.array(rows, dtype=float), columns=header)

	# Build the calendar date from the year / month / day columns; the
	# HH / MM / SS columns are not used.
	df['date'] = pd.to_datetime({'year': df['yyyy'],
								 'month': df['mm'],
								 'day': df['dd']})
	start.append(df['date'].min())
	stop.append(df['date'].max())

	# Drop the time fields and every CO2 series listed below; whatever is
	# left (plus 'date') goes into the merged table.
	df = df.drop(columns=['yyyy', 'mm', 'dd', 'HH', 'MM', 'SS',
						  'CO2 raw(% mol)',
						  'CO2 filter7(% mol)',
						  'CO2 filter15(% mol)',
						  'CO2 filter31(% mol)',
						  'CO2 filterSG 1(au)',
						  'CO2 filterSG 2(au)'])
	n_cols += len(df.columns) - 1

	files.append(df)

# Merge every series into a single DataFrame on a common daily time axis.
#
# The axis runs from the earliest first date to the latest last date found
# in the inputs; `cutt` is the earliest last date and is used further down
# to truncate the result.
start, stop, cutt = np.min(start), np.max(stop), np.min(stop)
all_in_one = pd.DataFrame()
interpol_bool = pd.DataFrame()
timing = pd.date_range(start, stop, freq='D').to_numpy().astype('datetime64[D]')
all_in_one['date'] = timing
interpol_bool['date'] = timing

# Integer day number of the first date on the axis. Rows are addressed
# relative to it, so no scratch rows are allocated for the days that precede
# the first observation.
starting = timing.astype(int)[0]
for file in files:
	day_numbers = file['date'].to_numpy().astype('datetime64[D]').astype(int)
	# When a day occurs several times, only its first row is kept.
	uniq_values, uniq_index = np.unique(day_numbers, return_index=True)
	rows = uniq_values - starting

	# NOTE(review): columns are stored under their own name, so two files
	# that share a column name overwrite each other -- confirm that the
	# exported headers are unique across files.
	for col in [name for name in file.columns if name != 'date']:
		values = np.full(len(timing), np.nan)
		values[rows] = file[col].to_numpy()[uniq_index]
		all_in_one[col] = values
		interpol_bool[col] = np.isnan(values)

# Keep only the dates up to the earliest last date of the inputs.
keep = all_in_one['date'] <= cutt
interpol_bool = interpol_bool[keep]
all_in_one = all_in_one[keep]

# Save the map of values that were missing before interpolation.
interpol_bool.to_csv('interpolation_position.csv', index=False)

# Linear interpolation of the gaps. limit_area='inside' restricts the fill to
# NaNs surrounded by valid values; without it, NaNs after the last valid
# value of a column are filled with that value, which is an extrapolation.
all_in_one = all_in_one.interpolate(method='linear', limit_direction='forward',
									limit_area='inside', axis=0)

# Save the data file.
all_in_one.to_excel('data_in.xlsx', index=False)