# All steps in order from raw to processed

In [5]:
import math
import pickle
import re
import sys
from datetime import datetime, timedelta
from os import listdir
from os.path import isdir, isfile, join

sys.path.append("..")


import matplotlib.pyplot as plt
import numpy as np
import numpy_indexed as npi
import pandas as pd
from GP.tools import *

import numpy as np; np.random.seed(0)
import seaborn as sns; sns.set_theme()

# Functions

In [11]:
def get_all_paths(root="../Canbus_data/raw"):
    """Collect the paths of all raw files below ``root``.

    The expected layout is ``root/<year>/<subfolder>/<file>``. Years,
    subfolders and the files of each subfolder are visited in sorted order.

    Parameters
    ----------
    root : str, optional
        Directory that holds one folder per year. Defaults to the raw data
        folder used by this notebook.

    Returns
    -------
    tables : list of list of str
        One list per year folder with the paths of all files of that year.
    tables_list : list of str
        All paths of ``tables`` flattened into one list.
    """
    # Only directories can be descended into; stray files next to the year
    # folders are skipped instead of making ``listdir`` fail on them.
    years = sorted(entry for entry in listdir(root) if isdir(join(root, entry)))
    tables = []
    for year in years:
        # Paths are concatenated with "/" so the returned strings keep the
        # exact format produced by the previous implementation.
        year_dir = root + "/" + year
        subfolders = sorted(entry for entry in listdir(year_dir) if isdir(join(year_dir, entry)))
        year_files = []
        for subfolder in subfolders:
            sub_dir = year_dir + "/" + subfolder
            year_files.extend(
                sorted(sub_dir + "/" + entry for entry in listdir(sub_dir) if isfile(join(sub_dir, entry)))
            )
        tables.append(year_files)
    tables_list = [path for year_files in tables for path in year_files]
    return tables, tables_list

In [None]:
def load_ts(ts_name):
    """Build the full time series of the column ``ts_name``.

    Steps performed:
      1. Load the rounded timeline, the mapping of wrongly named columns and
         the header file listing all column names.
      2. Read the main table of the column (``table_<index>.csv``, where
         ``index`` is the position of ``ts_name`` in the header file).
      3. Read every wrongly named column mapped to ``ts_name``, pass its
         timestamps through ``round_time`` and merge it into the main
         column: wherever the additional column holds a value, that value
         replaces the one of the main column.
      4. Spread the result with ``spread_time`` and drop duplicated
         timestamps (the first row of each timestamp is kept).

    Progress and a few statistics are printed along the way.

    Parameters
    ----------
    ts_name : str
        Column name as it appears in the header file.

    Returns
    -------
    The object returned by ``spread_time`` (presumably a DataFrame with the
    columns ``"t[s]"`` and the data column -- confirm against GP.tools),
    without duplicated timestamps.

    Raises
    ------
    AssertionError
        If an additional column contains a timestamp that is not part of the
        rounded timeline.
    """
    print("building timeseries: " + str(ts_name))
    print("loading rounded timestamps...")
    # NOTE: pickle files must come from a trusted source (here: earlier notebook steps).
    with open("raw_stats/timeLineRounded.p", "rb") as handle:
        time = pickle.load(handle)
    print("loading columns information...")
    with open("raw_stats/mappingWrongColumns.p", "rb") as handle:
        wrongColumns = pickle.load(handle)
    allColumns = list(pd.read_csv("Canbus_Data/Complete-HeaderFile_extendedBasedOn20173020605_V4.csv", encoding= 'unicode_escape').columns)
    print("loading main data file...")
    data = pd.read_csv("data_transform/restructured_raw/table_" + str(allColumns.index(ts_name)) + ".csv")

    extra_names = wrongColumns[ts_name]
    addData = []
    for name in extra_names:
        print("loading additional column: " + str(name) + "...")
        frame = pd.read_csv("data_transform/restructured_raw/fColumns/" + str(name) + ".csv" , encoding= 'unicode_escape')
        frame["t[s]"] = pd.to_datetime(round_time(frame["t[s]"]))
        addData.append(frame)

    # Vectorised NaN count instead of summing the boolean values one by one
    missingVals = np.isnan(data[data.columns[0]]).sum()
    print("Data has " + str(missingVals) + " rows that are  empty")

    # The timeline is turned into a set once, not once per additional column
    known_times = set(time) if addData else None
    for name, frame in zip(extra_names, addData):
        unknown_times = set(frame["t[s]"]) - known_times
        assert not unknown_times, "Unknown time was discovered in additional column"
        nonEmpty = len(frame) - np.isnan(frame[frame.columns[1]]).sum()
        print("Additional column " + str(name) + "has " + str(nonEmpty) + " non empty values")

    # Name of the data column; taken before the time column is attached
    mergeOut = data.columns[0]
    data["t[s]"] = pd.to_datetime(time)
    for name, frame in zip(extra_names, addData):   # should be tested for multiple column inserts!
        data = data.merge(frame, left_on="t[s]", right_on="t[s]", how="outer")
        # Values of the additional column win wherever they are present
        data[mergeOut] = np.where(np.isnan(data[name]), data[mergeOut], data[name])

    missingVals_2 = np.isnan(data[mergeOut]).sum()
    print("Data has " + str(missingVals_2) + " rows that are empty after merch (" + str(missingVals - missingVals_2) 
          + " values were added through wrong column merges).") 
    print("Spreading time...")
    data = spread_time(data[["t[s]",mergeOut]])
    print("len after merging and spreading: " + str(len(data)))
    print("dropping duplicate timestamps...")
    data = data.drop_duplicates(subset="t[s]", ignore_index=True)
    print("len after dropping: " + str(len(data)))
    print("Loading done.")
    return data

def load_277_278(ts_name):   # Bug catch: these variables only exist under wrong names in the original data
    """Build the time series of a column that exists only under wrong names.

    No main table is read. The series is assembled from the first two
    wrongly named columns mapped to ``ts_name``: both are outer-merged onto
    the rounded timeline and the second column takes precedence wherever it
    holds a value.

    Parameters
    ----------
    ts_name : str
        Key into the wrong-column mapping; also used as the name of the
        resulting data column.

    Returns
    -------
    The object returned by ``spread_time`` (presumably a DataFrame with the
    columns ``"t[s]"`` and ``ts_name`` -- confirm against GP.tools), without
    duplicated timestamps.

    Raises
    ------
    IndexError
        If fewer than two additional columns are mapped to ``ts_name``.
    """
    # NOTE: pickle files must come from a trusted source (here: earlier notebook steps).
    with open("raw_stats/timeLineRounded.p", "rb") as handle:
        time = pickle.load(handle)
    with open("raw_stats/mappingWrongColumns.p", "rb") as handle:
        wrongColumns = pickle.load(handle)
    data = pd.DataFrame(time, columns = ["t[s]"])
    addData = []
    for name in wrongColumns[ts_name]:
        print("loading additional column: " + str(name) + "...")
        frame = pd.read_csv("data_transform/restructured_raw/fColumns/" + str(name) + ".csv" , encoding= 'unicode_escape')
        # Timestamps are parsed and floored to the full minute
        frame["t[s]"] = pd.to_datetime(frame["t[s]"], format="%d.%m.%Y %H:%M:%S.%f").dt.floor('Min')
        addData.append(frame)
    data = data.merge(addData[0], left_on="t[s]", right_on="t[s]", how="outer")
    data = data.merge(addData[1], left_on="t[s]", right_on="t[s]", how="outer")
    # Columns 1 and 2 are the two merged value columns; the second one wins where present
    first_values = data[data.columns[1]]
    second_values = data[data.columns[2]]
    data[ts_name] = np.where(np.isnan(second_values), first_values, second_values)
    data = spread_time(data[["t[s]",ts_name]])
    # The length is reported before de-duplication so that the two messages
    # below describe different states of the data.
    print("len after merging and spreading: " + str(len(data)))
    print("dropping duplicate timestamps...")
    data = data.drop_duplicates(subset="t[s]", ignore_index=True)
    print("len after dropping: " + str(len(data)))
    print("Loading done.")
    return data

# Basics

In [15]:
# All paths: `tables` groups the raw file paths per year, `tables_list` is the same paths as one flat list
tables,tables_list = get_all_paths()

In [33]:
# Names of all columns, read from the header row of the complete header file.
# NOTE(review): load_ts reads this file via "Canbus_Data/..." while this cell uses
# "../Canbus_data/..." -- confirm which working directory and spelling are intended.
allColumns = list(pd.read_csv("../Canbus_data/Complete-HeaderFile_extendedBasedOn20173020605_V4.csv", encoding= 'unicode_escape').columns)

## Transform the data into complete timeseries and correct time differences

In [5]:
# Columns handled in this pass: the first column of the header file plus columns 1-49.
which = np.concatenate(([allColumns[0]], allColumns[1:50]))    # The loading is split into multiple blocks of columns because of RAM limits

In [6]:
# Load a bundle of columns from every raw file.
# `container` gets exactly one entry per path of `tables_list`, in the same
# order, so positions in both lists refer to the same file.
missing = []      # paths of the files that could not be loaded
container = []
counter = 0
for counter, path in enumerate(tables_list, start=1):
    if counter % 1000 == 0:
        print(counter)
    try:
        data = pd.read_csv(path, encoding='unicode_escape')
        # Keep only the requested columns that this file actually contains
        toParse = [column for column in which if column in data.columns]
        container.append(data[toParse])
    except Exception:
        # Best effort: an unreadable file must not stop the run, but it is
        # recorded instead of vanishing silently. Catching Exception (not a
        # bare except) keeps Ctrl-C / kernel interrupts working.
        missing.append(path)
        container.append([])

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000


In [7]:
# Swap in the corrected tables where the time difference between the file name
# and the first index is too high.
# NOTE: pickle files must come from a trusted source (here: earlier notebook steps).
with open("raw_stats/corrected_list.p", "rb") as handle:
    timeProblems = pickle.load(handle)
for path in timeProblems:
    # The position in `tables_list` is also the position in `container`
    where = tables_list.index(path)
    new = pd.read_csv("data_transform/time_changes/" + path.split("/")[-1], encoding='unicode_escape')
    toParse = [column for column in which if column in new.columns]
    container[where] = new[toParse]

In [15]:
# Entries without any rows are replaced by an empty DataFrame so that every
# element of `container` is a DataFrame
for position, entry in enumerate(container):
    if len(entry) == 0:
        container[position] = pd.DataFrame([])

In [18]:
# Build a single table out of the container by stacking all of its entries
table = pd.concat(container)

In [19]:
# (rows, columns) of the concatenated table
table.shape

(8522674, 50)

In [20]:
# Write every data column of this bundle to its own CSV file.
# which[0] (the first header column) is skipped. The file number is the
# position of the column in `allColumns`, which is the number load_ts uses to
# find the file again. Iterating over the columns themselves avoids the former
# extra pass that selected an empty slice and wrote an empty table_50.csv.
for column in which[1:]:
    column_index = allColumns.index(column)
    single_column = table[[column]].reset_index(drop=True)
    single_column.to_csv("data_transform/table_" + str(column_index) + ".csv", index=False)

In [22]:
# One-time construction of the timeline: the "t[s]" values of all non-empty
# tables, in container order, as one flat list
timeLine = [stamp for frame in container if len(frame) != 0 for stamp in frame["t[s]"]]
# The context manager guarantees the file is flushed and closed after the dump
with open("data_transform/timeLine.p", "wb") as handle:
    pickle.dump(timeLine, handle)

## Build all broken column names as tables

In [5]:
# Mapping used below as: file path -> column names found in that file.
# The context manager closes the file handle right after loading.
with open("raw_stats/additionalColumns.p", "rb") as handle:
    additional = pickle.load(handle)

In [6]:
# Names of the additional (wrongly named) columns.
# The context manager closes the file handle right after loading.
with open("raw_stats/additionalColumnNames.p", "rb") as handle:
    names = pickle.load(handle)

In [7]:
# Keep, per key, only the column names that do not contain "Unnamed";
# keys left without any name are dropped entirely
filtered = {}
for path in additional:
    named_columns = [name for name in additional[path] if "Unnamed" not in name]
    if named_columns:
        filtered[path] = named_columns

In [45]:
# Load all wrongly named columns.
# For every file listed in `filtered`, the file (or its time-corrected version,
# if one exists) is read and each known additional column found in it is
# collected together with its timestamps.
# NOTE: pickle files must come from a trusted source (here: earlier notebook steps).
with open("raw_stats/corrected_list.p", "rb") as handle:
    timeProblems = pickle.load(handle)
corrected_paths = set(timeProblems)   # constant-time membership test inside the loop
container = {name: [] for name in names}
counter = 0
for counter, path in enumerate(list(filtered.keys()), start=1):
    if counter % 1000 == 0:
        print(counter)
    if path not in corrected_paths:
        data = pd.read_csv(path, encoding='unicode_escape')
    else:
        data = pd.read_csv("data_transform/time_changes/" + path.split("/")[-1], encoding='unicode_escape')
    for name in names:
        if name in data.columns:
            container[name].append(data[["t[s]", name]])

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000


In [46]:
# One CSV file per additional column, holding all collected pieces stacked together
for name, pieces in container.items():
    stacked = pd.concat(pieces)
    stacked.to_csv("data_transform/fColumns/" + str(name) + ".csv", index=False)

# To create the prepared dataset: 
- run "load_ts" on any column name to get the final timeseries
- run two simple filters
- save the data and the filtered value indices

### load_ts performs
- Load main file with data including timer corrections based on file names
- Adds wrong column name files to the dataset
- Checks for some stats before merging
- Rounds time to seconds (00)
- Spreading the dataset to the full timescale
- filter duplicate timestamps
- Outputs data + time in final format
- some intermediate tests

In [3]:
# Columns processed by the standard pipeline: everything except the time
# column and the columns whose name contains one of the fragments below
relevant = [
    column
    for column in allColumns
    if column != "t[s]"
    and not any(
        fragment in column
        for fragment in (
            'spare_temp_OR_volt',
            'underpress_suction_plates',
            'underpress_suc_plates',
            "air_rel_humidiy_30cm",
        )
    )
]

In [10]:
# Redo the % variables (relative humidity) with a different filter.
# NOTE: the name `redo` is assigned again by the following cell.
redo = [column for column in allColumns if "air_rel_humidiy_30cm" in column]

In [5]:
# Redo the surface temperature variables with a different filter.
# NOTE: this overwrites the `redo` list built by the preceding cell.
redo = [column for column in allColumns if "surface_temp_south_west" in column]

In [5]:
# Position of this column within `relevant`; the processing loop further down slices `relevant` around it
relevant.index("soil_temp_15cm_depth_3A06N[°C]")

181

In [None]:
%%capture
# Standard pipeline: build the series, mark threshold outliers, blank them out
# and store both the cleaned values and the outlier information.
# NOTE(review): the slice leaves out relevant[180] and relevant[181]; presumably
# these are the two columns handled separately by load_277_278 -- confirm.
for x in relevant[:180] + relevant[182:]: 
    corrected = {}
    ts1 = load_ts(x)
    outliers = threshold_filter(ts1, threshold=45)
    corrected["threshold"] = outliers.values
    set_outliers_nan(ts1, outliers)
    # Context managers make sure each file is flushed and closed before the
    # next column is processed
    with open("data_transform/prepared_filtered/" + x + ".p", "wb") as handle:
        pickle.dump(ts1[x], handle)
    with open("data_transform/prepared_filtered/filter_" + x + ".p", "wb") as handle:
        pickle.dump(corrected, handle)

In [11]:
 #%%capture
# Relative-humidity columns with a (10, 100) threshold window.
# The columns are selected right here: the global `redo` is assigned by two
# different cells, so relying on it would apply this humidity window to
# whatever list was built last.
humidity_columns = [column for column in allColumns if "air_rel_humidiy_30cm" in column]
for x in humidity_columns:
    corrected = {}
    ts1 = load_ts(x)
    outliers = threshold_filter_2(ts1, threshold=(10, 100))
    corrected["threshold"] = outliers.values
    set_outliers_nan(ts1, outliers)
    # Context managers make sure each file is flushed and closed
    with open("data_transform/prepared_filtered/" + x + ".p", "wb") as handle:
        pickle.dump(ts1[x], handle)
    with open("data_transform/prepared_filtered/filter_" + x + ".p", "wb") as handle:
        pickle.dump(corrected, handle)

building timeseries: air_rel_humidiy_30cm_hei_2A01M[%]
loading rounded timestamps...
loading columns information...
loading main data file...
Data has 268295 rows that are  empty
Data has 268295 rows that are empty after merch (0 values were added through wrong column merges).
Spreading time...
len after merging and spreading: 9426612
dropping duplicate timestamps...
len after dropping: 9393633
Loading done.
building timeseries: air_rel_humidiy_30cm_hei_2A02M[%]
loading rounded timestamps...
loading columns information...
loading main data file...
Data has 125444 rows that are  empty
Data has 125444 rows that are empty after merch (0 values were added through wrong column merges).
Spreading time...
len after merging and spreading: 9426612
dropping duplicate timestamps...
len after dropping: 9393633
Loading done.
building timeseries: air_rel_humidiy_30cm_hei_2A03M[%]
loading rounded timestamps...
loading columns information...
loading main data file...
Data has 277779 rows that are  emp

In [7]:
 #%%capture 2
# Surface-temperature columns with a (-45, 55) threshold window.
# The columns are selected right here: the global `redo` is assigned by two
# different cells, so relying on it would apply this temperature window to
# whatever list was built last.
surface_columns = [column for column in allColumns if "surface_temp_south_west" in column]
for x in surface_columns:
    corrected = {}
    ts1 = load_ts(x)
    outliers = threshold_filter_2(ts1, threshold=(-45, 55))
    corrected["threshold"] = outliers.values
    set_outliers_nan(ts1, outliers)
    # Context managers make sure each file is flushed and closed
    with open("data_transform/prepared_filtered/" + x + ".p", "wb") as handle:
        pickle.dump(ts1[x], handle)
    with open("data_transform/prepared_filtered/filter_" + x + ".p", "wb") as handle:
        pickle.dump(corrected, handle)

building timeseries: surface_temp_south_west_2A01M[°C]
loading rounded timestamps...
loading columns information...
loading main data file...
loading additional column: surface_temp_south_west_2A01M[_]...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


Data has 657396 rows that are  empty
Additional column surface_temp_south_west_2A01M[_]has 67840 non empty values
Data has 589556 rows that are empty after merch (67840 values were added through wrong column merges).
Spreading time...
len after merging and spreading: 9426976
dropping duplicate timestamps...
len after dropping: 9393633
Loading done.
building timeseries: surface_temp_south_west_2A02M[°C]
loading rounded timestamps...
loading columns information...
loading main data file...
loading additional column: surface_temp_south_west_2A02M[_]...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


Data has 514545 rows that are  empty
Additional column surface_temp_south_west_2A02M[_]has 67840 non empty values
Data has 446705 rows that are empty after merch (67840 values were added through wrong column merges).
Spreading time...
len after merging and spreading: 9426976
dropping duplicate timestamps...
len after dropping: 9393633
Loading done.
building timeseries: surface_temp_south_west_2A03M[°C]
loading rounded timestamps...
loading columns information...
loading main data file...
loading additional column: surface_temp_south_west_2A03M[_]...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


Data has 666880 rows that are  empty
Additional column surface_temp_south_west_2A03M[_]has 67840 non empty values
Data has 599040 rows that are empty after merch (67840 values were added through wrong column merges).
Spreading time...
len after merging and spreading: 9426976
dropping duplicate timestamps...
len after dropping: 9393633
Loading done.
building timeseries: surface_temp_south_west_2A04M[°C]
loading rounded timestamps...
loading columns information...
loading main data file...
loading additional column: surface_temp_south_west_2A04M[_]...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


Data has 711343 rows that are  empty
Additional column surface_temp_south_west_2A04M[_]has 67840 non empty values
Data has 643503 rows that are empty after merch (67840 values were added through wrong column merges).
Spreading time...
len after merging and spreading: 9426976
dropping duplicate timestamps...
len after dropping: 9393633
Loading done.
building timeseries: surface_temp_south_west_2A05M[°C]
loading rounded timestamps...
loading columns information...
loading main data file...
loading additional column: surface_temp_south_west_2A05M[_]...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


Data has 515757 rows that are  empty
Additional column surface_temp_south_west_2A05M[_]has 67840 non empty values
Data has 447917 rows that are empty after merch (67840 values were added through wrong column merges).
Spreading time...
len after merging and spreading: 9426976
dropping duplicate timestamps...
len after dropping: 9393633
Loading done.
building timeseries: surface_temp_south_west_2A06M[°C]
loading rounded timestamps...
loading columns information...
loading main data file...
loading additional column: surface_temp_south_west_2A06M[_]...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


Data has 769411 rows that are  empty
Additional column surface_temp_south_west_2A06M[_]has 67840 non empty values
Data has 701571 rows that are empty after merch (67840 values were added through wrong column merges).
Spreading time...
len after merging and spreading: 9426976
dropping duplicate timestamps...
len after dropping: 9393633
Loading done.
building timeseries: surface_temp_south_west_2A07M[°C]
loading rounded timestamps...
loading columns information...
loading main data file...
loading additional column: surface_temp_south_west_2A07M[_]...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


Data has 1687829 rows that are  empty
Additional column surface_temp_south_west_2A07M[_]has 67840 non empty values
Data has 1619989 rows that are empty after merch (67840 values were added through wrong column merges).
Spreading time...
len after merging and spreading: 9426976
dropping duplicate timestamps...
len after dropping: 9393633
Loading done.
building timeseries: surface_temp_south_west_2A08M[°C]
loading rounded timestamps...
loading columns information...
loading main data file...
loading additional column: surface_temp_south_west_2A08M[_]...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


Data has 709733 rows that are  empty
Additional column surface_temp_south_west_2A08M[_]has 67840 non empty values
Data has 641893 rows that are empty after merch (67840 values were added through wrong column merges).
Spreading time...
len after merging and spreading: 9426976
dropping duplicate timestamps...
len after dropping: 9393633
Loading done.
building timeseries: surface_temp_south_west_2A09M[°C]
loading rounded timestamps...
loading columns information...
loading main data file...
loading additional column: surface_temp_south_west_2A09M[_]...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


Data has 810493 rows that are  empty
Additional column surface_temp_south_west_2A09M[_]has 67840 non empty values
Data has 742653 rows that are empty after merch (67840 values were added through wrong column merges).
Spreading time...
len after merging and spreading: 9426976
dropping duplicate timestamps...
len after dropping: 9393633
Loading done.
building timeseries: surface_temp_south_west_2A10M[°C]
loading rounded timestamps...
loading columns information...
loading main data file...
loading additional column: surface_temp_south_west_2A10M[_]...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


Data has 647332 rows that are  empty
Additional column surface_temp_south_west_2A10M[_]has 67840 non empty values
Data has 579492 rows that are empty after merch (67840 values were added through wrong column merges).
Spreading time...
len after merging and spreading: 9426976
dropping duplicate timestamps...
len after dropping: 9393633
Loading done.
building timeseries: surface_temp_south_west_2A11M[°C]
loading rounded timestamps...
loading columns information...
loading main data file...
loading additional column: surface_temp_south_west_2A11M[_]...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


Data has 1014660 rows that are  empty
Additional column surface_temp_south_west_2A11M[_]has 67840 non empty values
Data has 946820 rows that are empty after merch (67840 values were added through wrong column merges).
Spreading time...
len after merging and spreading: 9426976
dropping duplicate timestamps...
len after dropping: 9393633
Loading done.
building timeseries: surface_temp_south_west_2A12M[°C]
loading rounded timestamps...
loading columns information...
loading main data file...
loading additional column: surface_temp_south_west_2A12M[_]...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


Data has 1190419 rows that are  empty
Additional column surface_temp_south_west_2A12M[_]has 67840 non empty values
Data has 1122579 rows that are empty after merch (67840 values were added through wrong column merges).
Spreading time...
len after merging and spreading: 9426976
dropping duplicate timestamps...
len after dropping: 9393633
Loading done.
building timeseries: surface_temp_south_west_2A13M[°C]
loading rounded timestamps...
loading columns information...
loading main data file...
loading additional column: surface_temp_south_west_2A13M[_]...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


Data has 1291370 rows that are  empty
Additional column surface_temp_south_west_2A13M[_]has 67840 non empty values
Data has 1223530 rows that are empty after merch (67840 values were added through wrong column merges).
Spreading time...
len after merging and spreading: 9426976
dropping duplicate timestamps...
len after dropping: 9393633
Loading done.
building timeseries: surface_temp_south_west_2A14M[°C]
loading rounded timestamps...
loading columns information...
loading main data file...
loading additional column: surface_temp_south_west_2A14M[_]...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


Data has 1072413 rows that are  empty
Additional column surface_temp_south_west_2A14M[_]has 67840 non empty values
Data has 1004573 rows that are empty after merch (67840 values were added through wrong column merges).
Spreading time...
len after merging and spreading: 9426976
dropping duplicate timestamps...
len after dropping: 9393633
Loading done.
building timeseries: surface_temp_south_west_2A15M[°C]
loading rounded timestamps...
loading columns information...
loading main data file...
loading additional column: surface_temp_south_west_2A15M[_]...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


Data has 1067540 rows that are  empty
Additional column surface_temp_south_west_2A15M[_]has 67840 non empty values
Data has 999700 rows that are empty after merch (67840 values were added through wrong column merges).
Spreading time...
len after merging and spreading: 9426976
dropping duplicate timestamps...
len after dropping: 9393633
Loading done.
building timeseries: surface_temp_south_west_2A16M[°C]
loading rounded timestamps...
loading columns information...
loading main data file...
loading additional column: surface_temp_south_west_2A16M[_]...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


Data has 1255984 rows that are  empty
Additional column surface_temp_south_west_2A16M[_]has 58552 non empty values
Data has 1197432 rows that are empty after merch (58552 values were added through wrong column merges).
Spreading time...
len after merging and spreading: 9426924
dropping duplicate timestamps...
len after dropping: 9393633
Loading done.
building timeseries: surface_temp_south_west_2A17M[°C]
loading rounded timestamps...
loading columns information...
loading main data file...
loading additional column: surface_temp_south_west_2A17M[_]...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


Data has 1254822 rows that are  empty
Additional column surface_temp_south_west_2A17M[_]has 67840 non empty values
Data has 1186982 rows that are empty after merch (67840 values were added through wrong column merges).
Spreading time...
len after merging and spreading: 9426976
dropping duplicate timestamps...
len after dropping: 9393633
Loading done.
building timeseries: surface_temp_south_west_2A18M[°C]
loading rounded timestamps...
loading columns information...
loading main data file...
loading additional column: surface_temp_south_west_2A18M[_]...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


Data has 1862915 rows that are  empty
Additional column surface_temp_south_west_2A18M[_]has 58552 non empty values
Data has 1804363 rows that are empty after merch (58552 values were added through wrong column merges).
Spreading time...
len after merging and spreading: 9426924
dropping duplicate timestamps...
len after dropping: 9393633
Loading done.
building timeseries: surface_temp_south_west_2A19M[°C]
loading rounded timestamps...
loading columns information...
loading main data file...
loading additional column: surface_temp_south_west_2A19M[_]...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


Data has 1095691 rows that are  empty
Additional column surface_temp_south_west_2A19M[_]has 67840 non empty values
Data has 1027851 rows that are empty after merch (67840 values were added through wrong column merges).
Spreading time...
len after merging and spreading: 9426976
dropping duplicate timestamps...
len after dropping: 9393633
Loading done.
building timeseries: surface_temp_south_west_2A20M[°C]
loading rounded timestamps...
loading columns information...
loading main data file...
loading additional column: surface_temp_south_west_2A20M[_]...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


Data has 1084165 rows that are  empty
Additional column surface_temp_south_west_2A20M[_]has 67840 non empty values
Data has 1016325 rows that are empty after merch (67840 values were added through wrong column merges).
Spreading time...
len after merging and spreading: 9426976
dropping duplicate timestamps...
len after dropping: 9393633
Loading done.
building timeseries: surface_temp_south_west_2A21M[°C]
loading rounded timestamps...
loading columns information...
loading main data file...
loading additional column: surface_temp_south_west_2A21M[_]...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


Data has 1393062 rows that are  empty
Additional column surface_temp_south_west_2A21M[_]has 67840 non empty values
Data has 1325222 rows that are empty after merch (67840 values were added through wrong column merges).
Spreading time...
len after merging and spreading: 9426976
dropping duplicate timestamps...
len after dropping: 9393633
Loading done.
building timeseries: surface_temp_south_west_2A22M[°C]
loading rounded timestamps...
loading columns information...
loading main data file...
loading additional column: surface_temp_south_west_2A22M[_]...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


Data has 1510713 rows that are  empty
Additional column surface_temp_south_west_2A22M[_]has 67840 non empty values
Data has 1442873 rows that are empty after merch (67840 values were added through wrong column merges).
Spreading time...
len after merging and spreading: 9426976
dropping duplicate timestamps...
len after dropping: 9393633
Loading done.


In [7]:
%%capture
#redo 277 and 278 with the new fix
# Both columns go through the same steps, so they share one loop.
# NOTE(review): column 277 is filtered with the default threshold of
# `threshold_filter`, column 278 with threshold=45 -- this difference is kept
# exactly as it was; confirm whether it is intended.
# After the loop `ts1`, `outliers` and `corrected` belong to column 278.
for column_index, filter_options in ((277, {}), (278, {"threshold": 45})):
    column = allColumns[column_index]
    corrected = {}
    ts1 = load_277_278(column)
    outliers = threshold_filter(ts1, **filter_options)
    corrected["threshold"] = outliers.values
    set_outliers_nan(ts1, outliers)
    # Context managers make sure each file is flushed and closed
    with open("data_transform/prepared_filtered/" + column + ".p", "wb") as handle:
        pickle.dump(ts1[column], handle)
    with open("data_transform/prepared_filtered/filter_" + column + ".p", "wb") as handle:
        pickle.dump(corrected, handle)

In [10]:
# Store the time column of the most recently built series (`ts1`).
# The context manager makes sure the file is flushed and closed.
with open("data_transform/prepared_filtered/" + "time" + ".p", "wb") as handle:
    pickle.dump(ts1["t[s]"], handle)

In [1]:
#Construct the filter table: 

# Todo: 

- Check if transform counts merge correctly
- fix the warning
- make a better threshold filter

# Things to keep in mind:
- There appear to be 2239 timestamps that are duplicated but carry different values. That is not a lot, but only one value per timestamp is kept when the duplicates are dropped.