In [2]:
import numpy as np 
import pandas as pd
from os import listdir
from os.path import isfile, join, isdir
import numpy as np; np.random.seed(0)
import seaborn as sns; sns.set_theme()
import matplotlib.pyplot as plt
import pickle
import re
from datetime import datetime, timedelta
from tools import *

In [5]:
# Reference header file: its columns define the expected column names.
path = "Canbus_Data/Complete-HeaderFile_extendedBasedOn20173020605_V4.csv"
head = pd.read_csv(
    path,
    encoding='unicode_escape',
)

In [None]:
def transform_name(name):
    """Decode the timestamp embedded in a file name.

    The first two and the last four characters of ``name`` are discarded
    (presumably a two-letter prefix and the file extension - confirm against
    the real file names). The remainder is read as ``YYYYDDDHHMM``: four-digit
    year, three-digit day of the year (1-based), two-digit hour and two-digit
    minute. Any characters after the minute are ignored.

    Returns the corresponding ``datetime``; raises ``ValueError`` when a field
    is not numeric or out of range.
    """
    stamp = name[2:-4]
    year = int(stamp[:4])
    hour = int(stamp[7:9])
    minute = int(stamp[9:11])
    new_year = datetime(year, 1, 1, hour, minute)
    # Day 1 is January 1st, hence the "- 1" offset.
    day_of_year = int(stamp[4:7])
    return new_year + timedelta(day_of_year - 1)

def transform_date(date):
    """Parse a timestamp string from the data files into a ``datetime``.

    Characters 6-7 and the last three characters of ``date`` are removed
    (presumably the century digits of a four-digit year and a fractional
    seconds suffix - confirm against the raw data). What remains must match
    ``DD.MM.YY HH:MM:SS``, otherwise ``ValueError`` is raised.
    """
    day_and_month = date[:6]
    short_year_and_time = date[8:-3]
    return datetime.strptime(day_and_month + short_year_and_time, '%d.%m.%y %H:%M:%S')

In [6]:
# get_all_paths is not defined in this notebook; presumably it comes from the
# star import of `tools`. The cells below iterate over `tables_list` as a
# sequence of CSV file paths.
tables, tables_list = get_all_paths()

### Check for empty files and missing columns

In [9]:
# Load the cached results of the empty-file / missing-column check.
# `with` closes each file handle as soon as the pickle has been read.
with open("missingColumns.p", "rb") as fh:
    missingColumns = pickle.load(fh)
with open("missingData.p", "rb") as fh:
    missingData = pickle.load(fh)

In [None]:
# Fix a small mistake in the earlier loop: every entry of missingData is
# removed from missingColumns.
# pop(..., None) keeps the cell re-runnable; `del` raised KeyError the second
# time the cell was executed because the keys were already gone.
for x in missingData:
    missingColumns.pop(x, None)

### How many columns do the files have?

In [11]:
# Load the cached column counts; `with` closes the file handle after reading.
with open("columnsLength.p", "rb") as fh:
    columns_length = pickle.load(fh)

### Check for date differences

In [None]:
# For every table, compare the first value of its "t[s]" column with the
# timestamp encoded in its file name and store the absolute difference in
# seconds. Tables that cannot be read or parsed get the sentinel -1.
date_differences = []
for x in tables_list:
    try:
        first_stamp = pd.read_csv(x, encoding='unicode_escape')["t[s]"][0]
        one = transform_date(first_stamp)
        two = transform_name(x.split("/")[-1])
        date_differences.append(abs(one - two).total_seconds())
    except Exception:
        # Catch Exception rather than using a bare except so that Ctrl-C
        # (KeyboardInterrupt) still stops this long-running loop.
        date_differences.append(-1)
# NOTE(review): this writes to the working directory, while a later cell reads
# "raw_stats/dateDifferences.p" - confirm which location is intended.
with open("dateDifferences.p", "wb") as fh:
    pickle.dump(date_differences, fh)

In [None]:
# Load the cached per-table date differences (seconds; -1 marks a failure).
with open("dateDifferences.p", "rb") as fh:
    differences = pickle.load(fh)

### Detect duplicate lines

In [None]:
# Load the cached duplicate-detection result; `with` closes the file handle.
with open("detectedDuplicates.p", "rb") as fh:
    duplicates = pickle.load(fh)

### Calculate the overlap

In [None]:
# Load the cached overlap result; `with` closes the file handle.
with open("overlap.p", "rb") as fh:
    overlap = pickle.load(fh)

### Check for columns that do not exist in the header file

In [None]:
# For every table, list the columns that do not appear in the header file.
path = "Canbus_Data/Complete-HeaderFile_extendedBasedOn20173020605_V4.csv"
head = pd.read_csv(path, encoding='unicode_escape')
additional = {}
unreadable = []  # (path, error) for every table that could not be parsed
for x in tables_list:
    try:
        data = pd.read_csv(x, encoding='unicode_escape')
    except Exception as exc:
        # Unreadable tables are still skipped (best effort), but they are now
        # recorded instead of disappearing silently.
        unreadable.append((x, repr(exc)))
        continue
    additional[x] = [y for y in data.columns if y not in head.columns]
if unreadable:
    print(f"Skipped {len(unreadable)} unreadable table(s); see `unreadable` for details.")
# NOTE(review): this writes to the working directory, while the next cell reads
# "raw_stats/additionalColumns.p" - confirm which location is intended.
with open("additionalColumns.p", "wb") as fh:
    pickle.dump(additional, fh)

In [7]:
# Load the cached mapping of table path -> columns missing from the header.
with open("raw_stats/additionalColumns.p", "rb") as fh:
    col = pickle.load(fh)

### Get all missing names

In [None]:
# Get all broken column names: the unique values over every entry of `col`,
# skipping names that contain "Unnamed" (presumably the placeholders pandas
# assigns to columns without a header - confirm).
# NOTE: the list is built from a set, so its order is not guaranteed to be the
# same in another interpreter run; do not rely on positions in `names`.
names = list({value for key in col for value in col[key] if "Unnamed" not in value})
with open("raw_stats/additionalColumnNames.p", "wb") as fh:
    pickle.dump(names, fh)

In [3]:
# Load the cached list of broken column names; `with` closes the file handle.
with open("raw_stats/additionalColumnNames.p", "rb") as fh:
    names = pickle.load(fh)

### Create a mapping from wrong to correct column name 

In [7]:
# One empty list of wrong spellings per header column; the first and the last
# header column are left out.
mapping = {column: [] for column in head.columns[1:-1]}

In [8]:
# Map automatically where possible: a broken name containing "[_]" is matched
# to the header column that has the same text with the last three characters
# replaced by "[°C]". Everything else is collected in `manual`.
manual = []
for x in names:
    target = x[:-3] + "[°C]"
    # The target must be a header column AND a key of `mapping` (the first and
    # last header columns are not keys); otherwise the name is handled by hand.
    if "[_]" in x and target in head.columns and target in mapping:
        # Guard against duplicates so re-running the cell does not append twice.
        if x not in mapping[target]:
            mapping[target].append(x)
    else:
        manual.append(x)

In [9]:
# Show the names that could not be mapped automatically.
manual

['soil_temp_15cm_depth_3A06M[_]',
 'soil_temp_05cm_depth_3A06M[_]',
 'soil_temp_05cm_depth_3A06M[°C]',
 'soil_temp_15cm_depth_3A06M[°C]']

In [10]:
# List the header columns that could be the target for the 15 cm soil sensor.
for candidate in filter(lambda key: "soil_temp_15cm_depth_3A06" in key, mapping):
    print(candidate)

soil_temp_15cm_depth_3A06N[°C]


In [11]:
# List the header columns that could be the target for the 5 cm soil sensor.
for candidate in filter(lambda key: "soil_temp_05cm_depth_3A06" in key, mapping):
    print(candidate)

soil_temp_05cm_depth_3A06N[°C]


In [12]:
# Manually add the remaining names: the tables spell the sensor "3A06M" while
# the header file spells it "3A06N".
# Names are matched by prefix instead of by position in `manual`: `manual`
# inherits its order from a set, so positions can differ between runs and
# indexing by position could attach a name to the wrong column.
manual_targets = {
    "soil_temp_05cm_depth_3A06M": "soil_temp_05cm_depth_3A06N[°C]",
    "soil_temp_15cm_depth_3A06M": "soil_temp_15cm_depth_3A06N[°C]",
}
for wrong in manual:
    for prefix, target in manual_targets.items():
        # The membership check keeps the cell idempotent on re-run.
        if wrong.startswith(prefix) and wrong not in mapping[target]:
            mapping[target].append(wrong)
unmapped = [wrong for wrong in manual if not wrong.startswith(tuple(manual_targets))]
if unmapped:
    print(f"Still unmapped: {unmapped}")

In [13]:
# Persist the wrong-name mapping; `with` flushes and closes the file.
with open("raw_stats/mappingWrongColumns.p", "wb") as fh:
    pickle.dump(mapping, fh)

### Every table with more than one day of time difference should be fixed.

In [None]:
# Everything with more than one day difference should be corrected.
# Each such table is passed to transform_doc_time (not defined in this
# notebook; presumably from `tools`) and the result is written to
# "time_changes/" under the table's file name.
SECONDS_PER_DAY = 86400
with open("raw_stats/dateDifferences.p", "rb") as fh:
    differences = pickle.load(fh)
corrected_list = []
for x, difference in enumerate(differences):
    # Entries with the failure sentinel -1 never exceed the threshold, so
    # unreadable tables are left untouched here.
    if difference > SECONDS_PER_DAY:
        table_path = tables_list[x]
        correction = transform_doc_time(table_path)
        corrected_list.append(table_path)
        correction.to_csv("time_changes/" + table_path.split("/")[-1])
with open("raw_stats/corrected_list.p", "wb") as fh:
    pickle.dump(corrected_list, fh)

### Get a full timeline
- Idea: Drop Seconds. And spread.
- Each minute in the new set then covers seconds 0–59 of that minute.
- Should be a minimal generalization

In [5]:
# Load the raw timeline and parse every entry into a datetime.
with open("data_transform/timeLine.p", "rb") as fh:
    timeLine = [transform_date(x) for x in pickle.load(fh)]

In [6]:
# Build a gap-free, minute-resolution timeline from the first timestamp
# (seconds dropped) onwards.
span = timeLine[-1] - timeLine[0]
# NOTE(review): the "+ 11" seconds presumably compensates for the seconds
# dropped from the first timestamp - confirm; keeping the original value.
fullTime = span.days * 24 * 60 + (span.seconds + 11) / 60
start = timeLine[0].replace(second=0)
completeLine = [start] + [
    start + timedelta(minutes=offset) for offset in range(1, int(fullTime) + 1)
]
with open("raw_stats/completeTime.p", "wb") as fh:
    pickle.dump(completeLine, fh)

In [8]:
# Load the cached minute-resolution timeline; `with` closes the file handle.
with open("raw_stats/completeTime.p", "rb") as fh:
    completeTime = pickle.load(fh)

### Get a rounded timeline

In [3]:
# Parse the raw timeline and drop the seconds of every timestamp (the value is
# truncated to the minute, not rounded to the nearest one).
with open("data_transform/timeLine.p", "rb") as fh:
    timeLine = [transform_date(x).replace(second=0) for x in pickle.load(fh)]
with open("raw_stats/timeLineRounded.p", "wb") as fh:
    pickle.dump(timeLine, fh)

### Gaps between timeline steps

In [7]:
# Load the minute-truncated timeline; `with` closes the file handle.
with open("raw_stats/timeLineRounded.p", "rb") as fh:
    timeLine = pickle.load(fh)

In [9]:
# Time difference between each pair of consecutive timeline entries
# (one element fewer than `timeLine`).
gapLine = [later - earlier for earlier, later in zip(timeLine, timeLine[1:])]
with open("raw_stats/gapLine.p", "wb") as fh:
    pickle.dump(gapLine, fh)