In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import os
import re
from tqdm import tqdm

In [2]:
def get_mean_angle_and_confidence_interval(filename):
    data = pd.read_csv(filename, header=None)
    mean_angle = (data.iloc[:, 1:].sum(axis=1) / data[0]).fillna(0)

    confidence_interval = np.sqrt([((mean_angle[i] - data.iloc[i, 1:data.iloc[i, 0]+1])**2).sum() 
                                   / data.iloc[i, 0] for i in range(len(data))])
    
    max_interval = np.nanmax(confidence_interval)
    # if data.iloc[i, 0] == 1, then set confidence interval to max_interval
    confidence_interval = np.where(data[0] == 1, max_interval, confidence_interval)

    # nan means 0 samples, so we set the confidence interval to be 10 times the max interval
    confidence_interval = np.nan_to_num(confidence_interval, nan=max_interval*10)

    confidence_interval = pd.Series(confidence_interval)

    return mean_angle, confidence_interval

In [3]:
# get data
train_data_files = glob.glob(os.path.join("../data/train_thetas/", "*.csv"))
train_data_files.sort(key=lambda f: int(re.sub('\D', '', f)))

In [4]:
# save train_data_means_and_confidence as csv files
output_folder = "../data/train_thetas_means_and_confidence/"
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

for i, file in enumerate(tqdm(train_data_files)):
    mean, confidence = get_mean_angle_and_confidence_interval(file)
    mean_and_confidence = pd.concat([mean, confidence], axis=1)
    mean_and_confidence.to_csv(output_folder + str(i) + ".csv", index=False, header=False)


  0%|          | 0/15000 [00:00<?, ?it/s]

  confidence_interval = np.sqrt([((mean_angle[i] - data.iloc[i, 1:data.iloc[i, 0]+1])**2).sum()
  confidence_interval = np.sqrt([((mean_angle[i] - data.iloc[i, 1:data.iloc[i, 0]+1])**2).sum()
  confidence_interval = np.sqrt([((mean_angle[i] - data.iloc[i, 1:data.iloc[i, 0]+1])**2).sum()
  confidence_interval = np.sqrt([((mean_angle[i] - data.iloc[i, 1:data.iloc[i, 0]+1])**2).sum()
  confidence_interval = np.sqrt([((mean_angle[i] - data.iloc[i, 1:data.iloc[i, 0]+1])**2).sum()
  confidence_interval = np.sqrt([((mean_angle[i] - data.iloc[i, 1:data.iloc[i, 0]+1])**2).sum()
  confidence_interval = np.sqrt([((mean_angle[i] - data.iloc[i, 1:data.iloc[i, 0]+1])**2).sum()
  confidence_interval = np.sqrt([((mean_angle[i] - data.iloc[i, 1:data.iloc[i, 0]+1])**2).sum()
  confidence_interval = np.sqrt([((mean_angle[i] - data.iloc[i, 1:data.iloc[i, 0]+1])**2).sum()
  confidence_interval = np.sqrt([((mean_angle[i] - data.iloc[i, 1:data.iloc[i, 0]+1])**2).sum()
  confidence_interval = np.sqrt([((mean_

In [5]:
test_data_files = glob.glob(os.path.join("../data/test_thetas/", "*.csv"))
test_data_files.sort(key=lambda f: int(re.sub('\D', '', f)))

In [6]:
# save test_data_means_and_confidence as csv files
output_folder = "../data/test_thetas_means_and_confidence/"
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

for i, file in enumerate(tqdm(test_data_files)):
    mean, confidence = get_mean_angle_and_confidence_interval(file)
    mean_and_confidence = pd.concat([mean, confidence], axis=1)
    mean_and_confidence.to_csv(output_folder + str(i) + ".csv", index=False, header=False)

  confidence_interval = np.sqrt([((mean_angle[i] - data.iloc[i, 1:data.iloc[i, 0]+1])**2).sum()
  confidence_interval = np.sqrt([((mean_angle[i] - data.iloc[i, 1:data.iloc[i, 0]+1])**2).sum()
  confidence_interval = np.sqrt([((mean_angle[i] - data.iloc[i, 1:data.iloc[i, 0]+1])**2).sum()
  confidence_interval = np.sqrt([((mean_angle[i] - data.iloc[i, 1:data.iloc[i, 0]+1])**2).sum()
  confidence_interval = np.sqrt([((mean_angle[i] - data.iloc[i, 1:data.iloc[i, 0]+1])**2).sum()
  confidence_interval = np.sqrt([((mean_angle[i] - data.iloc[i, 1:data.iloc[i, 0]+1])**2).sum()
  confidence_interval = np.sqrt([((mean_angle[i] - data.iloc[i, 1:data.iloc[i, 0]+1])**2).sum()
100%|██████████| 5000/5000 [39:32<00:00,  2.11it/s]
