In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os.path
import os
import math
import copy
import statistics

pd.set_option('display.max_rows', None)

# Gather logs

### RAW data logs

In [2]:
dir_path = '../logs/'

In [3]:
times_dict = {
    'bucket_split_time': None,
    'bucket_sort_time': None,
    'bucket_to_array_time': None,
    'table_fill_time': None,
    'program_time': None
}

In [4]:
run_dict = {
    'algorithm': None,
    'points': None,
    'buckets': None,
    'threads': None,
    'times': []
}

In [5]:
all_run_dict_raw_data = []

In [6]:
df = pd.DataFrame()

for filename in os.listdir(dir_path):
    if filename.endswith('.log'):
        # Create new entry structures
        tmp_run_dict = copy.deepcopy(run_dict)
        tmp_times_dict = copy.deepcopy(times_dict)
        
        # Fill run_dict structure
        run_specification = filename.split('_')
        tmp_run_dict['algorithm'] = int(run_specification[1])
        tmp_run_dict['points'] = int(run_specification[3])
        tmp_run_dict['buckets'] = int(run_specification[5])
        tmp_run_dict['threads'] = int(run_specification[7][0])
        
        # Fill times_dict structure
        with open(dir_path+filename, 'r') as file_object:
            for line in file_object:
                if line.startswith('-'):
                    continue
                elif line.startswith('program_time'):
                    line_words = line.split(' ')
                    tmp_times_dict[line_words[0]] = line_words[1]
                    tmp_run_dict['times'].append(copy.deepcopy(tmp_times_dict))
                else:
                    line_words = line.split(' ')
                    tmp_times_dict[line_words[0]] = line_words[1]

        all_run_dict_raw_data.append(copy.deepcopy(tmp_run_dict))

In [7]:
all_run_dict_raw_data[:3]

[{'algorithm': 2,
  'points': 4000000,
  'buckets': 4000,
  'threads': 1,
  'times': [{'bucket_split_time': '0.290938',
    'bucket_sort_time': '0.964506',
    'bucket_to_array_time': '0.077626',
    'table_fill_time': '0.9145',
    'program_time': '2.26512'},
   {'bucket_split_time': '0.270698',
    'bucket_sort_time': '0.965475',
    'bucket_to_array_time': '0.078212',
    'table_fill_time': '0.738281',
    'program_time': '2.06971'},
   {'bucket_split_time': '0.282633',
    'bucket_sort_time': '0.965773',
    'bucket_to_array_time': '0.079645',
    'table_fill_time': '0.737586',
    'program_time': '2.08202'},
   {'bucket_split_time': '0.288938',
    'bucket_sort_time': '0.963611',
    'bucket_to_array_time': '0.122694',
    'table_fill_time': '0.742299',
    'program_time': '2.14572'},
   {'bucket_split_time': '0.339577',
    'bucket_sort_time': '0.962577',
    'bucket_to_array_time': '0.077254',
    'table_fill_time': '0.972534',
    'program_time': '2.37226'},
   {'bucket_split_t

### Mean and std_dev data logs

In [8]:
run_dict_mean_times = {
    'algorithm': None,
    'points': None,
    'buckets': None,
    'threads': None,
    'times': {
        'bucket_split': {
            'mean_time': None,
            'std_dev': None },
        'bucket_sort': {
            'mean_time': None,
            'std_dev': None },
        'bucket_to_array': {
            'mean_time': None,
            'std_dev': None },
        'table_fill': {
            'mean_time': None,
            'std_dev': None },
        'program_execution': {
            'mean_time': None,
            'std_dev': None }
    }
}

In [9]:
def get_measurements(run_measurement, type_of_measurement):
    return list(map(lambda x: float(x[type_of_measurement]), run_measurement['times']))

In [10]:
all_run_dict_mean_times = []

In [11]:
for elem in all_run_dict_raw_data:
    # Create new entry structures
    tmp_run_dict = copy.deepcopy(run_dict_mean_times)
    
    tmp_run_dict['algorithm'] = elem['algorithm']
    tmp_run_dict['points'] = elem['points']
    tmp_run_dict['buckets'] = elem['buckets']
    tmp_run_dict['threads'] = elem['threads']
    
    tmp_run_dict['times']['bucket_split']['mean_time'] = \
        statistics.mean(get_measurements(elem, 'bucket_split_time'))
    tmp_run_dict['times']['bucket_split']['std_dev'] = \
        statistics.stdev(get_measurements(elem, 'bucket_split_time'))
    tmp_run_dict['times']['bucket_sort']['mean_time'] = \
        statistics.mean(get_measurements(elem, 'bucket_sort_time'))
    tmp_run_dict['times']['bucket_sort']['std_dev'] = \
        statistics.stdev(get_measurements(elem, 'bucket_sort_time')) 
    tmp_run_dict['times']['bucket_to_array']['mean_time'] = \
        statistics.mean(get_measurements(elem, 'bucket_to_array_time'))
    tmp_run_dict['times']['bucket_to_array']['std_dev'] = \
        statistics.stdev(get_measurements(elem, 'bucket_to_array_time')) 
    tmp_run_dict['times']['table_fill']['mean_time'] = \
        statistics.mean(get_measurements(elem, 'table_fill_time'))
    tmp_run_dict['times']['table_fill']['std_dev'] = \
        statistics.stdev(get_measurements(elem, 'table_fill_time')) 
    tmp_run_dict['times']['program_execution']['mean_time'] = \
        statistics.mean(get_measurements(elem, 'program_time'))
    tmp_run_dict['times']['program_execution']['std_dev'] = \
        statistics.stdev(get_measurements(elem, 'program_time'))
    
    all_run_dict_mean_times.append(copy.deepcopy(tmp_run_dict))

In [12]:
all_run_dict_mean_times[:2]

[{'algorithm': 2,
  'points': 4000000,
  'buckets': 4000,
  'threads': 1,
  'times': {'bucket_split': {'mean_time': 0.2893858,
    'std_dev': 0.018647143324857507},
   'bucket_sort': {'mean_time': 0.9660247, 'std_dev': 0.008750692722420458},
   'bucket_to_array': {'mean_time': 0.0825488,
    'std_dev': 0.014122269535894166},
   'table_fill': {'mean_time': 0.840838, 'std_dev': 0.20002902517607013},
   'program_execution': {'mean_time': 2.1972709999999998,
    'std_dev': 0.2115758394340483}}},
 {'algorithm': 1,
  'points': 4000000,
  'buckets': 400000,
  'threads': 4,
  'times': {'bucket_split': {'mean_time': 0.9920906,
    'std_dev': 0.1164480603036774},
   'bucket_sort': {'mean_time': 0.1705379, 'std_dev': 0.01366672582466871},
   'bucket_to_array': {'mean_time': 0.0989042, 'std_dev': 0.01184526796094823},
   'table_fill': {'mean_time': 0.3507157, 'std_dev': 0.03267011324352036},
   'program_execution': {'mean_time': 1.817932,
    'std_dev': 0.11901165823378632}}}]

# Exercise 2

In [13]:
point_types = set(map(lambda x: x['points'], all_run_dict_mean_times))
buckets_types = set(map(lambda x: x['buckets'], all_run_dict_mean_times))
threads_types = set(map(lambda x: x['threads'], all_run_dict_mean_times))

In [14]:
speedup_dict = {
    'algorithm': None,
    'points': None,
    'buckets': None,
    'threads': {
        '2': None,
        '3': None,
        '4': None,
    }
}

### algorithm 2

In [15]:
algorithm_2_runs = list(filter(lambda x: x['algorithm'] == 2, all_run_dict_mean_times))

In [16]:
algorithm_2_speedup = []

In [17]:
for points in point_types:
    for bucket in buckets_types:
        selected_elems = list(filter(lambda x: x['points']==points and \
                                               x['buckets']==bucket, 
                                               algorithm_2_runs))
        
        if selected_elems:
            # Create new entry structures
            tmp_run_dict = copy.deepcopy(speedup_dict)

            tmp_run_dict['algorithm'] = selected_elems[0]['algorithm']
            tmp_run_dict['points'] = selected_elems[0]['points']
            tmp_run_dict['buckets'] = selected_elems[0]['buckets']

            time_1_threads = float(list(filter(lambda x: x['threads'] == 1, selected_elems))[0]['times']['program_execution']['mean_time'])
            time_2_threads = float(list(filter(lambda x: x['threads'] == 2, selected_elems))[0]['times']['program_execution']['mean_time'])
            time_3_threads = float(list(filter(lambda x: x['threads'] == 3, selected_elems))[0]['times']['program_execution']['mean_time'])
            time_4_threads = float(list(filter(lambda x: x['threads'] == 4, selected_elems))[0]['times']['program_execution']['mean_time'])
            
            tmp_run_dict['threads']['2'] = time_2_threads / time_1_threads
            tmp_run_dict['threads']['3'] = time_3_threads / time_1_threads
            tmp_run_dict['threads']['4'] = time_4_threads / time_1_threads
            
            algorithm_2_speedup.append(copy.deepcopy(tmp_run_dict))

In [18]:
algorithm_2_speedup[:1]

[{'algorithm': 2,
  'points': 4000000,
  'buckets': 400000,
  'threads': {'2': 0.9877617031862473,
   '3': 1.2719012390609696,
   '4': 1.0162724287195872}}]