# Coursework 1

## Import Libraries

In [1]:
from functools import reduce
from typing import Optional

import numpy as np
import pandas as pd

## Verification Data

Process the data with library functions to produce reference results for verifying the output of the custom Hadoop implementation.

### Data Load

In [2]:
# positions of the columns to load from the raw observation file
data_cols = [0, 1, 2, 8]
data_path = 'data/raw/200704hourly.txt'

# read the selected columns and strip the spaces from the column names in one chain
# NOTE(review): the recorded output of this cell contains a warning fragment; if it
# is a pandas DtypeWarning an explicit dtype may be worth adding - confirm
df_weather = (
    pd.read_csv(data_path, usecols=data_cols)
    .rename(columns=lambda name: name.replace(' ', ''))
)

df_weather.shape

  exec(code_obj, self.user_global_ns, self.user_ns)


(1338620, 4)

In [3]:
# order the observations by day and renumber the rows from zero
df_weather = df_weather.sort_values(by='YearMonthDay').reset_index(drop=True)
df_weather.head()

Unnamed: 0,WbanNumber,YearMonthDay,Time,DryBulbTemp
0,3011,20070401,50,32
1,53969,20070401,1845,-
2,53969,20070401,1945,-
3,53969,20070401,2045,-
4,53969,20070401,2145,-


In [4]:
# replace the '-' missing-value marker with NaN
# (np.nan is used because the np.NaN alias was removed in NumPy 2.0)
df_weather_clean = df_weather.replace('-', np.nan)

# show the rows that have no temperature reading
df_weather_clean[df_weather_clean.DryBulbTemp.isna()]

Unnamed: 0,WbanNumber,YearMonthDay,Time,DryBulbTemp
1,53969,20070401,1845,
2,53969,20070401,1945,
3,53969,20070401,2045,
4,53969,20070401,2145,
5,53969,20070401,2245,
...,...,...,...,...
1338575,13902,20070430,923,
1338576,13902,20070430,911,
1338578,13902,20070430,835,
1338580,13902,20070430,745,


#### Remove Null Values

In [5]:
# drop the rows with missing values, then store the day as text and the
# temperature as an integer
df_weather_clean = df_weather_clean.dropna().astype({
    'YearMonthDay': 'str',
    'DryBulbTemp': 'int',
})
df_weather_clean.shape

(1030138, 4)

In [6]:
# summary statistics (count, mean, std, min, quartiles, max) of the cleaned data
df_weather_clean.describe()

Unnamed: 0,WbanNumber,DryBulbTemp
count,1030138.0,1030138.0
mean,37340.65,51.67273
std,33956.0,16.26283
min,3011.0,-67.0
25%,12909.0,40.0
50%,23190.0,52.0
75%,54834.0,64.0
max,94999.0,112.0


#### Group By Day

In [7]:
# spot check: the lowest temperature recorded on a single day (20070401)
df_weather_clean.query('YearMonthDay == "20070401"').DryBulbTemp.min()

-13

In [8]:
# group the temperature readings by day
df_weather_grouped = df_weather_clean[['YearMonthDay', 'DryBulbTemp']].groupby(by='YearMonthDay')

# per-day reference statistics; 'var' uses the pandas default, which is the
# sample variance (ddof=1)
df_weather_grouped.agg(['max', 'min', 'mean', 'median', 'var']).reset_index()

Unnamed: 0_level_0,YearMonthDay,DryBulbTemp,DryBulbTemp,DryBulbTemp,DryBulbTemp,DryBulbTemp
Unnamed: 0_level_1,Unnamed: 1_level_1,max,min,mean,median,var
0,20070401,92,-13,53.719618,54.0,220.167678
1,20070402,93,-13,55.066786,55.0,277.349574
2,20070403,93,-6,53.091015,54.0,347.583227
3,20070404,93,-15,45.9941,44.0,343.829264
4,20070405,94,-17,42.08565,39.0,293.334927
5,20070406,95,-5,39.700296,36.0,289.050116
6,20070407,96,-2,37.067186,34.0,228.100571
7,20070408,88,-6,39.363777,37.0,195.795372
8,20070409,89,-9,42.939378,42.0,184.965897
9,20070410,89,-15,45.496094,44.0,208.008217


## Mapper

In [9]:
class Mapper():
    """
    This class implements the logic for the mapping functionality.
    """
    # position of the day field in a comma separated observation line
    DAY_COLUMN = 1
    # position of the temperature field in a comma separated observation line
    TEMPERATURE_COLUMN = 8

    def map_input(self, input_stream) -> list:
        """
        Map each item in the input stream to the output. Every mapped item is written to stdout
        and is also collected in the returned list.

        Parameters
        ----------
        input_stream : iterable
            The input stream to process.

        Returns
        -------
        output : list
            The mapped `day,temperature` strings in input order. Items that `map` skips
            (headers, blank lines, missing temperatures) are not included.
        """
        output = []

        for item in input_stream:
            mapped_item = self.map(item)
            if mapped_item is not None:
                print(mapped_item)
                output.append(mapped_item)

        return output

    def map(self, item:str) -> Optional[str]:
        """
        Map the input string to the day the observation is for (key), and the observation temperature (value).

        Parameters
        ----------
        item : str
            The observation data to perform the mapping for.

        Returns
        -------
        output : str or None
            A comma separated string containing the `day` in the format YYYYMMDD and the `temperature`
            as an integer. None is returned for the file header, for blank or truncated lines, and for
            lines whose temperature is missing ('-') or is not an integer.
        """
        # ignore the file headers
        if item.startswith('Wban Number'):
            return None

        # tokenize the input line
        tokens = item.split(',')

        # blank or truncated lines have no temperature column to read
        if len(tokens) <= self.TEMPERATURE_COLUMN:
            return None

        # get the day value
        day_value = tokens[self.DAY_COLUMN].strip()

        # get the temperature
        temperature_value = tokens[self.TEMPERATURE_COLUMN].strip()

        # '-' marks a missing reading; any other non-integer text is skipped the
        # same way so that one malformed record does not abort the whole run
        try:
            temperature_value = int(temperature_value)
        except ValueError:
            return None

        # return the key and value as a comma separated string
        return '%s,%s' % (day_value, temperature_value)

# run the mapper over the small sample file; mapped records are printed to stdout
mapper = Mapper()
with open('data/raw/sample.txt') as input_file:
    mapper.map_input(input_file)

20070401,32
20070401,32
20070401,32
20070403,34
20070401,34
20070401,32
20070402,34
20070401,37
20070401,41
20070401,45
20070401,50
20070401,52
20070402,55
20070403,-54
20070402,54
20070402,54
20070402,52
20070402,50
20070402,46
20070402,45


In [10]:
# map every line of the full data file and keep the results, including the
# None entries returned for skipped lines
sample_path = data_path
mapper = Mapper()
with open(sample_path) as input_file:
    output = [mapper.map(line) for line in input_file]

output

[None,
 '20070401,32',
 '20070401,32',
 '20070401,32',
 '20070401,34',
 '20070401,34',
 '20070401,32',
 '20070401,34',
 '20070401,37',
 '20070401,41',
 '20070401,45',
 '20070401,46',
 '20070401,50',
 '20070401,52',
 '20070401,55',
 '20070401,54',
 '20070401,54',
 '20070401,54',
 '20070401,52',
 '20070401,50',
 '20070401,46',
 '20070401,45',
 '20070401,43',
 '20070401,43',
 '20070401,43',
 '20070402,46',
 '20070402,45',
 '20070402,46',
 '20070402,45',
 '20070402,43',
 '20070402,41',
 '20070402,43',
 '20070402,45',
 '20070402,46',
 '20070402,50',
 '20070402,52',
 '20070402,54',
 '20070402,55',
 '20070402,54',
 '20070402,54',
 '20070402,54',
 '20070402,52',
 '20070402,52',
 '20070402,50',
 '20070402,50',
 '20070402,45',
 '20070402,45',
 '20070402,45',
 '20070402,43',
 '20070403,41',
 '20070403,37',
 '20070403,36',
 '20070403,36',
 '20070403,36',
 '20070403,34',
 '20070403,36',
 '20070403,41',
 '20070403,48',
 '20070403,52',
 '20070403,54',
 '20070403,55',
 '20070403,55',
 '20070403,55',
 

## Reducer

In [33]:
class ReducerValues():
    """
    Helper class that accumulates the temperature values of a single day and
    tracks the largest and smallest value seen so far.
    """
    def __init__(self, day:str) -> None:
        self.day = day
        self.values = []
        # both extremes stay None until the first value is added
        self.max = None
        self.min = None

    def add_value(self, value:int) -> None:
        """
        Record a value and update the running max and min.
        """
        self.values.append(value)

        # the first value seeds both extremes
        if self.max is None:
            self.max = self.min = value
            return

        # later values only replace an extreme when they are strictly beyond it
        self.max = max(self.max, value)
        self.min = min(self.min, value)

    def print_output(self) -> None:
        """
        Write `day,max,min` for this day to stdout.
        """
        summary = (self.day, self.max, self.min)
        print('%s,%d,%d' % summary)

class Reducer():
    """
    This class contains the logic to summarize the temperature observations by day.
    """
    def reduce_input(self, input_stream):
        """
        Reduce the output from the mapper to one summary per day. The summary of each day is
        written to stdout through `ReducerValues.print_output`.

        The items must arrive grouped by day (e.g. sorted), because a new summary is started
        every time the day differs from the previous item. Empty items and blank lines are ignored,
        and nothing is printed when the stream contains no records.

        Parameters
        ----------
        input_stream : iterable
            The input stream to process. Each item is a `day,temperature` string.

        """
        current_day = ReducerValues(None)

        for item in input_stream:
            # ignore None/empty items and lines that only contain whitespace
            if not item or not item.strip():
                continue

            # get the day and temperature value
            day_value, temperature_value = item.strip().split(',')
            temperature_value = int(temperature_value)

            if current_day.day != day_value:
                # the day has changed: show the finished day (if there is one)
                # and start collecting the new day
                if current_day.day is not None:
                    current_day.print_output()
                current_day = ReducerValues(day_value)

            current_day.add_value(temperature_value)

        # print the last day processed; the placeholder created before the loop
        # still has day None when no record was read, so nothing is printed then
        if current_day.day is not None:
            current_day.print_output()

mapper = Mapper()
reducer = Reducer()

with open(data_path) as input_file:
    # map every line, discard the skipped (None) results, and sort the records
    # so that all records of a day are adjacent
    mapper_output = sorted(filter(None, map(mapper.map, input_file)))

    # summarize the sorted records per day
    reducer.reduce_input(mapper_output)

20070401,92,-13
20070402,93,-13
20070403,93,-6
20070404,93,-15
20070405,94,-17
20070406,95,-5
20070407,96,-2
20070408,88,-6
20070409,89,-9
20070410,89,-15
20070411,96,-14
20070412,91,-16
20070413,98,-16
20070414,93,-11
20070415,90,-12
20070416,90,-13
20070417,90,-15
20070418,94,4
20070419,92,-4
20070420,93,-15
20070421,96,-11
20070422,89,1
20070423,112,1
20070424,99,12
20070425,96,3
20070426,96,0
20070427,102,-1
20070428,106,-11
20070429,100,-4
20070430,101,-67
