In [11]:
# Your task is to read the input DATAFILE line by line, and for the first 10 lines (not including the header)
# split each line on "," and then for each line, create a dictionary
# where the key is the header title of the field, and the value is the value of that field in the row.
# The function parse_file should return a list of dictionaries,
# each data line in the file being a single list entry.
# Field names and values should not contain extra whitespace, like spaces or newline characters.
# You can use the Python string method strip() to remove the extra whitespace.
# You have to parse only the first 10 data lines in this exercise,
# so the returned list should have 10 entries!
import os

# Directory that holds the lesson's data files. NOTE: this is an absolute path
# on the original author's machine; point it at your own copy of the data
# before running the cell.
DATADIR = "/Users/josemanuelfernandez/Documents/Udacity/Data_Analyst/P3/Data-Wrangling/Lesson-1/"
# File name of the CSV to parse. The disabled test snippet further down joins it
# onto DATADIR with os.path.join().
DATAFILE = "beatles-diskography.csv"


def parse_file(datafile, max_rows=10):
    """Parse the first ``max_rows`` data lines of a comma-separated file.

    The first line of the file is treated as the header. Every following line
    is split on "," and turned into a dictionary whose keys are the header
    titles and whose values are the fields of that line. Surrounding
    whitespace (including the trailing newline) is stripped from both keys
    and values.

    This is a plain ``split(",")`` parser: quoted fields that contain commas
    are NOT handled (use the ``csv`` module for that).

    Args:
        datafile: Path of the file to read.
        max_rows: Maximum number of data lines (header excluded) to parse.
            Defaults to 10, as the exercise requires.

    Returns:
        A list with one dictionary per parsed data line, in file order.
    """
    data = []
    # Text mode keeps every line a native string on both Python 2 and
    # Python 3. With "rb", Python 3 yields bytes and split(",") raises
    # TypeError.
    with open(datafile, "r") as f:
        # The header titles become the keys of every row dictionary.
        header = [title.strip() for title in f.readline().split(",")]

        for line in f:
            if len(data) == max_rows:
                break  # enough rows collected; stop reading the file

            values = [value.strip() for value in line.split(",")]
            # zip() pairs titles and values by position and stops at the
            # shorter list, so a row with more fields than the header no
            # longer raises IndexError; a short row simply yields fewer keys.
            data.append(dict(zip(header, values)))

    return data

#parse_file(DATAFILE)

## Test program
"""
def test():
    # a simple test of your implementation
    datafile = os.path.join(DATADIR, DATAFILE)
    d = parse_file(datafile)
    firstline = {'Title': 'Please Please Me', 'UK Chart Position': '1', 'Label': 'Parlophone(UK)', 'Released': '22 March 1963', 'US Chart Position': '-', 'RIAA Certification': 'Platinum', 'BPI Certification': 'Gold'}
    tenthline = {'Title': '', 'UK Chart Position': '1', 'Label': 'Parlophone(UK)', 'Released': '10 July 1964', 'US Chart Position': '-', 'RIAA Certification': '', 'BPI Certification': 'Gold'}

    assert d[0] == firstline
    assert d[9] == tenthline

    
test()
"""

## Using CSV Module

In [6]:
import os
import pprint
import csv # https://docs.python.org/2/library/csv.html

# Directory that holds the lesson's data files. NOTE: this is an absolute path
# on the original author's machine. In this cell it is only referenced by the
# disabled snippet below; the live call passes DATAFILE alone, so the file is
# looked up relative to the current working directory.
DATADIR = "/Users/josemanuelfernandez/Documents/Udacity/Data_Analyst/P3/Data-Wrangling/Lesson-1/"
# File name of the CSV to parse.
DATAFILE = "beatles-diskography.csv"


def parse_file(datafile):
    """Read every row of a CSV file into a list of dictionaries.

    ``csv.DictReader`` takes the first row as the field names and yields one
    mapping per remaining row, keyed by those names. Unlike a plain
    ``split(",")``, it copes with quoted fields that contain commas.

    Args:
        datafile: Path of the CSV file to read.

    Returns:
        A list with one mapping (field name -> value) per data row, in file
        order.
    """
    import sys  # local import: only needed to choose the file mode below

    # The csv module expects a binary-mode file on Python 2, but a text-mode
    # file opened with newline="" on Python 3.
    if sys.version_info[0] >= 3:
        # The sample output of this notebook shows UTF-8 encoded dashes
        # ('\xe2\x80\x94'), so decode the file as UTF-8.
        sd = open(datafile, "r", newline="", encoding="utf-8")
    else:
        sd = open(datafile, "rb")

    with sd:
        # Materialise the rows while the file is still open.
        return [row for row in csv.DictReader(sd)]

"""
# Print out all of those values
if __name__ == '__main__':
    datafile = os.path.join(DATADIR, DATAFILE)
    parse_file(datafile)
    d = parse_file(datafile)
    pprint.pprint(d)
"""

parse_file(DATAFILE)

[{'BPI Certification': 'Gold',
  'Label': 'Parlophone(UK)',
  'RIAA Certification': 'Platinum',
  'Released': '22 March 1963',
  'Title': 'Please Please Me',
  'UK Chart Position': '1',
  'US Chart Position': '\xe2\x80\x94'},
 {'BPI Certification': 'Platinum',
  'Label': 'Parlophone(UK)',
  'RIAA Certification': 'Gold',
  'Released': '22 November 1963',
  'Title': 'With the Beatles',
  'UK Chart Position': '1',
  'US Chart Position': '\xe2\x80\x94'},
 {'BPI Certification': '',
  'Label': 'Capitol(CAN)',
  'RIAA Certification': '',
  'Released': '25 November 1963',
  'Title': 'Beatlemania! With the Beatles',
  'UK Chart Position': '\xe2\x80\x94',
  'US Chart Position': '\xe2\x80\x94'},
 {'BPI Certification': '',
  'Label': 'Vee-Jay(US)',
  'RIAA Certification': '',
  'Released': '10 January 1964',
  'Title': 'Introducing... The Beatles',
  'UK Chart Position': '\xe2\x80\x94',
  'US Chart Position': '2'},
 {'BPI Certification': '',
  'Label': 'Capitol(US)',
  'RIAA Certification': '5xPla

## Working with XLS - Intro to XLRD

In [7]:
# Install the xlrd library locally: pip install xlrd
import xlrd #module reads xlsx or xls

datafile = "2013_ERCOT_Hourly_Load_Data.xls"


def parse_file(datafile):
    workbook = xlrd.open_workbook(datafile)
    sheet = workbook.sheet_by_index(0)

    data = [[sheet.cell_value(r, col) 
                for col in range(sheet.ncols)] 
                    for r in range(sheet.nrows)]

    print "\nList Comprehension"
    print "data[3][2]:",
    print data[3][2]

    print "\nCells in a nested loop:"    
    for row in range(sheet.nrows):
        for col in range(sheet.ncols):
            if row == 50:
                print sheet.cell_value(row, col),


    ### other useful methods:
    print "\nROWS, COLUMNS, and CELLS:"
    print "Number of rows in the sheet:", 
    print sheet.nrows
    print "Type of data in cell (row 3, col 2):", 
    print sheet.cell_type(3, 2)
    print "Value in cell (row 3, col 2):", 
    print sheet.cell_value(3, 2)
    print "Get a slice of values in column 3, from rows 1-3:"
    print sheet.col_values(3, start_rowx=1, end_rowx=4)

    print "\nDATES:"
    print "Type of data in cell (row 1, col 0):", 
    print sheet.cell_type(1, 0)
    exceltime = sheet.cell_value(1, 0)
    print "Time in Excel format:",
    print exceltime
    print "Convert time to a Python datetime tuple, from the Excel float:",
    print xlrd.xldate_as_tuple(exceltime, 0)

    return data

data = parse_file(datafile)


List Comprehension
data[3][2]: 1036.088697

Cells in a nested loop:
41277.0833333 9238.73731 1438.20528 1565.442856 916.708348 14010.903488 3027.98334 6165.211119 1157.741663 37520.933404 
ROWS, COLUMNS, and CELLS:
Number of rows in the sheet: 7296
Type of data in cell (row 3, col 2): 2
Value in cell (row 3, col 2): 1036.088697
Get a slice of values in column 3, from rows 1-3:
[1411.7505669999982, 1403.4722870000019, 1395.053150000001]

DATES:
Type of data in cell (row 1, col 0): 3
Time in Excel format: 41275.0416667
Convert time to a Python datetime tuple, from the Excel float: (2013, 1, 1, 1, 0, 0)


### Reading Excel Files - Exercise

In [None]:
#!/usr/bin/env python
"""
Your task is as follows:
- read the provided Excel file
- find and return the min, max and average values for the COAST region
- find and return the time value for the min and max entries
- the time values should be returned as Python tuples

Please see the test function for the expected return format
"""

import xlrd
from zipfile import ZipFile
datafile = "2013_ERCOT_Hourly_Load_Data.xls"


def open_zip(datafile):
    """Extract the archive named ``<datafile>.zip`` into the working directory.

    Args:
        datafile: Name of the data file; ".zip" is appended to it to obtain
            the archive path.
    """
    archive_path = '{0}.zip'.format(datafile)
    archive = ZipFile(archive_path, 'r')
    try:
        # No target given, so members land in the current working directory.
        archive.extractall()
    finally:
        archive.close()


def parse_file(datafile, region="COAST"):
    """Compute load statistics for one region of the hourly load workbook.

    Reads the first sheet of ``datafile``, locates the column whose title in
    row 0 matches ``region``, and reports the minimum, maximum and average of
    that column together with the timestamps of the minimum and maximum.

    Args:
        datafile: Path of the Excel workbook to read.
        region: Title of the column to analyse (case-insensitive). Defaults
            to "COAST", the region the exercise asks for.

    Returns:
        A dictionary with the keys:
            'maxtime'  - time of the largest value, as a
                         (year, month, day, hour, minute, second) tuple
            'maxvalue' - the largest value
            'mintime'  - time of the smallest value, same tuple format
            'minvalue' - the smallest value
            'avgcoast' - arithmetic mean of all values in the column

    Raises:
        ValueError: if no column is titled ``region`` or the column holds no
            numeric values.
    """
    workbook = xlrd.open_workbook(datafile)
    sheet = workbook.sheet_by_index(0)

    # NOTE(review): assumes row 0 holds the column titles and that the region
    # column is titled exactly like `region` (e.g. "COAST") - confirm against
    # the workbook.
    wanted = region.strip().upper()
    column = None
    for index, title in enumerate(sheet.row_values(0)):
        if hasattr(title, "strip") and title.strip().upper() == wanted:
            column = index
            break
    if column is None:
        raise ValueError("no column titled %r in the first row of %s"
                         % (region, datafile))

    # Pair every numeric reading with its sheet row so the matching time cell
    # can be looked up afterwards. enumerate starts at 1 because the data
    # starts on sheet row 1 (row 0 is the header). Non-numeric cells (e.g.
    # blanks, which xlrd returns as '') are skipped.
    readings = [
        (value, row)
        for row, value in enumerate(sheet.col_values(column, start_rowx=1), 1)
        if isinstance(value, (int, float))
    ]
    if not readings:
        raise ValueError("column %r of %s holds no numeric values"
                         % (region, datafile))

    def reading_value(pair):
        return pair[0]

    # max()/min() with a key return the first extreme encountered, i.e. the
    # earliest row when a value occurs more than once.
    max_value, max_row = max(readings, key=reading_value)
    min_value, min_row = min(readings, key=reading_value)

    def time_at(row):
        # Column 0 holds the timestamp as an Excel date float; the workbook's
        # own date mode is needed to convert it correctly.
        return xlrd.xldate_as_tuple(sheet.cell_value(row, 0), workbook.datemode)

    total = sum(value for value, _ in readings)

    data = {
            'maxtime': time_at(max_row),
            'maxvalue': max_value,
            'mintime': time_at(min_row),
            'minvalue': min_value,
            # float() guards against integer division on Python 2.
            'avgcoast': total / float(len(readings))
    }
    return data


def test():
    """Smoke test: unzip the workbook, parse it, and check the peak entry."""
    open_zip(datafile)
    result = parse_file(datafile)

    expected_peak_time = (2013, 8, 13, 17, 0, 0)
    # Both sides are rounded to 10 decimals to avoid float noise.
    expected_peak_value = round(18779.02551, 10)

    assert result['maxtime'] == expected_peak_time
    assert round(result['maxvalue'], 10) == expected_peak_value


test()