# csv module

In [206]:
#!/usr/bin/env python
"""
Your task is to process the supplied file and use the csv module to extract data from it.
The data comes from NREL (National Renewable Energy Laboratory) website. Each file
contains information from one meteorological station, in particular - about amount of
solar and wind energy for each hour of day.

Note that the first line of the datafile is neither data entry, nor header. It is a line
describing the data source. You should extract the name of the station from it.

The data should be returned as a list of lists (not dictionaries).
You can use the csv modules "reader" method to get data in such format.
Another useful method is next() - to get the next line from the iterator.
You should only change the parse_file function.
"""
import csv
import os

# Directory containing the data file; the empty string makes os.path.join
# resolve DATAFILE relative to the current working directory.
DATADIR = ""
# Name of the station CSV file that parse_file is run against below.
DATAFILE = "745090.csv"


def parse_file(datafile):
    """Extract the station name and the data rows from a station CSV file.

    The file layout expected here is:
      * line 1 - a description of the data source; its second field is the
        station name,
      * line 2 - the column header row (skipped),
      * remaining lines - the data records.

    The file is read in a single pass. An empty file yields ``("", [])``.

    Args:
        datafile: Path to the CSV file.

    Returns:
        tuple: ``(name, data)`` where ``name`` is the station name (str) and
        ``data`` is a list of rows, each row being a list of strings.

    Raises:
        IndexError: If the first line has fewer than two fields.
    """
    name = ""
    data = []
    # newline='' lets the csv module do its own line-ending handling, as the
    # csv documentation recommends for file objects passed to csv.reader.
    with open(datafile, 'r', newline='') as f:
        reader = csv.reader(f, delimiter=',')

        # First record describes the data source; None means the file is empty.
        source_line = next(reader, None)
        if source_line is not None:
            name = source_line[1]
            print(name)

            # Second record is the column header row; skip it if present.
            next(reader, None)

            # Everything that remains is data.
            data = list(reader)

    # Do not change the line below
    return (name, data)

# Parse the sample station file, then print a few known cells as a quick
# visual check of the parsed rows.
datafile = os.path.join(DATADIR, DATAFILE)
name, data = parse_file(datafile)
print("===============")
for row_idx, col_idx in ((0, 1), (2, 0), (2, 5)):
    print(data[row_idx][col_idx])

def test():
    """Check parse_file against known values from the sample station file."""
    station, rows = parse_file(os.path.join(DATADIR, DATAFILE))

    # Station name taken from the first line of the file.
    assert station == "MOUNTAIN VIEW MOFFETT FLD NAS"
    # Spot checks on individual cells of the parsed data rows.
    assert rows[0][1] == "01:00"
    assert rows[2][0] == "01/01/2005"
    assert rows[2][5] == "2"


# Run the self-check when this code is executed as the main module.
if __name__ == "__main__":
    test()


MOUNTAIN VIEW MOFFETT FLD NAS
01:00
01/01/2005
2
MOUNTAIN VIEW MOFFETT FLD NAS


# Learn to clean data with the xlrd package

In [27]:
import xlrd

datafile = "2013_ERCOT_Hourly_Load_Data.xls"

def parse_file(datafile):
    """Open an Excel workbook and demonstrate common xlrd sheet operations.

    Loads the first sheet of ``datafile`` into a list of rows and prints a
    series of examples: indexing the list, reading one row cell by cell,
    sheet dimensions, a column slice, and converting an Excel date float.

    Args:
        datafile: Path to the Excel workbook to open.

    Returns:
        list[list]: One inner list per sheet row, holding the value of every
        cell in that row in column order.
    """
    workbook = xlrd.open_workbook(datafile)
    sheet = workbook.sheet_by_index(0)

    # create python list
    data = [[sheet.cell_value(r, col)
                for col in range(sheet.ncols)]
                    for r in range(sheet.nrows)]

    print("\nList Comprehension")
    print("data[3][2]:", data[3][2])
    print("\nCells in a nested loop:")
    # Print every cell of row 50 (if the sheet has one) by addressing that
    # row directly instead of scanning the whole sheet for it.
    demo_row = 50
    if demo_row < sheet.nrows:
        for col in range(sheet.ncols):
            print(sheet.cell_value(demo_row, col))

    ### other useful methods:
    print("\nROWS, COLUMNS, and CELLS:")
    print("Number of rows in the steet:")
    print(sheet.nrows)
    # The label now matches what is printed: the cell's value, not its type.
    print("Value in cell (row 3, col 2):")
    print(sheet.cell_value(3, 2))
    print("Get a slice of values in column 3, from rows 1-3:")
    print(sheet.col_values(3, start_rowx=1, end_rowx=4))

    print("\nDATES:")
    print("Type of data in cell (row 1, col 0)")
    print(sheet.cell_type(1, 0))
    exceltime = sheet.cell_value(1, 0)
    print("Time in Excel format:")
    print(exceltime)
    print("Convert time to a Python datatime tuple, from the Excel float")
    # Use the workbook's own date mode rather than assuming mode 0.
    print(xlrd.xldate_as_tuple(exceltime, workbook.datemode))

    return data

data = parse_file(datafile)


List Comprehension
data[3][2]: 1036.0886969999988

Cells in a nested loop:
41277.083333333336
9238.737309999968
1438.2052799999994
1565.4428559999976
916.7083480000003
14010.903488000036
3027.9833399999993
6165.211119000006
1157.7416630000007
37520.93340400001

ROWS, COLUMNS, and CELLS:
Number of rows in the steet:
7296
Type of data in cell(row 3, col 2):
1036.0886969999988
Get a slice of values in column 3, from rows 1-3:
[1411.7505669999982, 1403.4722870000019, 1395.053150000001]

DATES:
Type of data in cell (row 1, col 0)
3
Time in Excel format:
41275.041666666664
Convert time to a Python datatime tuple, from the Excel float
(2013, 1, 1, 1, 0, 0)


## review
The first number printed under "Cells in a nested loop", 41277.083333333336, is a float that represents the date in the old .xls format.

"data" is a Python list. Its length is 7296.
Code: `len(data)`

# Working with the coast column using the xlrd module

In [12]:
round(18779.02551, 10)

18779.02551

In [13]:
range(10)

range(0, 10)

In [15]:
min(range(10))

0

In [28]:
import xlrd

datafile = "2013_ERCOT_Hourly_Load_Data.xls"

def parse_file_sheet(datafile):
    """Open the workbook at ``datafile`` and return its first sheet."""
    return xlrd.open_workbook(datafile).sheet_by_index(0)

In [29]:
sheet = parse_file_sheet(datafile)

In [30]:
sheet.cell_value(3,2)

1036.0886969999988

In [41]:
print("The number of sheet columns:", sheet.ncols)
print("The number of sheet rows:", sheet.nrows)

The number of sheet columns: 10
The number of sheet rows: 7296


In [94]:
coast = sheet.col_values(1, start_rowx = 1, end_rowx = sheet.nrows)

In [72]:
round(max(coast),10), round(sum(coast)/len(coast),10)

(18779.02551, 10976.9334606798)

In [76]:
sheet.cell_value(1,0) # float number represent the date time

41275.041666666664

In [92]:
# Column 0 values from row 1 to the last row; these are the timestamps stored
# as Excel date floats.
# NOTE(review): the name `datetime` would shadow the stdlib module if it were
# imported in this notebook.
datetime = sheet.col_values(0, start_rowx = 1, end_rowx = sheet.nrows)

# The largest float in the column.
max_datetime = max(datetime)

# Convert the Excel float into a (year, month, day, hour, minute, second) tuple.
# The second argument is the date mode, hard-coded to 0 here.
print(xlrd.xldate_as_tuple(max_datetime, 0))

(2013, 11, 1, 0, 0, 0)


In [100]:
coast[coast.index(max(coast))]

18779.025510000003

In [102]:
xlrd.xldate_as_tuple(datetime[coast.index(max(coast))], 0)

(2013, 8, 13, 17, 0, 0)

In [103]:
# Load column 1 (the coast values) and column 0 (the matching timestamps),
# both starting at row 1 and running to the end of the sheet.
coast = sheet.col_values(1, start_rowx=1, end_rowx=sheet.nrows)

# Summary statistics of the coast column, rounded to 10 decimal places.
max_value = round(max(coast), 10)
min_value = round(min(coast), 10)
avgcoast = round(sum(coast) / len(coast), 10)

datetime = sheet.col_values(0, start_rowx=1, end_rowx=sheet.nrows)

# Timestamps (as date tuples) of the rows where the coast column reaches its
# maximum and minimum; the positions are looked up with the unrounded values.
maxtime = xlrd.xldate_as_tuple(datetime[coast.index(max(coast))], 0)
mintime = xlrd.xldate_as_tuple(datetime[coast.index(min(coast))], 0)
    

# JSON

In [114]:
"""
To experiment with this code freely you will have to run this code locally.
Take a look at the main() function for an example of how to use the code. We
have provided example json output in the other code editor tabs for you to look
at, but you will not be able to run any queries through our UI.
"""
import json
import requests

BASE_URL = "http://musicbrainz.org/ws/2/"
ARTIST_URL = BASE_URL + "artist/"


# Starter query parameters, keyed by a short label. requests.get receives its
# query parameters as a dictionary, so each entry here is one such dictionary.
query_type = {
    "simple": {},
    "atr": {"inc": "aliases+tags+ratings"},
    "aliases": {"inc": "aliases"},
    "releases": {"inc": "releases"},
}


def query_site(url, params, uid="", fmt="json"):
    """
    This is the main function for making queries to the musicbrainz API. The
    query should return a json document.

    Args:
        url: Base URL of the resource to query.
        params: Dictionary of query parameters. It is not modified; the
            request is made with a copy that has "fmt" added.
        uid: Optional identifier appended to ``url``.
        fmt: Response format requested through the "fmt" query parameter.

    Returns:
        The decoded JSON body when the response status is OK. If the status
        is not OK and raise_for_status() does not raise, None is returned.

    Raises:
        requests.HTTPError: Raised by raise_for_status() for error responses.
    """
    # Build a fresh dict so shared parameter dictionaries (such as the
    # query_type entries) are not polluted with "fmt" between calls.
    request_params = dict(params, fmt=fmt)
    r = requests.get(url + uid, params=request_params)
    print ("requesting", r.url)

    if r.status_code == requests.codes.ok:
        return r.json()
    else:
        r.raise_for_status()


def query_by_name(url, params, name):
    """
    This adds an artist name to the query parameters before making an API call
    to the function above.

    Args:
        url: Base URL of the resource to query.
        params: Dictionary of query parameters. It is not modified; the
            artist query is added to a copy.
        name: Artist name to search for.

    Returns:
        Whatever query_site returns for the resulting request.
    """
    # Copy instead of assigning into params, so a shared dictionary passed in
    # by the caller does not keep a stale "query" entry after this call.
    artist_params = dict(params, query="artist:" + name)
    return query_site(url, artist_params)


def pretty_print(data, indent=4):
    """
    After we get our output, we can use this function to format it to be more
    readable.

    Args:
        data: Object to print. Dictionaries (including dict subclasses) are
            printed as indented JSON with sorted keys; anything else is
            printed as-is.
        indent: Number of spaces per indentation level in the JSON output.
    """
    # isinstance also accepts dict subclasses, which an exact type comparison
    # would send down the plain-print branch.
    if isinstance(data, dict):
        print(json.dumps(data, indent=indent, sort_keys=True))
    else:
        print(data)


def main():
    """
    Example investigation to get you started: adjust the function calls and
    the indexing below to answer the questions on the next quiz.

    HINT: The site returns a multi-level JSON document. Step through the
    structure one level at a time with print statements, or copy the output
    to a separate file. Experimenting and iterating is the key to
    understanding how the data is organised!
    """

    # Look up every band named Nirvana in the database
    results = query_by_name(ARTIST_URL, query_type["simple"], "Nirvana")
    pretty_print(results)

    # Pick out the 4th band in the response (index 3)
    print("\nARTIST:")
    band = results["artists"][3]
    pretty_print(band)

    # Fetch that band's releases, identifying it by its artist id
    band_info = query_site(ARTIST_URL, query_type["releases"], band["id"])
    band_releases = band_info["releases"]

    # Show one release in full
    print("\nONE RELEASE:")
    pretty_print(band_releases[0], indent=2)

    # Collect every title first, then list them one per line
    titles = [release["title"] for release in band_releases]
    print("\nALL TITLES:")
    for title in titles:
        print(title)

# Run the example investigation when this code is executed as the main module.
if __name__ == '__main__':
    main()

requesting http://musicbrainz.org/ws/2/artist/?query=artist%3ANirvana&fmt=json


HTTPError: 407 Client Error: Proxy Authentication Required ( Forefront TMG requires authorization to fulfill the request. Access to the Web Proxy filter is denied.  ) for url: http://musicbrainz.org/ws/2/artist/?query=artist%3ANirvana&fmt=json