# Data Files and Summary Statistics

In [3]:
import csv

%precision 2

with open('mpg.csv') as csvfile:
    mpg = list(csv.DictReader(csvfile))
mpg[:3]

[{'\ufeff': '1',
  'manufacturer': 'audi',
  'model': 'a4',
  'displ': '1.8',
  'year': '1999',
  'cyl': '4',
  'trans': 'auto(l5)',
  'drv': 'f',
  'cty': '18',
  'hwy': '29',
  'fl': 'p',
  'class': 'compact'},
 {'\ufeff': '2',
  'manufacturer': 'audi',
  'model': 'a4',
  'displ': '1.8',
  'year': '1999',
  'cyl': '4',
  'trans': 'manual(m5)',
  'drv': 'f',
  'cty': '21',
  'hwy': '29',
  'fl': 'p',
  'class': 'compact'},
 {'\ufeff': '3',
  'manufacturer': 'audi',
  'model': 'a4',
  'displ': '2',
  'year': '2008',
  'cyl': '4',
  'trans': 'manual(m6)',
  'drv': 'f',
  'cty': '20',
  'hwy': '31',
  'fl': 'p',
  'class': 'compact'}]

In [4]:
len(mpg)

234

In [5]:
mpg[0].keys()

dict_keys(['\ufeff', 'manufacturer', 'model', 'displ', 'year', 'cyl', 'trans', 'drv', 'cty', 'hwy', 'fl', 'class'])

In [16]:
#sum the city mpg entry across all the dict. in our list
#and divide by the length of the list
sum(float(d['cty']) for d in mpg) / len(mpg)

16.86

In [18]:
#since we can find the average highway mpg across all the cars in our csv file
# it makes sense that the avg highway fuel economy is higher than in the city. 
sum(float(d['hwy']) for d in mpg) / len(mpg)

23.44

In [21]:
#Now lets look at a more complex example. Say we want to know what the avg city MPG is 
#grouped by the number of cylinders a car has.
#creating a set of values in the cylinder entry of the dict. will give us unique levels
#for a number of cylinders
cylinders = set(d['cyl'] for d in mpg)
cylinders 

{'4', '5', '6', '8'}

# We see that we have cars in our dataset with 4, 5, 6, and 8 cylinders. 

In [23]:
# First create an empty list where we'll store our calculation
CtyMpgByCyl = []

#Next, lets iterate over all the cylinder levels
for c in cylinders:
    sumMpg = 0 
    cyltypecount =0 
#Then we'll iterate over all the dictionaries
    for d in mpg:
        # If the cylinder level for the dict. we're on matches the cylinder lever we're calc. the
        # avg for, we add the mpg to our sumMpg variable and increment count
        if d['cyl'] == c:
            sumMpg += float(d['cty'])
            cyltypecount += 1
# After going through all the dictionaries, we perform the avg mpg calc. and append it to our list
    CtyMpgByCyl.append((c,sumMpg / cyltypecount))
# To make things clearer, we sort the list from lowest number of cylinders to highest.
CtyMpgByCyl.sort(key=lambda x: x[0])
CtyMpgByCyl

[('4', 21.01), ('5', 20.50), ('6', 16.22), ('8', 12.57)]

 # We can see that the city fuel economy appears to be decreasing as the number of cylinders increases.

# Suppose we're interested in finding the average highway mpg for the different vechile classes.

In [6]:
# Where d is a dummy variable
# ['class'] is the column in mpg
# mpg is our csv file
vehicleClass = set(d['class'] for d in mpg)

#output
vehicleClass

{'2seater', 'compact', 'midsize', 'minivan', 'pickup', 'subcompact', 'suv'}

# Similar to our last example, we iterate over all the vehicle classes then iterate over all dictionaries.

In [9]:
# Create an empty list
HwyMpgByClass = []

# Iterate over all vehicle classes
for vc in vehicleClass: 
    # Sum the hwy mpg per vehicle class
    summpg = 0
    # Counts the number of vehicles per class
    vclasscount = 0 
    for d in mpg:
    # if the vehicle class for the dictionary matches the vehicle we're calculating
    # i.e, {'2seater', 'compact',...}
    # the average highway mpg for
        if d['class'] == vc:
            # then we add the value to our total(summpg)
            summpg += float(d['hwy'])
       # Increment the count per vehicle class
            vclasscount += 1
    #The average calculation and append to our list
    # (Every vehicle class + total hwy mpg) / ( Num of vehicle classes)
    HwyMpgByClass.append((vc, summpg / vclasscount))
    
# Sort the list from lowest mpg to highest mpg
# x is a variable being passed as an argument to the lambda function
# x[1] is referring to the second element of the sublist i.e., for d in mpg
# Thus x[1] refers to the element at index 1 of the sublist, not the second element of the list.

# 'key=lambda x: x[1]' tells the sort func. to use the second element(item) of each sublist x[1]
# as the key for sorting the entire list HwyMpgByClass. Sorting by mpg
HwyMpgByClass.sort(key=lambda x: x[1])
HwyMpgByClass

[('pickup', 16.88),
 ('suv', 18.13),
 ('minivan', 22.36),
 ('2seater', 24.80),
 ('midsize', 27.29),
 ('subcompact', 28.14),
 ('compact', 28.30)]

 # Based on our analysis pickup truck has the worst fuel economy and compact has the best.