In [76]:
import pandas as pd # load and manipulate data and for One-Hot Encoding
import numpy as np # calculate the mean and standard deviation
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
from matplotlib.collections import LineCollection
from matplotlib.colors import ListedColormap, BoundaryNorm
from statsmodels.graphics.tsaplots import plot_acf

In [77]:
# Load the full dataset from a CSV located in the working directory.
df = pd.read_csv('full_data.v1.csv')

# Preview a few random rows. The fixed seed makes the displayed sample
# identical on every Restart & Run All instead of changing per execution.
df.sample(5, random_state=0)

Unnamed: 0,trip,obs,country,city,mode,mode2,route,wo,ac,crowd,co2,lnco2,over
2950,172,12,1,2,3,2,6,2,1,1,793,6.675823,0
701,44,17,0,0,1,1,0,2,1,2,638,6.458338,0
1779,118,21,0,0,1,2,0,1,2,3,746,6.614726,0
3302,183,15,1,2,3,2,6,2,1,1,682,6.52503,0
5556,248,39,1,2,0,1,0,2,2,1,819,6.708084,0


In [78]:
# Decode the numeric `mode2` codes into readable labels:
# 0 -> 'Outside', 1 -> 'BRT', 2 -> 'Bus', anything else -> 'Subway'
# (missing values also fall back to 'Subway', as with the nested np.where).
#
# The dtype guard makes the cell idempotent: once the column holds labels,
# none of the numeric comparisons match, so an unguarded re-run would
# overwrite every row with the fallback 'Subway'.
if pd.api.types.is_numeric_dtype(df['mode2']):
    mode2_labels = {0: 'Outside', 1: 'BRT', 2: 'Bus'}
    df['mode2'] = df['mode2'].map(mode2_labels).fillna('Subway')

In [79]:
# Decode the numeric `city` codes into city names:
# 0 -> 'Montreal', 1 -> 'Mexico City', anything else -> 'Puebla'
# (missing values also fall back to 'Puebla', as with the nested np.where).
#
# The dtype guard makes the cell idempotent: once the column holds names,
# none of the numeric comparisons match, so an unguarded re-run would
# overwrite every row with the fallback 'Puebla'.
if pd.api.types.is_numeric_dtype(df['city']):
    city_labels = {0: 'Montreal', 1: 'Mexico City'}
    df['city'] = df['city'].map(city_labels).fillna('Puebla')

In [80]:
# Decode the numeric `crowd` codes into readable labels:
# 0 -> 'Outside', 1 -> 'Not crowded', 2 -> 'Semi crowded',
# anything else -> 'Crowded' (missing values included, as with np.where).
#
# The dtype guard makes the cell idempotent: once the column holds labels,
# none of the numeric comparisons match, so an unguarded re-run would
# overwrite every row with the fallback 'Crowded'.
if pd.api.types.is_numeric_dtype(df['crowd']):
    crowd_labels = {0: 'Outside', 1: 'Not crowded', 2: 'Semi crowded'}
    df['crowd'] = df['crowd'].map(crowd_labels).fillna('Crowded')

In [81]:
# Decode the numeric `wo` codes into readable labels:
# 0 -> 'Outside', 1 -> 'Windows closed', anything else -> 'Windows opened'
# (missing values also fall back to 'Windows opened', as with np.where).
#
# The dtype guard makes the cell idempotent: once the column holds labels,
# none of the numeric comparisons match, so an unguarded re-run would
# overwrite every row with the fallback 'Windows opened'.
if pd.api.types.is_numeric_dtype(df['wo']):
    wo_labels = {0: 'Outside', 1: 'Windows closed'}
    df['wo'] = df['wo'].map(wo_labels).fillna('Windows opened')

In [82]:
# Decode the numeric `ac` codes into readable labels:
# 0 -> 'Outside', 1 -> 'AC/Heating off', anything else -> 'AC/Heating on'
# (missing values also fall back to 'AC/Heating on', as with np.where).
#
# The dtype guard makes the cell idempotent: once the column holds labels,
# none of the numeric comparisons match, so an unguarded re-run would
# overwrite every row with the fallback 'AC/Heating on'.
if pd.api.types.is_numeric_dtype(df['ac']):
    ac_labels = {0: 'Outside', 1: 'AC/Heating off'}
    df['ac'] = df['ac'].map(ac_labels).fillna('AC/Heating on')

In [83]:
# Spot-check the decoded columns on ten random rows. The fixed seed keeps the
# displayed sample the same on every run, so the output is reproducible.
df.sample(10, random_state=0)

Unnamed: 0,trip,obs,country,city,mode,mode2,route,wo,ac,crowd,co2,lnco2,over
4174,208,1,1,Mexico City,0,BRT,8,Windows opened,AC/Heating off,Not crowded,1110,7.012115,1
3063,176,1,1,Puebla,3,Bus,6,Windows opened,AC/Heating off,Crowded,908,6.811244,0
5143,220,35,1,Puebla,0,BRT,0,Windows closed,AC/Heating on,Not crowded,614,6.419995,0
4642,213,47,1,Mexico City,0,BRT,7,Windows opened,AC/Heating off,Semi crowded,986,6.893656,0
6375,335,1,0,Montreal,0,Outside,0,Outside,Outside,Outside,418,6.035481,0
6279,333,5,0,Montreal,0,Outside,0,Outside,Outside,Outside,423,6.047372,0
5426,234,49,1,Puebla,0,BRT,0,Windows closed,AC/Heating on,Semi crowded,598,6.393591,0
4643,213,48,1,Mexico City,0,BRT,7,Windows opened,AC/Heating off,Not crowded,983,6.890609,0
5171,222,24,1,Puebla,1,Bus,1,Windows closed,AC/Heating on,Crowded,686,6.530878,0
28,2,13,0,Montreal,1,BRT,3,Windows closed,AC/Heating on,Crowded,1409,7.250636,1


In [84]:
# Descriptive statistics of `co2` for every (mode2, city) combination.
(
    df
    .groupby(by=['mode2', 'city'])['co2']
    .describe()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
mode2,city,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
BRT,Mexico City,1456.0,992.660027,564.927532,498.0,737.0,842.0,1015.0,4853.0
BRT,Montreal,1013.0,744.259625,243.546987,464.0,587.0,643.0,827.0,1792.0
BRT,Puebla,888.0,919.461712,229.561502,566.0,760.0,857.0,1036.0,1821.0
Bus,Montreal,817.0,768.332925,203.409093,514.0,647.0,709.0,825.0,2135.0
Bus,Puebla,1485.0,841.461953,167.399016,522.0,737.0,813.0,912.0,1762.0
Outside,Mexico City,300.0,444.703333,8.960448,430.0,438.0,444.0,453.0,460.0
Outside,Montreal,300.0,426.62,5.205695,418.0,422.0,426.5,431.0,435.0
Outside,Puebla,300.0,445.306667,11.415794,428.0,435.75,444.0,455.0,465.0
Subway,Mexico City,212.0,1666.150943,1339.470425,560.0,724.5,927.0,2351.25,4349.0
Subway,Montreal,254.0,575.007874,47.998435,462.0,544.0,570.0,603.0,711.0


In [85]:
# Descriptive statistics of `co2` over the whole dataset.
df.loc[:, 'co2'].describe()

count    7025.000000
mean      823.814093
std       443.936939
min       418.000000
25%       615.000000
50%       753.000000
75%       903.000000
max      4853.000000
Name: co2, dtype: float64

In [86]:
# Descriptive statistics of `co2` for each `mode2` label.
(
    df
    .groupby(by=['mode2'])['co2']
    .describe()
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
mode2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
BRT,3357.0,898.34078,425.821167,464.0,678.0,807.0,978.0,4853.0
Bus,2302.0,815.507819,184.310771,514.0,690.0,783.0,889.0,2135.0
Outside,900.0,438.876667,12.422454,418.0,430.0,436.0,449.0,465.0
Subway,466.0,1071.407725,1054.159663,462.0,566.0,631.0,879.0,4349.0


In [87]:
# Subset of rows whose `over` flag is non-zero.
# NOTE(review): `over` presumably marks readings above a co2 threshold
# (the filtered summaries start at 1001) — confirm against the data dictionary.
df2 = df.loc[df['over'].ne(0)]

In [88]:
# Spot-check ten random rows of the `over != 0` subset. The fixed seed keeps
# the displayed sample the same on every run, so the output is reproducible.
df2.sample(10, random_state=0)

Unnamed: 0,trip,obs,country,city,mode,mode2,route,wo,ac,crowd,co2,lnco2,over
384,25,13,1,Puebla,0,BRT,6,Windows closed,AC/Heating off,Crowded,1265,7.142827,1
5466,240,1,1,Puebla,1,Bus,1,Windows closed,AC/Heating on,Crowded,1023,6.930495,1
222,15,18,1,Puebla,0,BRT,6,Windows opened,AC/Heating off,Crowded,1558,7.351158,1
2854,168,5,1,Puebla,3,Bus,5,Windows opened,AC/Heating off,Crowded,1094,6.997596,1
825,51,30,1,Mexico City,0,BRT,7,Windows opened,AC/Heating off,Semi crowded,1437,7.270313,1
5294,230,3,1,Puebla,0,BRT,0,Windows closed,AC/Heating on,Crowded,1246,7.127694,1
4799,214,16,1,Mexico City,0,BRT,9,Windows opened,AC/Heating off,Not crowded,1029,6.936343,1
3584,191,12,1,Puebla,3,Bus,6,Windows opened,AC/Heating off,Crowded,1081,6.985642,1
1155,70,16,0,Montreal,1,BRT,0,Windows closed,AC/Heating on,Semi crowded,1695,7.435438,1
5515,244,25,1,Puebla,1,Bus,1,Windows closed,AC/Heating on,Crowded,1098,7.001246,1


In [89]:
# Descriptive statistics of `co2` per `mode2` label, restricted to the
# rows kept in `df2` (non-zero `over`).
(
    df2
    .groupby(by=['mode2'])['co2']
    .describe()
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
mode2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
BRT,785.0,1396.588535,632.454568,1001.0,1073.0,1176.0,1418.0,4853.0
Bus,277.0,1183.277978,207.72801,1001.0,1042.0,1119.0,1219.0,2135.0
Subway,100.0,2693.47,1339.249175,1010.0,1216.0,2515.0,4202.25,4349.0


In [90]:
# Descriptive statistics of `co2` per (mode2, city) combination, restricted
# to the rows kept in `df2` (non-zero `over`).
(
    df2
    .groupby(by=['mode2', 'city'])['co2']
    .describe()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
mode2,city,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
BRT,Mexico City,386.0,1570.816062,846.887791,1001.0,1074.25,1190.0,1646.0,4853.0
BRT,Montreal,151.0,1234.298013,197.965007,1003.0,1068.0,1174.0,1376.0,1792.0
BRT,Puebla,248.0,1224.225806,185.82604,1002.0,1074.0,1167.0,1333.25,1821.0
Bus,Montreal,81.0,1249.432099,259.662374,1007.0,1083.0,1180.0,1280.0,2135.0
Bus,Puebla,196.0,1155.938776,175.626487,1001.0,1030.75,1095.5,1199.75,1762.0
Subway,Mexico City,100.0,2693.47,1339.249175,1010.0,1216.0,2515.0,4202.25,4349.0


In [95]:
# Subset of rows whose `mode2` label is anything other than 'Outside'.
df3 = df.loc[df['mode2'].ne('Outside')]

In [96]:
# Spot-check ten random rows of the non-'Outside' subset. The fixed seed keeps
# the displayed sample the same on every run, so the output is reproducible.
df3.sample(10, random_state=0)

Unnamed: 0,trip,obs,country,city,mode,mode2,route,wo,ac,crowd,co2,lnco2,over
377,25,6,0,Montreal,1,BRT,4,Windows closed,AC/Heating on,Not crowded,952,6.858565,0
273,17,14,0,Montreal,1,BRT,3,Windows opened,AC/Heating off,Semi crowded,552,6.313548,0
2488,159,53,1,Puebla,3,Bus,5,Windows opened,AC/Heating off,Not crowded,606,6.40688,0
698,44,14,0,Montreal,1,BRT,0,Windows opened,AC/Heating off,Semi crowded,590,6.380123,0
4172,207,23,1,Mexico City,0,BRT,8,Windows opened,AC/Heating off,Not crowded,627,6.440947,0
3871,201,16,1,Puebla,3,Bus,5,Windows opened,AC/Heating off,Crowded,970,6.877296,0
5318,230,27,1,Puebla,0,BRT,0,Windows closed,AC/Heating on,Semi crowded,779,6.658011,0
4031,204,30,1,Mexico City,0,BRT,7,Windows opened,AC/Heating off,Not crowded,825,6.715383,0
1345,85,4,0,Montreal,1,BRT,0,Windows closed,AC/Heating on,Not crowded,619,6.428105,0
735,46,16,0,Montreal,1,BRT,0,Windows closed,AC/Heating on,Not crowded,773,6.650279,0


In [102]:
# Descriptive statistics of `co2` per (mode2, crowd, city) combination,
# using the non-'Outside' subset `df3`.
(
    df3
    .groupby(by=['mode2', 'crowd', 'city'])['co2']
    .describe()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,mean,std,min,25%,50%,75%,max
mode2,crowd,city,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
BRT,Crowded,Mexico City,219.0,1148.525114,655.423599,540.0,807.5,947.0,1179.0,3656.0
BRT,Crowded,Montreal,102.0,917.127451,306.513052,515.0,669.25,769.0,1169.75,1708.0
BRT,Crowded,Puebla,350.0,1084.991429,250.207667,667.0,881.5,1047.5,1257.75,1821.0
BRT,Not crowded,Mexico City,767.0,881.899609,455.897046,500.0,709.0,779.0,877.0,4853.0
BRT,Not crowded,Montreal,400.0,686.5125,190.555289,464.0,564.75,615.0,746.25,1461.0
BRT,Not crowded,Puebla,336.0,764.369048,90.909439,574.0,697.75,760.0,820.25,1170.0
BRT,Semi crowded,Mexico City,470.0,1100.785106,638.617078,498.0,815.0,938.5,1075.75,4378.0
BRT,Semi crowded,Montreal,511.0,754.956947,248.955713,475.0,598.0,647.0,858.5,1792.0
BRT,Semi crowded,Puebla,202.0,890.628713,146.028164,566.0,775.0,900.0,981.75,1277.0
Bus,Crowded,Montreal,216.0,736.75,93.988136,518.0,673.0,731.0,786.5,996.0


In [101]:
# Descriptive statistics of `co2` per (mode2, ac, city) combination,
# using the non-'Outside' subset `df3`.
(
    df3
    .groupby(by=['mode2', 'ac', 'city'])['co2']
    .describe()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,mean,std,min,25%,50%,75%,max
mode2,ac,city,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
BRT,AC/Heating off,Mexico City,1238.0,1000.647819,531.732502,498.0,745.0,856.5,1043.75,4853.0
BRT,AC/Heating off,Montreal,330.0,714.848485,214.796351,464.0,581.0,627.0,765.0,1461.0
BRT,AC/Heating off,Puebla,578.0,896.276817,214.702776,610.0,752.0,827.0,959.75,1734.0
BRT,AC/Heating on,Mexico City,218.0,947.298165,725.051918,589.0,713.0,773.0,866.5,4241.0
BRT,AC/Heating on,Montreal,683.0,758.469985,255.222036,475.0,590.0,656.0,855.5,1792.0
BRT,AC/Heating on,Puebla,310.0,962.690323,249.630304,566.0,784.5,932.0,1078.75,1821.0
Bus,AC/Heating off,Montreal,91.0,760.978022,175.093434,514.0,626.0,719.0,806.5,1209.0
Bus,AC/Heating off,Puebla,1253.0,843.970471,167.911152,522.0,745.0,813.0,908.0,1762.0
Bus,AC/Heating on,Montreal,726.0,769.254821,206.773165,514.0,648.0,706.0,826.0,2135.0
Bus,AC/Heating on,Puebla,232.0,827.913793,164.30012,564.0,686.0,822.5,956.75,1307.0


In [100]:
# Descriptive statistics of `co2` per (mode2, wo, city) combination,
# using the non-'Outside' subset `df3`.
(
    df3
    .groupby(by=['mode2', 'wo', 'city'])['co2']
    .describe()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,mean,std,min,25%,50%,75%,max
mode2,wo,city,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
BRT,Windows closed,Mexico City,218.0,947.298165,725.051918,589.0,713.0,773.0,866.5,4241.0
BRT,Windows closed,Montreal,785.0,775.850955,258.210145,464.0,596.0,668.0,880.0,1792.0
BRT,Windows closed,Puebla,377.0,960.04244,229.566076,566.0,807.0,934.0,1087.0,1821.0
BRT,Windows opened,Mexico City,1238.0,1000.647819,531.732502,498.0,745.0,856.5,1043.75,4853.0
BRT,Windows opened,Montreal,228.0,635.491228,137.230071,478.0,552.75,610.0,656.5,1092.0
BRT,Windows opened,Puebla,511.0,889.522505,225.132882,610.0,740.5,816.0,943.5,1734.0
Bus,Windows closed,Montreal,740.0,778.890541,209.25393,514.0,652.0,722.0,837.0,2135.0
Bus,Windows closed,Puebla,276.0,883.206522,246.499827,564.0,696.75,841.5,984.25,1762.0
Bus,Windows opened,Montreal,77.0,666.87013,83.373817,518.0,613.0,645.0,706.0,990.0
Bus,Windows opened,Puebla,1209.0,831.932175,141.786539,522.0,742.0,810.0,902.0,1639.0
