<a href="https://colab.research.google.com/github/HowardHNguyen/EDA/blob/main/EDA_Aggregating.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Row Iteration

In [1]:
# mount Google Drive at /content/drive; the data files read below live under that mount point
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# import pandas, set the display options, and load the covid and land temperature data
import pandas as pd

# folder on the mounted Drive that holds the csv files; change it here only
DATA_DIR = "/content/drive/MyDrive/_Python/Python-Data-Cleaning/9. Aggregating/data"

pd.set_option('display.width', 60)
pd.set_option('display.max_columns', 7)
pd.set_option('display.max_rows', 200)
# show floats with thousands separators and no decimals
pd.options.display.float_format = '{:,.0f}'.format

# casedate is parsed to datetime so that it sorts and filters as a date
coviddaily = pd.read_csv(f"{DATA_DIR}/coviddaily.csv", parse_dates=["casedate"])
ltbrazil = pd.read_csv(f"{DATA_DIR}/ltbrazil.csv")

In [3]:
# order the covid rows by location, then by case date, both ascending
coviddaily = coviddaily.sort_values(by=['location', 'casedate'], ascending=True)

In [4]:
# walk the rows once with itertuples, emitting one total per location
# (relies on the rows already being sorted by location)
rowlist = []
prevloc, casecnt = 'ZZZ', 0   # 'ZZZ' marks "no location seen yet"
for row in coviddaily.itertuples():
  if row.location != prevloc:
    # location changed: store the finished location (nothing to store on the first row)
    if prevloc != 'ZZZ':
      rowlist.append(dict(location=prevloc, casecnt=casecnt))
    prevloc, casecnt = row.location, 0
  casecnt = casecnt + row.new_cases

# the last location is never followed by a change of group, so store it here
rowlist.append(dict(location=prevloc, casecnt=casecnt))
len(rowlist)
rowlist[0:4]

[{'location': 'Afghanistan', 'casecnt': 231539.0},
 {'location': 'Albania', 'casecnt': 334863.0},
 {'location': 'Algeria', 'casecnt': 272010.0},
 {'location': 'American Samoa', 'casecnt': 8359.0}]

In [7]:
# turn the list of per-location dictionaries into a data frame
covidtotals = pd.DataFrame(data=rowlist)
covidtotals.head(n=6)

Unnamed: 0,location,casecnt
0,Afghanistan,231539
1,Albania,334863
2,Algeria,272010
3,American Samoa,8359
4,Andorra,48015
5,Angola,107084


In [8]:
# order the land temperature rows by station and month, then drop rows without a temperature
ltbrazil = (
  ltbrazil
  .sort_values(['station','month'])
  .dropna(subset=['temperature'])
)

In [9]:
# walk the rows once with itertuples, emitting one summary per station
# (relies on the rows already being sorted by station)
rowlist = []
prevstation, prevtemp = 'ZZZ', 0   # 'ZZZ' marks "no station seen yet"
tempcnt, stationcnt = 0, 0
for row in ltbrazil.itertuples():
  if row.station != prevstation:
    # station changed: store the finished station (nothing to store on the first row)
    if prevstation != 'ZZZ':
      rowlist.append(dict(station=prevstation, avgtemp=tempcnt/stationcnt, stationcnt=stationcnt))
    tempcnt, stationcnt = 0, 0
    prevstation = row.station

  # keep the first reading of a station, and any reading within 3 degrees of the previous row
  tempdiff = abs(row.temperature - prevtemp)
  if stationcnt == 0 or tempdiff <= 3:
    tempcnt = tempcnt + row.temperature
    stationcnt = stationcnt + 1

  # the comparison value moves on even when the row was not counted
  prevtemp = row.temperature

# the last station is never followed by a change of group, so store it here
rowlist.append(dict(station=prevstation, avgtemp=tempcnt/stationcnt, stationcnt=stationcnt))
rowlist[0:5]
ltbrazilavgs = pd.DataFrame(rowlist)
ltbrazilavgs.head()

Unnamed: 0,station,avgtemp,stationcnt
0,ALTAMIRA,28,12
1,ALTA_FLORESTA_AERO,32,9
2,ARAXA,22,7
3,BACABAL,29,6
4,BAGE,20,10


## Numpy Iteration

In [10]:
# import pandas, reload the covid and land temperature data, and set the display options
import pandas as pd

coviddaily = pd.read_csv("/content/drive/MyDrive/_Python/Python-Data-Cleaning/9. Aggregating/data/coviddaily.csv", parse_dates=["casedate"])
ltbrazil = pd.read_csv("/content/drive/MyDrive/_Python/Python-Data-Cleaning/9. Aggregating/data/ltbrazil.csv")

pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 7)
pd.set_option('display.width', 68)
# show floats with thousands separators and no decimals
pd.set_option('display.float_format', '{:,.0f}'.format)

In [11]:
# collect the distinct locations, in order of first appearance, as a plain list
loclist = coviddaily['location'].unique().tolist()

In [12]:
# total the new cases per location by scanning a numpy array of (location, new_cases) rows
casevalues = coviddaily[['location','new_cases']].to_numpy()
rowlist = []
for locitem in loclist:
  # one full pass over the array per location; totals end up in loclist order
  loctotal = sum(rec[1] for rec in casevalues if rec[0] == locitem)
  rowlist.append(loctotal)

len(rowlist)

231

In [13]:
# number of distinct locations in loclist
len(loclist)

231

In [14]:
# first five case totals, in the same order as loclist
rowlist[0:5]

[231539.0, 334863.0, 272010.0, 8359.0, 48015.0]

In [15]:
# pair each location with its total and build a two-column data frame
locpairs = list(zip(loclist, rowlist))
casetotals = pd.DataFrame(locpairs, columns=['location','casetotals'])
casetotals.head()

Unnamed: 0,location,casetotals
0,Afghanistan,231539
1,Albania,334863
2,Algeria,272010
3,American Samoa,8359
4,Andorra,48015


In [16]:
# order the land temperature rows by station and month, then drop rows without a temperature
ltbrazil = ltbrazil.sort_values(['station','month']).dropna(subset=['temperature'])

# walk a numpy array of (station, temperature) rows once, emitting one summary per station
tempvalues = ltbrazil[['station','temperature']].to_numpy()
rowlist = []
prevstation, prevtemp = 'ZZZ', 0   # 'ZZZ' marks "no station seen yet"
tempcnt, stationcnt = 0, 0
for station, temperature in tempvalues:
  if station != prevstation:
    # station changed: store the finished station (nothing to store on the first row)
    if prevstation != 'ZZZ':
      rowlist.append(dict(station=prevstation, avgtemp=tempcnt/stationcnt, stationcnt=stationcnt))
    tempcnt, stationcnt = 0, 0
    prevstation = station

  # keep the first reading of a station, and any reading within 3 degrees of the previous row
  if stationcnt == 0 or abs(temperature - prevtemp) <= 3:
    tempcnt = tempcnt + temperature
    stationcnt = stationcnt + 1

  # the comparison value moves on even when the row was not counted
  prevtemp = temperature

# the last station is never followed by a change of group, so store it here
rowlist.append(dict(station=prevstation, avgtemp=tempcnt/stationcnt, stationcnt=stationcnt))
rowlist[0:5]

[{'station': 'ALTAMIRA', 'avgtemp': 27.729166666666668, 'stationcnt': 12},
 {'station': 'ALTA_FLORESTA_AERO',
  'avgtemp': 32.49333333333333,
  'stationcnt': 9},
 {'station': 'ARAXA', 'avgtemp': 21.52142857142857, 'stationcnt': 7},
 {'station': 'BACABAL', 'avgtemp': 28.59166666666667, 'stationcnt': 6},
 {'station': 'BAGE', 'avgtemp': 19.615000000000002, 'stationcnt': 10}]

In [17]:
# turn the list of per-station dictionaries into a data frame
ltbrazilavgs = pd.DataFrame(data=rowlist)
ltbrazilavgs.head()

Unnamed: 0,station,avgtemp,stationcnt
0,ALTAMIRA,28,12
1,ALTA_FLORESTA_AERO,32,9
2,ARAXA,22,7
3,BACABAL,29,6
4,BAGE,20,10


## Groupby Basics

In [18]:
# import pandas and set the display options; coviddaily is not reloaded in this cell
import pandas as pd
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 7)
pd.set_option('display.width', 68)
# show floats with thousands separators and no decimals
pd.set_option('display.float_format', '{:,.0f}'.format)

In [20]:
# group the covid rows by location; the key is passed as a one-item list
countrytots = coviddaily.groupby(by=['location'])

In [22]:
# first row of each country, limited to the first five countries and columns
countrytots.first().iloc[:5, :5]

Unnamed: 0_level_0,iso_code,casedate,continent,new_cases,new_deaths
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Afghanistan,AFG,2020-03-01,Asia,1,0
Albania,ALB,2020-03-15,Europe,33,1
Algeria,DZA,2020-03-01,Africa,1,0
American Samoa,ASM,2021-09-19,Oceania,1,0
Andorra,AND,2020-03-08,Europe,1,0


In [23]:
# last row of each country, limited to the first five countries and columns
countrytots.last().iloc[:5, :5]

Unnamed: 0_level_0,iso_code,casedate,continent,new_cases,new_deaths
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Afghanistan,AFG,2024-02-04,Asia,210,0
Albania,ALB,2024-01-28,Europe,45,0
Algeria,DZA,2023-12-03,Africa,19,0
American Samoa,ASM,2023-09-17,Oceania,18,0
Andorra,AND,2023-05-07,Europe,41,0


In [24]:
# get all of the rows for a country
# countrytots was grouped by a one-item list, so the group key is a one-item
# tuple; ('Zimbabwe') without the trailing comma is just the string 'Zimbabwe'
# and makes pandas emit a FutureWarning asking for (name,)
countrytots.get_group(('Zimbabwe',)).iloc[0:5, 0:5]

  countrytots.get_group(('Zimbabwe')).iloc[0:5, 0:5]


Unnamed: 0,iso_code,casedate,location,continent,new_cases
36305,ZWE,2020-03-22,Zimbabwe,Africa,2
36306,ZWE,2020-03-29,Zimbabwe,Africa,5
36307,ZWE,2020-04-05,Zimbabwe,Africa,2
36308,ZWE,2020-04-12,Zimbabwe,Africa,7
36309,ZWE,2020-04-19,Zimbabwe,Africa,10


In [25]:
# iterate over (key, rows) pairs and print the first rows of two selected countries
wanted = ['Malta','Kuwait']
for name, group in countrytots:
  # each key is a tuple, so compare its first element
  if name[0] not in wanted:
    continue
  print(group.iloc[:5, :5])

      iso_code   casedate location continent  new_cases
17818      KWT 2020-03-01   Kuwait      Asia         45
17819      KWT 2020-03-08   Kuwait      Asia         16
17820      KWT 2020-03-15   Kuwait      Asia         43
17821      KWT 2020-03-22   Kuwait      Asia         72
17822      KWT 2020-03-29   Kuwait      Asia         59
      iso_code   casedate location continent  new_cases
20621      MLT 2020-03-08    Malta    Europe          3
20622      MLT 2020-03-15    Malta    Europe         28
20623      MLT 2020-03-22    Malta    Europe         78
20624      MLT 2020-03-29    Malta    Europe         50
20625      MLT 2020-04-05    Malta    Europe         79


In [26]:
# show the number of rows in each country's group
countrytots.size()

Unnamed: 0_level_0,0
location,Unnamed: 1_level_1
Afghanistan,205
Albania,175
Algeria,189
American Samoa,58
Andorra,158
...,...
Vietnam,192
Wallis and Futuna,23
Yemen,122
Zambia,173


In [27]:
# summary statistics of new cases for the first three countries, one column per country
countrytots['new_cases'].describe().head(3).transpose()

location,Afghanistan,Albania,Algeria
count,205,175,189
mean,1129,1914,1439
std,1957,2637,2205
min,1,20,1
25%,242,113,30
50%,432,522,723
75%,1106,3280,1754
max,12314,15405,14774


In [28]:
# total new cases by country
countrytots['new_cases'].sum().head()

Unnamed: 0_level_0,new_cases
location,Unnamed: 1_level_1
Afghanistan,231539
Albania,334863
Algeria,272010
American Samoa,8359
Andorra,48015


## Groupby More

In [29]:
# import pandas, set the display options, and load the nls97 data from csv
import pandas as pd
pd.set_option('display.width', 53)
pd.set_option('display.max_columns', 9)
pd.set_option('display.max_rows', 30)
# show floats with thousands separators and no decimals
pd.set_option('display.float_format', '{:,.0f}'.format)
# personid becomes the index of the loaded frame
nls97 = pd.read_csv("/content/drive/MyDrive/_Python/Python-Data-Cleaning/9. Aggregating/data/nls97g.csv", low_memory=False).set_index("personid")

In [32]:
# column names, non-null counts, and dtypes for the first twelve columns
nls97.iloc[:, :12].info()

<class 'pandas.core.frame.DataFrame'>
Index: 8984 entries, 135335 to 713757
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   gender                 8984 non-null   object 
 1   birthmonth             8984 non-null   int64  
 2   birthyear              8984 non-null   int64  
 3   sampletype             8984 non-null   object 
 4   ethnicity              8984 non-null   object 
 5   highestgradecompleted  6663 non-null   float64
 6   maritalstatus          6675 non-null   object 
 7   childathome            4791 non-null   float64
 8   childnotathome         4791 non-null   float64
 9   weeklyhrscomputer      5792 non-null   object 
 10  weeklyhrstv            6711 non-null   object 
 11  nightlyhrssleep        6706 non-null   float64
dtypes: float64(4), int64(2), object(6)
memory usage: 912.4+ KB


In [33]:
# frequency of each category, ordered by category label, for a few categorical columns
catvars = ['gender','maritalstatus','highestdegree']

for col in catvars:
  counts = nls97[col].value_counts().sort_index()
  print(counts, sep="\n\n", end="\n\n\n")

gender
Female    4385
Male      4599
Name: count, dtype: int64


maritalstatus
Divorced          669
Married          3068
Never-married    2767
Separated         148
Widowed            23
Name: count, dtype: int64


highestdegree
0. None             877
1. GED             1167
2. High School     3531
3. Associates       766
4. Bachelors       1713
5. Masters          704
6. PhD               64
7. Professional     130
Name: count, dtype: int64




In [34]:
# descriptive statistics for a few continuous columns
contvars = ['satmath','satverbal','weeksworked06','gpaoverall','childathome']

nls97.loc[:, contvars].describe()

Unnamed: 0,satmath,satverbal,weeksworked06,gpaoverall,childathome
count,1407,1406,8419,6004,4791
mean,501,500,38,282,2
std,115,112,19,62,1
min,7,14,0,10,0
25%,430,430,27,243,1
50%,500,500,51,286,2
75%,580,570,52,326,3
max,800,800,52,417,9


In [35]:
# mean sat math score for each gender
nls97.groupby('gender').satmath.mean()

Unnamed: 0_level_0,satmath
gender,Unnamed: 1_level_1
Female,487
Male,517


In [36]:
# mean sat math score for each combination of gender and highest degree earned
bygenderdegree = nls97.groupby(['gender','highestdegree'])
bygenderdegree['satmath'].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,satmath
gender,highestdegree,Unnamed: 2_level_1
Female,0. None,414
Female,1. GED,405
Female,2. High School,426
Female,3. Associates,448
Female,4. Bachelors,503
Female,5. Masters,504
Female,6. PhD,569
Female,7. Professional,593
Male,0. None,545
Male,1. GED,320


In [37]:
# mean sat math and verbal scores for each combination of gender and highest degree earned
bygenderdegree = nls97.groupby(['gender','highestdegree'])
bygenderdegree[['satmath','satverbal']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,satmath,satverbal
gender,highestdegree,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,0. None,414,408
Female,1. GED,405,390
Female,2. High School,426,440
Female,3. Associates,448,453
Female,4. Bachelors,503,508
Female,5. Masters,504,529
Female,6. PhD,569,561
Female,7. Professional,593,584
Male,0. None,545,515
Male,1. GED,320,360


In [38]:
# count, mean, max, and standard deviation of overall gpa by gender and highest degree earned
bygenderdegree = nls97.groupby(['gender','highestdegree'])
bygenderdegree['gpaoverall'].agg(['count','mean','max','std'])

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,max,std
gender,highestdegree,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Female,0. None,134,243,400,66
Female,1. GED,231,230,391,66
Female,2. High School,1152,277,402,53
Female,3. Associates,294,291,400,50
Female,4. Bachelors,742,322,407,48
Female,5. Masters,364,329,417,43
Female,6. PhD,26,345,400,44
Female,7. Professional,55,353,411,41
Male,0. None,180,222,400,65
Male,1. GED,346,223,380,63


In [42]:
# map each column to its list of aggregations, then apply them by highest degree
# floats are shown with one decimal from here on
pd.set_option('display.float_format', '{:,.1f}'.format)
aggdict = {col: ['count', 'mean', 'max', 'std'] for col in ['weeksworked06', 'childathome']}
nls97.groupby(['highestdegree']).agg(aggdict)

Unnamed: 0_level_0,weeksworked06,weeksworked06,weeksworked06,weeksworked06,childathome,childathome,childathome,childathome
Unnamed: 0_level_1,count,mean,max,std,count,mean,max,std
highestdegree,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0. None,666,29.7,52.0,21.6,408,1.8,8.0,1.6
1. GED,1129,32.9,52.0,20.7,702,1.7,9.0,1.5
2. High School,3262,39.4,52.0,18.6,1881,1.9,7.0,1.3
3. Associates,755,40.2,52.0,18.0,448,1.9,6.0,1.1
4. Bachelors,1683,42.3,52.0,16.2,859,1.9,8.0,1.1
5. Masters,703,41.8,52.0,16.6,379,1.9,6.0,0.9
6. PhD,63,38.5,52.0,18.4,33,1.9,3.0,0.8
7. Professional,127,27.8,52.0,20.4,60,1.8,4.0,0.8


In [40]:
# apply the aggregations in aggdict by marital status
nls97.groupby(['maritalstatus']).agg(aggdict)

Unnamed: 0_level_0,weeksworked06,weeksworked06,weeksworked06,weeksworked06,childathome,childathome,childathome,childathome
Unnamed: 0_level_1,count,mean,max,std,count,mean,max,std
maritalstatus,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Divorced,666,37.5,52.0,19.0,530,1.5,5.0,1.2
Married,3035,40.3,52.0,17.9,2565,2.1,8.0,1.1
Never-married,2735,37.2,52.0,19.1,1501,1.6,9.0,1.3
Separated,147,33.6,52.0,20.3,132,1.5,8.0,1.4
Widowed,23,37.1,52.0,19.3,18,1.8,5.0,1.4


## Groupby UDF

In [43]:
# import pandas, set the display options, and load the nls data from csv
import pandas as pd
pd.set_option('display.width', 53)
pd.set_option('display.max_columns', 7)
pd.set_option('display.max_rows', 200)
# show floats with thousands separators and one decimal
pd.set_option('display.float_format', '{:,.1f}'.format)
# personid becomes the index of the loaded frame
nls97 = pd.read_csv("/content/drive/MyDrive/_Python/Python-Data-Cleaning/9. Aggregating/data/nls97g.csv", low_memory=False).set_index("personid")

In [44]:
# create a function for calculating interquartile range
def iqr(x):
  """Return the interquartile range of x: its 0.75 quantile minus its 0.25 quantile."""
  lower = x.quantile(0.25)
  upper = x.quantile(0.75)
  return upper - lower

# aggregate by highest degree, passing the iqr function next to the built-in aggregations
aggdict = {
  'weeksworked06': ['count', 'mean', iqr],
  'childathome': ['count', 'mean', iqr],
}
nls97.groupby(['highestdegree']).agg(aggdict)

Unnamed: 0_level_0,weeksworked06,weeksworked06,weeksworked06,childathome,childathome,childathome
Unnamed: 0_level_1,count,mean,iqr,count,mean,iqr
highestdegree,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
0. None,666,29.7,47.0,408,1.8,3.0
1. GED,1129,32.9,40.0,702,1.7,3.0
2. High School,3262,39.4,21.0,1881,1.9,2.0
3. Associates,755,40.2,19.0,448,1.9,2.0
4. Bachelors,1683,42.3,13.5,859,1.9,1.0
5. Masters,703,41.8,13.5,379,1.9,1.0
6. PhD,63,38.5,22.0,33,1.9,2.0
7. Professional,127,27.8,43.0,60,1.8,1.0


In [45]:
# define a function to return the summary statistics as a dictionary
def gettots(x):
  """Return a dict with the first quartile, median, third quartile, and non-null count of x.

  The keys are 'qr1', 'med', 'qr3', and 'count', in that order.
  """
  return {
    'qr1': x.quantile(0.25),
    'med': x.median(),
    'qr3': x.quantile(0.75),
    'count': x.count(),
  }

In [46]:
# run gettots on the weeks worked values of each highest-degree group
# floats are shown without decimals from here on
pd.set_option('display.float_format', '{:,.0f}'.format)
weeksbydegree = nls97.groupby(['highestdegree'])['weeksworked06']
weeksbydegree.apply(gettots)

Unnamed: 0_level_0,Unnamed: 1_level_0,weeksworked06
highestdegree,Unnamed: 1_level_1,Unnamed: 2_level_1
0. None,qr1,5
0. None,med,35
0. None,qr3,52
0. None,count,666
1. GED,qr1,12
1. GED,med,42
1. GED,qr3,52
1. GED,count,1129
2. High School,qr1,31
2. High School,med,52


In [47]:
# same apply, followed by reset_index so the group and statistic labels become ordinary columns
(
  nls97.groupby(['highestdegree'])['weeksworked06']
  .apply(gettots)
  .reset_index()
)

Unnamed: 0,highestdegree,level_1,weeksworked06
0,0. None,qr1,5
1,0. None,med,35
2,0. None,qr3,52
3,0. None,count,666
4,1. GED,qr1,12
5,1. GED,med,42
6,1. GED,qr3,52
7,1. GED,count,1129
8,2. High School,qr1,31
9,2. High School,med,52


In [49]:
# keep highestdegree as the index and unstack the statistic labels into columns
weeksbydegree = nls97.groupby(['highestdegree'])['weeksworked06']
nlssums = weeksbydegree.apply(gettots).unstack()
nlssums

Unnamed: 0_level_0,qr1,med,qr3,count
highestdegree,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0. None,5,35,52,666
1. GED,12,42,52,1129
2. High School,31,52,52,3262
3. Associates,33,52,52,755
4. Bachelors,38,52,52,1683
5. Masters,38,52,52,703
6. PhD,30,50,52,63
7. Professional,6,30,49,127


In [50]:
# check the index, columns, and dtypes of the unstacked summary
nlssums.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8 entries, 0. None to 7. Professional
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   qr1     8 non-null      float64
 1   med     8 non-null      float64
 2   qr3     8 non-null      float64
 3   count   8 non-null      float64
dtypes: float64(4)
memory usage: 320.0+ bytes


## Groupby to dataframe

In [51]:
# import pandas and set the display options; the covid and land temperature
# data are not reloaded in this cell
import pandas as pd
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 6)
pd.set_option('display.width', 62)
# show floats with thousands separators and no decimals
pd.set_option('display.float_format', '{:,.0f}'.format)

In [53]:
# look at ten randomly chosen location/date rows (fixed seed, so the sample is repeatable)
(
  coviddaily[['location','casedate','new_cases','new_deaths']]
  .set_index(['location','casedate'])
  .sample(10, random_state=1)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,new_cases,new_deaths
location,casedate,Unnamed: 2_level_1,Unnamed: 3_level_1
Andorra,2020-03-15,1,0
Portugal,2022-12-04,3963,69
Eswatini,2022-08-07,22,2
Singapore,2020-08-30,451,0
Georgia,2020-08-02,46,1
British Virgin Islands,2020-08-30,14,0
Thailand,2023-01-29,472,29
Bolivia,2023-12-17,280,0
Montenegro,2021-08-15,2560,9
Eswatini,2022-04-17,132,0


In [55]:
# collapse one row per country per date into one row per date, summing cases and deaths
# only dates from 2023-02-01 through 2024-01-31 are kept
inperiod = coviddaily.casedate.between('2023-02-01','2024-01-31')
coviddailytotals = (
  coviddaily.loc[inperiod]
  .groupby(['casedate'], as_index=False)[['new_cases','new_deaths']]
  .sum()
)

coviddailytotals.head(10)

Unnamed: 0,casedate,new_cases,new_deaths
0,2023-02-05,1385583,69679
1,2023-02-12,1247389,10105
2,2023-02-19,1145666,8539
3,2023-02-26,1072712,7771
4,2023-03-05,1028278,7001
5,2023-03-12,894678,6340
6,2023-03-19,879074,6623
7,2023-03-26,833043,6711
8,2023-04-02,799453,5969
9,2023-04-09,701000,5538


In [57]:
# look at the first two rows of the land temperature data, transposed so each column is a row
ltbrazil.head(2).transpose()

Unnamed: 0,4,88
locationid,BR000352000,BR000352000
year,2023,2023
month,1,2
temperature,26,26
latitude,-3,-3
longitude,-52,-52
elevation,112,112
station,ALTAMIRA,ALTAMIRA
countryid,BR,BR
country,Brazil,Brazil


In [61]:
# create a data frame with average temperatures from each station in Brazil
ltbrazil = ltbrazil.dropna(subset=['temperature'])

# latabs is aggregated below but no visible cell creates it, so derive it here
# when it is missing rather than depend on leftover kernel state
# NOTE(review): presumably latabs is the absolute latitude -- the displayed
# values match abs(latitude) -- confirm against the data dictionary
if 'latabs' not in ltbrazil.columns:
  ltbrazil = ltbrazil.assign(latabs=ltbrazil['latitude'].abs())

# one row per station: first latabs and elevation, mean temperature
ltbrazilavgs = ltbrazil.groupby(['station'],as_index=False).agg({'latabs':'first','elevation':'first','temperature':'mean'})
ltbrazilavgs.head(10)

Unnamed: 0,station,latabs,elevation,temperature
0,ALTAMIRA,3,112,28
1,ALTA_FLORESTA_AERO,10,289,32
2,ARAXA,20,1004,22
3,BACABAL,4,25,29
4,BAGE,31,242,20
5,BARRA_DO_CORDA,6,153,28
6,BARREIRAS,12,439,27
7,BARTOLOMEU_LISANDRO,22,17,26
8,BAURU,22,617,25
9,BELEM,1,10,28


## Pivottable to dataframe

In [62]:
# import pandas and set the display options; the covid and land temperature
# data are not reloaded in this cell
import pandas as pd
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 7)
pd.set_option('display.width', 72)
# show floats with thousands separators and no decimals
pd.set_option('display.float_format', '{:,.0f}'.format)

In [63]:
# sum cases and deaths per date with pivot_table; casedate becomes the index
# only dates from 2023-02-01 through 2024-01-31 are kept
inperiod = coviddaily.casedate.between('2023-02-01','2024-01-31')
coviddailytotals = pd.pivot_table(
  coviddaily.loc[inperiod],
  index='casedate',
  values=['new_cases','new_deaths'],
  aggfunc='sum',
)

coviddailytotals.head(10)

Unnamed: 0_level_0,new_cases,new_deaths
casedate,Unnamed: 1_level_1,Unnamed: 2_level_1
2023-02-05,1385583,69679
2023-02-12,1247389,10105
2023-02-19,1145666,8539
2023-02-26,1072712,7771
2023-03-05,1028278,7001
2023-03-12,894678,6340
2023-03-19,879074,6623
2023-03-26,833043,6711
2023-04-02,799453,5969
2023-04-09,701000,5538


In [64]:
# create a data frame with average temperatures from each station in Brazil
ltbrazil = ltbrazil.dropna(subset=['temperature'])

# latabs is aggregated below but no visible cell creates it, so derive it here
# when it is missing rather than depend on leftover kernel state
# NOTE(review): presumably latabs is the absolute latitude -- the displayed
# values match abs(latitude) -- confirm against the data dictionary
if 'latabs' not in ltbrazil.columns:
  ltbrazil = ltbrazil.assign(latabs=ltbrazil['latitude'].abs())

# one row per station (station becomes the index): first latabs and elevation, mean temperature
ltbrazilavgs = pd.pivot_table(ltbrazil, index=['station'],
  aggfunc={'latabs':'first','elevation':'first','temperature':'mean'})

ltbrazilavgs.head(10)

Unnamed: 0_level_0,elevation,latabs,temperature
station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ALTAMIRA,112,3,28
ALTA_FLORESTA_AERO,289,10,32
ARAXA,1004,20,22
BACABAL,25,4,29
BAGE,242,31,20
BARRA_DO_CORDA,153,6,28
BARREIRAS,439,12,27
BARTOLOMEU_LISANDRO,17,22,26
BAURU,617,22,25
BELEM,10,1,28
