Before running this notebook, import the vaccination dataset from a terminal with `mongoimport --type csv -d covid_vaccines  -c vaccine_records --headerline --drop us_state_vaccinations.csv`

In [1]:
# Import dependencies
from pymongo import MongoClient
from pprint import pprint
import pandas as pd

In [2]:
# Connect to the MongoDB server running locally on port 27017
mongo = MongoClient(host='localhost', port=27017)

In [3]:
# Verify the import: covid_vaccines should appear among the server's databases
database_names = mongo.list_database_names()
print(database_names)

['admin', 'config', 'covid_vaccines', 'epa', 'fruits_db', 'gardenDB', 'local', 'met', 'petsitly_marketing', 'uk_food']


In [4]:
# Keep a handle to the covid_vaccines database for the cells below
db = mongo.covid_vaccines

In [58]:
# Show which collections the covid_vaccines database contains
collection_names = db.list_collection_names()
print(collection_names)

['vaccine_records']


In [59]:
# Pretty-print a single document to see which fields vaccine_records holds
sample_record = db['vaccine_records'].find_one()
pprint(sample_record)

{'_id': ObjectId('6581016133468c90a2cdef26'),
 'daily_vaccinations': 5906.0,
 'daily_vaccinations_per_million': 1205.0,
 'daily_vaccinations_raw': 5906.0,
 'date': '2021-01-13',
 'distributed_per_hundred': 7.73,
 'location': 'Alabama',
 'people_fully_vaccinated': 9245.0,
 'people_fully_vaccinated_per_hundred': 0.19,
 'people_vaccinated': 74792.0,
 'people_vaccinated_per_hundred': 1.53,
 'share_doses_used': 0.222,
 'total_boosters': '',
 'total_boosters_per_hundred': '',
 'total_distributed': 378975.0,
 'total_vaccinations': 84040.0,
 'total_vaccinations_per_hundred': 1.71}


In [60]:
# Keep a handle to the vaccine_records collection
vaccine_records = db.vaccine_records

In [61]:
# Keep only documents dated 2021-01-01 through 2022-01-01 (both ends inclusive).
# 'date' is stored as a 'YYYY-MM-DD' string, so string comparison follows date order.
date_range = {'$gte': '2021-01-01', '$lte': '2022-01-01'}
query = {'date': date_range}
results = vaccine_records.find(query)

# Report how many documents match the filter
record_count = vaccine_records.count_documents(query)
print('Number of records between Jan 1, 2021 and Jan 1, 2022:', record_count)

# Pretty-print the first matching document
first_record = results[0]
pprint(first_record)

Number of records between Jan 1, 2021 and Jan 1, 2022: 23000
{'_id': ObjectId('6581016133468c90a2cdef26'),
 'daily_vaccinations': 5906.0,
 'daily_vaccinations_per_million': 1205.0,
 'daily_vaccinations_raw': 5906.0,
 'date': '2021-01-13',
 'distributed_per_hundred': 7.73,
 'location': 'Alabama',
 'people_fully_vaccinated': 9245.0,
 'people_fully_vaccinated_per_hundred': 0.19,
 'people_vaccinated': 74792.0,
 'people_vaccinated_per_hundred': 1.53,
 'share_doses_used': 0.222,
 'total_boosters': '',
 'total_boosters_per_hundred': '',
 'total_distributed': 378975.0,
 'total_vaccinations': 84040.0,
 'total_vaccinations_per_hundred': 1.71}


In [62]:
# Build a DataFrame from the query results.
# A pymongo cursor can only be iterated once, so read from an unevaluated clone:
# re-running this cell then yields the full result again instead of an empty frame.
df = pd.DataFrame(results.clone())

# Report the number of rows in the DataFrame
print('Number of rows:', len(df))

# Preview the first 10 rows of the DataFrame
df.head(10)

Number of rows: 23000


Unnamed: 0,_id,date,location,total_vaccinations,total_distributed,people_vaccinated,people_fully_vaccinated_per_hundred,total_vaccinations_per_hundred,people_fully_vaccinated,people_vaccinated_per_hundred,distributed_per_hundred,daily_vaccinations_raw,daily_vaccinations,daily_vaccinations_per_million,share_doses_used,total_boosters,total_boosters_per_hundred
0,6581016133468c90a2cdef26,2021-01-13,Alabama,84040.0,378975.0,74792.0,0.19,1.71,9245.0,1.53,7.73,5906.0,5906.0,1205.0,0.222,,
1,6581016133468c90a2cdef27,2021-01-12,Alabama,78134.0,377025.0,70861.0,0.15,1.59,7270.0,1.45,7.69,,,,0.207,,
2,6581016133468c90a2cdef28,2021-01-14,Alabama,92300.0,435350.0,80480.0,,1.88,,1.64,8.88,8260.0,7083.0,1445.0,0.212,,
3,6581016133468c90a2cdef29,2021-01-15,Alabama,100567.0,444650.0,86956.0,0.28,2.05,13488.0,1.77,9.07,8267.0,7478.0,1525.0,0.226,,
4,6581016133468c90a2cdef2a,2021-01-16,Alabama,,,,,,,,,,7498.0,1529.0,,,
5,6581016133468c90a2cdef2b,2021-01-17,Alabama,,,,,,,,,,7509.0,1531.0,,,
6,6581016133468c90a2cdef2c,2021-01-18,Alabama,,,,,,,,,,7517.0,1533.0,,,
7,6581016133468c90a2cdef2d,2021-01-20,Alabama,139200.0,483275.0,121113.0,0.37,2.84,17956.0,2.47,9.86,8405.0,7880.0,1607.0,0.288,,
8,6581016133468c90a2cdef2e,2021-01-21,Alabama,165919.0,493125.0,144429.0,0.44,3.38,21345.0,2.95,10.06,26719.0,10517.0,2145.0,0.336,,
9,6581016133468c90a2cdef2f,2021-01-19,Alabama,130795.0,444650.0,114319.0,0.33,2.67,16346.0,2.33,9.07,,7523.0,1534.0,0.294,,


In [63]:
# Order the rows by date, earliest first. 'date' holds 'YYYY-MM-DD' strings,
# so a plain string sort already gives chronological order.
vaccine_df = df.sort_values(by='date', ascending=True)
vaccine_df.head(5)

Unnamed: 0,_id,date,location,total_vaccinations,total_distributed,people_vaccinated,people_fully_vaccinated_per_hundred,total_vaccinations_per_hundred,people_fully_vaccinated,people_vaccinated_per_hundred,distributed_per_hundred,daily_vaccinations_raw,daily_vaccinations,daily_vaccinations_per_million,share_doses_used,total_boosters,total_boosters_per_hundred
19441,6581016233468c90a2cea355,2021-01-01,United States,,,,,,,,,,302329.0,911.0,,,
19439,6581016233468c90a2cea353,2021-01-02,United States,4225756.0,,,,1.27,,,,,325882.0,982.0,,,
19440,6581016233468c90a2cea354,2021-01-03,United States,,,,,,,,,,336949.0,1015.0,,,
19442,6581016233468c90a2cea356,2021-01-04,United States,4563260.0,,,,1.37,,,,,348017.0,1048.0,,,
19443,6581016233468c90a2cea357,2021-01-05,United States,4836469.0,,,,1.46,,,,273209.0,339372.0,1022.0,,,


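Import the second dataset (the COVID-19 reported patient impact and hospital capacity by state time series) from a terminal with `mongoimport --type csv -d covid_vaccines_byAge  -c records_byAge --headerline --drop COVID-19_Reported_Patient_Impact_and_Hospital_Capacity_by_State_Timeseries__RAW_.csv`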

In [28]:
# Optional reset: uncomment the two lines below to drop the covid_vaccines_byAge
# database (e.g. before re-importing it with mongoimport) and list what remains.
# Left disabled, presumably so that a full run does not delete the imported data.
# mongo.drop_database('covid_vaccines_byAge')
# mongo.list_database_names()

['admin',
 'config',
 'covid_vaccines',
 'epa',
 'fruits_db',
 'gardenDB',
 'local',
 'met',
 'petsitly_marketing',
 'uk_food']

In [29]:
# Verify the import: covid_vaccines_byAge should appear among the server's databases
database_names = mongo.list_database_names()
print(database_names)

['admin', 'config', 'covid_vaccines', 'covid_vaccines_byAge', 'epa', 'fruits_db', 'gardenDB', 'local', 'met', 'petsitly_marketing', 'uk_food']


In [30]:
# Keep a handle to the covid_vaccines_byAge database for the cells below
db_byAge = mongo.covid_vaccines_byAge

In [31]:
# Show which collections the covid_vaccines_byAge database contains
collection_names_byAge = db_byAge.list_collection_names()
print(collection_names_byAge)

['records_byAge']


In [44]:
# Pretty-print a single document to see which fields records_byAge holds
sample_record_byAge = db_byAge['records_byAge'].find_one()
pprint(sample_record_byAge)

{'_id': ObjectId('658309458e6477eaa756d658'),
 'adult_icu_bed_covid_utilization': 0.086206896551724,
 'adult_icu_bed_covid_utilization_coverage': 15,
 'adult_icu_bed_covid_utilization_denominator': 174,
 'adult_icu_bed_covid_utilization_numerator': 15,
 'adult_icu_bed_utilization': 0.741758241758242,
 'adult_icu_bed_utilization_coverage': 16,
 'adult_icu_bed_utilization_denominator': 182,
 'adult_icu_bed_utilization_numerator': 135,
 'all_pediatric_inpatient_bed_occupied': 337,
 'all_pediatric_inpatient_bed_occupied_coverage': 14,
 'all_pediatric_inpatient_beds': 572,
 'all_pediatric_inpatient_beds_coverage': 14,
 'critical_staffing_shortage_anticipated_within_week_no': 13,
 'critical_staffing_shortage_anticipated_within_week_not_reported': 2,
 'critical_staffing_shortage_anticipated_within_week_yes': 1,
 'critical_staffing_shortage_today_no': 13,
 'critical_staffing_shortage_today_not_reported': 2,
 'critical_staffing_shortage_today_yes': 1,
 'date': '2021/05/09',
 'deaths_covid': 1,


In [37]:
# Keep a handle to the records_byAge collection
records_byAge = db_byAge.records_byAge

In [53]:
# Keep only documents dated 2021/01/01 through 2022/01/01 (both ends inclusive).
# 'date' is stored as a 'YYYY/MM/DD' string here, so the bounds use slashes and
# string comparison follows date order.
date_range_byAge = {'$gte': '2021/01/01', '$lte': '2022/01/01'}
query = {'date': date_range_byAge}
results_byAge = records_byAge.find(query)

# Report how many documents match the filter
record_count_byAge = records_byAge.count_documents(query)
print('Number of records between Jan 1, 2021 and Jan 1, 2022:', record_count_byAge)

# Pretty-print the first matching document
first_record_byAge = results_byAge[0]
pprint(first_record_byAge)

Number of records between Jan 1, 2021 and Jan 1, 2022: 19529
{'_id': ObjectId('658309458e6477eaa756d658'),
 'adult_icu_bed_covid_utilization': 0.086206896551724,
 'adult_icu_bed_covid_utilization_coverage': 15,
 'adult_icu_bed_covid_utilization_denominator': 174,
 'adult_icu_bed_covid_utilization_numerator': 15,
 'adult_icu_bed_utilization': 0.741758241758242,
 'adult_icu_bed_utilization_coverage': 16,
 'adult_icu_bed_utilization_denominator': 182,
 'adult_icu_bed_utilization_numerator': 135,
 'all_pediatric_inpatient_bed_occupied': 337,
 'all_pediatric_inpatient_bed_occupied_coverage': 14,
 'all_pediatric_inpatient_beds': 572,
 'all_pediatric_inpatient_beds_coverage': 14,
 'critical_staffing_shortage_anticipated_within_week_no': 13,
 'critical_staffing_shortage_anticipated_within_week_not_reported': 2,
 'critical_staffing_shortage_anticipated_within_week_yes': 1,
 'critical_staffing_shortage_today_no': 13,
 'critical_staffing_shortage_today_not_reported': 2,
 'critical_staffing_shorta

In [54]:
# Build a DataFrame from the query results.
# A pymongo cursor can only be iterated once, so read from an unevaluated clone:
# re-running this cell then yields the full result again instead of an empty frame.
age_df = pd.DataFrame(results_byAge.clone())

# Report the number of rows in the DataFrame
print('Number of rows:', len(age_df))

# Preview the first 10 rows of the DataFrame
age_df.head(10)

Number of rows: 19529


Unnamed: 0,_id,state,date,critical_staffing_shortage_today_yes,critical_staffing_shortage_today_no,critical_staffing_shortage_today_not_reported,critical_staffing_shortage_anticipated_within_week_yes,critical_staffing_shortage_anticipated_within_week_no,critical_staffing_shortage_anticipated_within_week_not_reported,hospital_onset_covid,...,previous_day_admission_pediatric_covid_confirmed_5_11,previous_day_admission_pediatric_covid_confirmed_5_11_coverage,previous_day_admission_pediatric_covid_confirmed_unknown,previous_day_admission_pediatric_covid_confirmed_unknown_coverage,staffed_icu_pediatric_patients_confirmed_covid,staffed_icu_pediatric_patients_confirmed_covid_coverage,staffed_pediatric_icu_bed_occupancy,staffed_pediatric_icu_bed_occupancy_coverage,total_staffed_pediatric_icu_beds,total_staffed_pediatric_icu_beds_coverage
0,658309458e6477eaa756d658,DE,2021/05/09,1,13,2,1,13,2,1,...,,0,,0,,0,105,14,172,14
1,658309458e6477eaa756d659,NV,2021/05/10,0,1,59,0,1,59,25,...,,0,,0,,0,0,1,0,1
2,658309458e6477eaa756d65a,RI,2021/05/09,5,9,1,5,9,1,19,...,,0,,0,,0,83,14,108,14
3,658309458e6477eaa756d65b,AK,2021/05/07,1,23,0,1,23,0,0,...,,0,,0,,0,58,24,73,24
4,658309458e6477eaa756d65c,CO,2021/05/08,5,87,13,7,85,13,17,...,,0,,0,,0,169,99,402,99
5,658309458e6477eaa756d65d,NE,2021/05/07,9,91,1,15,85,1,0,...,,0,,0,,0,0,12,0,12
6,658309458e6477eaa756d65e,HI,2021/05/07,1,21,4,2,20,4,1,...,,0,,0,,0,1,18,0,18
7,658309458e6477eaa756d65f,NV,2021/05/06,0,4,59,0,4,59,25,...,,0,,0,,0,0,3,0,3
8,658309458e6477eaa756d660,WY,2021/05/07,3,26,2,4,25,2,1,...,,0,,0,,0,0,7,0,7
9,658309458e6477eaa756d661,KY,2021/05/03,1,10,92,4,97,2,3,...,,0,,0,,0,0,11,1,11


In [55]:
# Order the rows by date, earliest first. 'date' holds 'YYYY/MM/DD' strings,
# so a plain string sort already gives chronological order.
df_byAge = age_df.sort_values(by='date', ascending=True)
df_byAge.head(5)

Unnamed: 0,_id,state,date,critical_staffing_shortage_today_yes,critical_staffing_shortage_today_no,critical_staffing_shortage_today_not_reported,critical_staffing_shortage_anticipated_within_week_yes,critical_staffing_shortage_anticipated_within_week_no,critical_staffing_shortage_anticipated_within_week_not_reported,hospital_onset_covid,...,previous_day_admission_pediatric_covid_confirmed_5_11,previous_day_admission_pediatric_covid_confirmed_5_11_coverage,previous_day_admission_pediatric_covid_confirmed_unknown,previous_day_admission_pediatric_covid_confirmed_unknown_coverage,staffed_icu_pediatric_patients_confirmed_covid,staffed_icu_pediatric_patients_confirmed_covid_coverage,staffed_pediatric_icu_bed_occupancy,staffed_pediatric_icu_bed_occupancy_coverage,total_staffed_pediatric_icu_beds,total_staffed_pediatric_icu_beds_coverage
4879,658309468e6477eaa756fa5d,VI,2021/01/01,1,1,0,2,0,0,0,...,,0,,0,,0,,0,,0
5629,658309468e6477eaa756ff9a,NV,2021/01/01,0,5,58,0,5,58,39,...,,0,,0,,0,0.0,5,0.0,5
5689,658309468e6477eaa756fff9,PR,2021/01/01,0,2,65,0,2,65,5,...,,0,,0,,0,2.0,2,15.0,2
1365,658309468e6477eaa756e252,AK,2021/01/01,1,23,0,1,23,0,1,...,,0,,0,0.0,4,50.0,24,66.0,24
2295,658309468e6477eaa756e8bb,MO,2021/01/01,23,98,19,32,89,19,33,...,,0,,0,,0,612.0,136,961.0,136
