# COVID Vaccine Data Cleaning

MongoDB Compass was used to import `covid_vac.csv` into MongoDB and to establish the connection; this notebook reads the resulting `covid_vac` collection.

## Notebook Set Up

In [1]:
# Import dependencies
from pymongo import MongoClient
from pprint import pprint
import pandas as pd

In [2]:
# Open a client connection to the local MongoDB server on its default port
mongo = MongoClient(host='localhost', port=27017)

In [3]:
# List every database on the server to confirm that "admin" is present
database_names = mongo.list_database_names()
print(database_names)

['admin', 'autosaurus', 'classDB', 'config', 'fruits_db', 'local', 'met', 'travel_db', 'uk_food']


In [5]:
# Keep a handle to the "admin" database, which holds the imported collection
db = mongo.get_database('admin')

In [6]:
# Show which collections (tables) exist in the selected database
collection_names = db.list_collection_names()
print(collection_names)

['covid_vac', 'system.version']


In [7]:
# Keep a handle to the 'covid_vac' collection for the queries that follow
covid_vac = db.get_collection('covid_vac')

In [26]:
# Pretty-print one document from the 'covid_vac' collection to see its
# key:value pairs. The query filters on a specific date because not every
# document carries every key.
sample_query = {"date": "2021-06-27"}
pprint(covid_vac.find_one(sample_query))

{'_id': ObjectId('646684e5c50e0eddf0d85b5a'),
 'continent': 'Asia',
 'date': '2021-06-27',
 'gdp_per_capita': 1803.987,
 'location': 'Afghanistan',
 'people_fully_vaccinated': 186260.0,
 'people_fully_vaccinated_per_hundred': 0.45,
 'people_vaccinated': 649434.0,
 'people_vaccinated_per_hundred': 1.58,
 'population': 41128772.0,
 'total_cases': 114220.0,
 'total_cases_per_million': 2777.131}


## Clean The Data

In [48]:
# Retrieve every document from the MongoDB collection.
# find() returns a single-pass cursor: once it has been consumed, building a
# DataFrame from it again silently yields an empty frame. Materialising the
# documents into a list keeps the DataFrame cell below safe to re-run.
results = list(covid_vac.find())

In [46]:
# Build a pandas DataFrame from the retrieved documents (one row per document)
df = pd.DataFrame(data=results)

In [49]:
# Report how many rows were loaded into the DataFrame
row_count = len(df)
print('rows in DataFrame:', row_count)

rows in DataFrame: 309690


In [47]:
# Preview the first 10 rows of the DataFrame
df.head(n=10)

Unnamed: 0,_id,continent,date,gdp_per_capita,location,population,total_cases_per_million,total_cases,people_vaccinated,people_vaccinated_per_hundred,people_fully_vaccinated,people_fully_vaccinated_per_hundred
0,646684e5c50e0eddf0d8593d,Asia,2020-01-03,1803.987,Afghanistan,41128772.0,,,,,,
1,646684e5c50e0eddf0d8593e,Asia,2020-01-04,1803.987,Afghanistan,41128772.0,,,,,,
2,646684e5c50e0eddf0d8593f,Asia,2020-01-05,1803.987,Afghanistan,41128772.0,,,,,,
3,646684e5c50e0eddf0d85940,Asia,2020-01-06,1803.987,Afghanistan,41128772.0,,,,,,
4,646684e5c50e0eddf0d85941,Asia,2020-01-07,1803.987,Afghanistan,41128772.0,,,,,,
5,646684e5c50e0eddf0d85942,Asia,2020-01-08,1803.987,Afghanistan,41128772.0,,,,,,
6,646684e5c50e0eddf0d85943,Asia,2020-01-09,1803.987,Afghanistan,41128772.0,,,,,,
7,646684e5c50e0eddf0d85944,Asia,2020-01-10,1803.987,Afghanistan,41128772.0,,,,,,
8,646684e5c50e0eddf0d85945,Asia,2020-01-11,1803.987,Afghanistan,41128772.0,,,,,,
9,646684e5c50e0eddf0d85946,Asia,2020-01-12,1803.987,Afghanistan,41128772.0,,,,,,


In [50]:
# Examine each column's data type and non-null count
# ('date' is still stored as a plain object column at this point)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 309690 entries, 0 to 309689
Data columns (total 12 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   _id                                  309690 non-null  object 
 1   continent                            294967 non-null  object 
 2   date                                 309690 non-null  object 
 3   gdp_per_capita                       239526 non-null  float64
 4   location                             309690 non-null  object 
 5   population                           309690 non-null  float64
 6   total_cases_per_million              273627 non-null  float64
 7   total_cases                          273627 non-null  float64
 8   people_vaccinated                    71627 non-null   float64
 9   people_vaccinated_per_hundred        71627 non-null   float64
 10  people_fully_vaccinated              69395 non-null   float64
 11  people_fully_

In [51]:
# Convert the 'date' column from strings to datetimes for data analysis
parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates

In [52]:
# Confirm the type change was successful by examining the data types again:
# 'date' should now be reported as datetime64[ns]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 309690 entries, 0 to 309689
Data columns (total 12 columns):
 #   Column                               Non-Null Count   Dtype         
---  ------                               --------------   -----         
 0   _id                                  309690 non-null  object        
 1   continent                            294967 non-null  object        
 2   date                                 309690 non-null  datetime64[ns]
 3   gdp_per_capita                       239526 non-null  float64       
 4   location                             309690 non-null  object        
 5   population                           309690 non-null  float64       
 6   total_cases_per_million              273627 non-null  float64       
 7   total_cases                          273627 non-null  float64       
 8   people_vaccinated                    71627 non-null   float64       
 9   people_vaccinated_per_hundred        71627 non-null   float64       
 

In [58]:
# Remove every row that contains at least one missing value (NaN) and keep the
# result under the name `df`, then re-check the column summary.
# NOTE(review): this keeps only rows where all columns are populated
# (49465 of the 309690 loaded rows) - confirm that is the intended filter.
df = df.dropna(axis=0, how='any')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49465 entries, 494 to 309476
Data columns (total 12 columns):
 #   Column                               Non-Null Count  Dtype         
---  ------                               --------------  -----         
 0   _id                                  49465 non-null  object        
 1   continent                            49465 non-null  object        
 2   date                                 49465 non-null  datetime64[ns]
 3   gdp_per_capita                       49465 non-null  float64       
 4   location                             49465 non-null  object        
 5   population                           49465 non-null  float64       
 6   total_cases_per_million              49465 non-null  float64       
 7   total_cases                          49465 non-null  float64       
 8   people_vaccinated                    49465 non-null  float64       
 9   people_vaccinated_per_hundred        49465 non-null  float64       
 10  people_

In [60]:
# Preview the first 5 rows that remain after dropping missing values
df.head(n=5)


Unnamed: 0,_id,continent,date,gdp_per_capita,location,population,total_cases_per_million,total_cases,people_vaccinated,people_vaccinated_per_hundred,people_fully_vaccinated,people_fully_vaccinated_per_hundred
494,646684e5c50e0eddf0d85b2b,Asia,2021-05-11,1803.987,Afghanistan,41128772.0,1517.259,62403.0,448878.0,1.09,55624.0,0.14
503,646684e5c50e0eddf0d85b34,Asia,2021-05-20,1803.987,Afghanistan,41128772.0,1570.069,64575.0,470341.0,1.14,77560.0,0.19
507,646684e5c50e0eddf0d85b38,Asia,2021-05-24,1803.987,Afghanistan,41128772.0,1611.402,66275.0,476367.0,1.16,96910.0,0.24
509,646684e5c50e0eddf0d85b3a,Asia,2021-05-26,1803.987,Afghanistan,41128772.0,1647.095,67743.0,479372.0,1.17,111082.0,0.27
510,646684e5c50e0eddf0d85b3b,Asia,2021-05-27,1803.987,Afghanistan,41128772.0,1662.243,68366.0,479574.0,1.17,113739.0,0.28


In [62]:
# Export the cleaned data to 'clean_df.csv' without the DataFrame index.
# `clean_df` is a second name for the same DataFrame object as `df`, not a copy.
# NOTE(review): the path is relative to the working directory; the original
# comment mentioned a "Resources" folder - confirm the intended destination.
clean_df = df
clean_df.to_csv(path_or_buf='clean_df.csv', index=False)