#### Introduction
The project focuses on building an ETL (Extract, Transform, Load) pipeline using Python. It extracts data from a CSV file, performs data transformations using Pandas, and prepares it for loading into a PostgreSQL database.

In [82]:
#import libraries
import pandas as pd # For data Extract/transformation/manipulation/wrangling/analysis, etc
import psycopg2 # For Connecting Python to Postgresql database
from sqlalchemy import create_engine # To efficiently manage and reuse the database connections

### Step 1: Extract data from the .csv file into a Pandas Dataframe1

In [83]:
#read csv file
covid_data = pd.read_csv('worldometer_data.csv')
covid_data.head()

Unnamed: 0,Country/Region,Continent,Population,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop,WHO Region
0,USA,North America,331198100.0,5032179,,162804.0,,2576668.0,,2292707.0,18296.0,15194.0,492.0,63139605.0,190640.0,Americas
1,Brazil,South America,212710700.0,2917562,,98644.0,,2047660.0,,771258.0,8318.0,13716.0,464.0,13206188.0,62085.0,Americas
2,India,Asia,1381345000.0,2025409,,41638.0,,1377384.0,,606387.0,8944.0,1466.0,30.0,22149351.0,16035.0,South-EastAsia
3,Russia,Europe,145940900.0,871894,,14606.0,,676357.0,,180931.0,2300.0,5974.0,100.0,29716907.0,203623.0,Europe
4,South Africa,Africa,59381570.0,538184,,9604.0,,387316.0,,141264.0,539.0,9063.0,162.0,3149807.0,53044.0,Africa


### Step 2: Transform the data(i.e clean the data)

In [84]:
# check duplicates (country names)
covid_data.duplicated(subset=['Country/Region']).sum()

np.int64(0)

In [85]:
covid_data.columns

Index(['Country/Region', 'Continent', 'Population', 'TotalCases', 'NewCases',
       'TotalDeaths', 'NewDeaths', 'TotalRecovered', 'NewRecovered',
       'ActiveCases', 'Serious,Critical', 'Tot Cases/1M pop', 'Deaths/1M pop',
       'TotalTests', 'Tests/1M pop', 'WHO Region'],
      dtype='object')

In [86]:
covid_data.shape

(209, 16)

In [87]:
#remove colums
col = ['Tot Cases/1M pop', 'Deaths/1M pop','Tests/1M pop','Serious,Critical','WHO Region']
covid_data = covid_data.drop(columns=col)

In [88]:
covid_data.shape

(209, 11)

##### deal with  missing data

In [89]:
#check missing data
covid_data.isnull().sum()

Country/Region      0
Continent           1
Population          1
TotalCases          0
NewCases          205
TotalDeaths        21
NewDeaths         206
TotalRecovered      4
NewRecovered      206
ActiveCases         4
TotalTests         18
dtype: int64

In [90]:
covid_data.isnull().all()

Country/Region    False
Continent         False
Population        False
TotalCases        False
NewCases          False
TotalDeaths       False
NewDeaths         False
TotalRecovered    False
NewRecovered      False
ActiveCases       False
TotalTests        False
dtype: bool

In [91]:
# Drop Columns with Too Many Missing Values
# remove NewCases ,NewDeaths,NewRecovered
remove_col = ['NewCases','NewDeaths', 'NewRecovered']
covid_data = covid_data.drop(columns=remove_col)

In [92]:
covid_data.columns

Index(['Country/Region', 'Continent', 'Population', 'TotalCases',
       'TotalDeaths', 'TotalRecovered', 'ActiveCases', 'TotalTests'],
      dtype='object')

In [93]:
# fill the values 
# Median is useful when data is skewed or has outliers.
# Mean is suitable when the data is more evenly distributed without extreme outliers.
covid_data['ActiveCases']=covid_data['ActiveCases'].fillna(covid_data['ActiveCases'].median())
covid_data['TotalTests']=covid_data['TotalTests'].fillna(covid_data['TotalTests'].median())
covid_data['TotalRecovered']=covid_data['TotalRecovered'].fillna(covid_data['TotalRecovered'].median())
covid_data['TotalDeaths']=covid_data['TotalDeaths'].fillna(covid_data['TotalDeaths'].mean())

In [94]:
#remove sepesific row
covid_data.dropna(subset=["Population"], inplace=True)


In [95]:
covid_data.isnull().sum()

Country/Region    0
Continent         0
Population        0
TotalCases        0
TotalDeaths       0
TotalRecovered    0
ActiveCases       0
TotalTests        0
dtype: int64

In [99]:
# group by using continent
covid_data=covid_data = covid_data.groupby('Continent', group_keys=False).apply(lambda x: x.sort_values('Country/Region')).reset_index(drop=True)
'''
groupby('Continent', group_keys=False):

groupby('Continent'): Groups the DataFrame by the 'Continent' column. This means the rows are split into different groups, each containing data for a single continent.
group_keys=False: Ensures that the grouping column ('Continent') is not included in the result as part of a multi-level index. It keeps the output cleaner.
apply(lambda x: x.sort_values('Country/Region')):

apply(): Applies a function (in this case, a lambda function) to each group.
lambda x: x.sort_values('Country/Region'): Defines the function that sorts the rows of each group (continent) by the 'Country/Region' column. This ensures that countries are arranged alphabetically within each continent.
reset_index(drop=True):

reset_index(): Resets the index of the resulting DataFrame.
drop=True: Drops the old index and doesn't add it as a new column. This ensures that the final DataFrame has a fresh, sequential index starting from 0.'''

  covid_data=covid_data = covid_data.groupby('Continent', group_keys=False).apply(lambda x: x.sort_values('Country/Region')).reset_index(drop=True)


In [98]:
covid_data

Unnamed: 0,Country/Region,Continent,Population,TotalCases,TotalDeaths,TotalRecovered,ActiveCases,TotalTests
0,Algeria,Africa,43926079.0,33626,1273.0,23238.0,9115.0,135702.0
1,Angola,Africa,32956300.0,1483,64.0,520.0,899.0,64747.0
2,Benin,Africa,12151976.0,1936,38.0,1600.0,298.0,93677.0
3,Botswana,Africa,2356075.0,804,2.0,63.0,739.0,68423.0
4,Burkina Faso,Africa,20954852.0,1158,54.0,961.0,143.0,135702.0
...,...,...,...,...,...,...,...,...
203,Paraguay,South America,7141091.0,6375,66.0,4974.0,1335.0,135277.0
204,Peru,South America,33016319.0,455409,20424.0,310337.0,124648.0,2493429.0
205,Suriname,South America,587154.0,2096,29.0,1446.0,621.0,2785.0
206,Uruguay,South America,3474956.0,1318,37.0,1079.0,202.0,126956.0


In [100]:
#save origial in csv file
covid_data.to_csv("worldometer_data.csv", index=False)

### Step 3: Create a database
  go to PGAdmin 4 and create database tables

### Step 4: Load the clean data into the database

In [101]:
# Database credentials
username = "postgres"
password = "mashi123"
host = "localhost"
port = "5432"
db_name = "Covid19"

In [102]:
# Establish a connection
engine = create_engine(f'postgresql://{username}:{password}@{host}:{port}/{db_name}')
try:
    with engine.connect():
        print("Connection successful!")
except Exception as e:
    print(f"Connection failed: {e}")


Connection successful!


In [None]:
# load the database table - Employee
covid_data.to_sql('worldometer_data', engine, if_exists='replace', index=False)

#close the connection
engine.dispose()

#### Summary
##### Objective: To process and analyze COVID-19 data by implementing an ETL pipeline.
##### Technologies Used:
##### Pandas: For data extraction, cleaning, and transformation.
##### psycopg2 & SQLAlchemy: For establishing database connections and loading data into PostgreSQL.
##### Key Features:
##### Reads and inspects raw data from worldometer_data.csv.
##### Identifies and resolves data issues, such as duplicate entries.
##### Identifies and resolves data issues, such as duplicate entries.
##### Prepares the transformed data for seamless integration with a PostgreSQL database.
##### Outcome: Successfully showcases the complete data pipeline process, integrating Python-based transformations with relational databases to ensure data accuracy and usability.
