In [None]:
# Problem for Covid-19 Data Analysis Project using Python

## Importing Libraries

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
##1 Importing the Dataset

# Read the Covid-19 CSV from its raw GitHub location into `df`,
# the frame the later cells work from.
url = "https://raw.githubusercontent.com/SR1608/Datasets/main/coviddata.csv"
df = pd.read_csv(filepath_or_buffer=url)


In [None]:
###2 High level data Understanding:

### a. Find no. of rows & columns in the dataset
# df.shape is a (rows, columns) tuple.
rows, columns = df.shape
print("Number of rows:", rows)
print("Number of columns:", columns)

### b. Data types of columns
print(df.dtypes)

### c. Info & describe of data in dataframe
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() would add a stray "None".
df.info()
print(df.describe())


In [None]:
##3 Low Level Data Understanding

### a. Count of unique values in the location column
unique_locations = df['location'].nunique()
print("Number of unique locations:", unique_locations)

### b. Continent with the maximum frequency, via value_counts
# value_counts() tallies each continent; idxmax() returns the label of the largest tally.
max_frequency_continent = df['continent'].value_counts().idxmax()
print("Continent with maximum frequency:", max_frequency_continent)

### c. Maximum & mean value of 'total_cases'
total_cases = df['total_cases']
print("Maximum total cases:", total_cases.max())
print("Mean total cases:", total_cases.mean())

### d. 25%, 50% & 75% quartile values of 'total_deaths'
# quantile() with a list returns a Series indexed by the requested fractions.
quartiles_total_deaths = df['total_deaths'].quantile([0.25, 0.5, 0.75])
print("25% quartile value:", quartiles_total_deaths.loc[0.25])
print("50% quartile value:", quartiles_total_deaths.loc[0.5])
print("75% quartile value:", quartiles_total_deaths.loc[0.75])

### e. Find which continent has maximum 'human_development_index'
# Only rows with a known continent can answer "which continent". Without this
# filter, an extreme value sitting on a row whose continent is missing would
# make the lookup report NaN instead of a continent name.
rows_with_continent = df.dropna(subset=['continent'])
max_hdi_continent = rows_with_continent.loc[
    rows_with_continent['human_development_index'].idxmax(), 'continent'
]
print("Continent with maximum human development index:", max_hdi_continent)

### f. Find which continent has minimum 'gdp_per_capita'
# idxmin() gives the row label of the smallest value; .loc then reads that row's continent.
min_gdp_continent = rows_with_continent.loc[
    rows_with_continent['gdp_per_capita'].idxmin(), 'continent'
]
print("Continent with minimum GDP per capita:", min_gdp_continent)


In [None]:
##4 Filter the Dataframe
# Keep only the columns the remaining steps use; .copy() makes df_filtered
# an independent frame so later edits do not touch df.
new_columns = [
    'continent',
    'location',
    'date',
    'total_cases',
    'total_deaths',
    'gdp_per_capita',
    'human_development_index',
]
df_filtered = df.loc[:, new_columns].copy()


In [None]:
##5 Data Cleaning

### a. Remove all duplicate observations
df_filtered = df_filtered.drop_duplicates()

### b. Find missing values in all columns
# Per-column count of missing entries, taken after de-duplication
# and before any rows are dropped or filled.
print(df_filtered.isna().sum())

### c. Remove all observations where continent column value is missing
### d. Fill all missing values with 0
# Rows without a continent are dropped first, then every remaining gap becomes 0.
df_filtered = df_filtered.dropna(subset=['continent']).fillna(0)

In [None]:
##6 Date time format

### a. Convert date column in datetime format using pandas.to_datetime
parsed_dates = pd.to_datetime(df_filtered['date'])

### b. Create new column month after extracting month data from date column
# assign() replaces 'date' with the parsed values and appends 'month' in one step.
df_filtered = df_filtered.assign(date=parsed_dates, month=parsed_dates.dt.month)


In [None]:
##7 Data Aggregation
# One row per continent. max() is taken column by column, so the values in a
# single output row need not come from the same source row.
continent_groups = df_filtered.groupby('continent')
df_groupby = continent_groups.max().reset_index()

In [None]:
##8 Feature Engineering

### a. Create a new feature 'total_deaths_to_total_cases' by ratio of 'total_deaths' column to 'total_cases'
# Element-wise division: each row's total_deaths over its total_cases.
df_groupby['total_deaths_to_total_cases'] = df_groupby['total_deaths'].div(
    df_groupby['total_cases']
)


In [None]:
##9 Data Visualization

### a. Perform Univariate analysis on 'gdp_per_capita' column by plotting a histogram with seaborn
# sns.distplot is deprecated since seaborn 0.11 and slated for removal.
# histplot with a KDE overlay on a density scale is its documented replacement.
sns.histplot(df_groupby['gdp_per_capita'], kde=True, stat='density')
plt.show()

### b. Plot a scatter plot of 'total_cases' & 'gdp_per_capita'
# Explicit figure/axes interface: total cases on x, GDP per capita on y.
fig, ax = plt.subplots()
ax.scatter(df_groupby['total_cases'], df_groupby['gdp_per_capita'])
ax.set_xlabel('Total Cases')
ax.set_ylabel('GDP per Capita')
plt.show()

### c. Plot Pairplot on df_groupby dataset.
# Pairwise relationship grid over the aggregated per-continent frame.
sns.pairplot(data=df_groupby)
plt.show()

### d. Plot a bar plot of 'continent' column with 'total_cases'
# One bar per continent, bar height taken from total_cases.
sns.catplot(
    data=df_groupby,
    x='continent',
    y='total_cases',
    kind='bar',
)
plt.show()


In [None]:
##10 Save the df_groupby dataframe to Local Drive
# Written to the current working directory; index=False leaves the row index out of the file.
df_groupby.to_csv(path_or_buf='df_groupby.csv', index=False)