<a href="https://colab.research.google.com/github/Kenny08DA/Datasets/blob/main/Statistics_4_%5BUPB%5D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Importar las librerias necesarias
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
# import seaborn as sns

%matplotlib inline

In [None]:
# Apply matplotlib's "ggplot" style sheet to the figures drawn after this cell.
mpl.style.use('ggplot')

In [None]:
# Download the OWID COVID-19 CSV and preview ten randomly chosen rows.
OWID_COVID_CSV_URL = "https://covid.ourworldindata.org/data/owid-covid-data.csv"
df = pd.read_csv(OWID_COVID_CSV_URL)
df.sample(n=10)

In [None]:
# Print a summary of the raw frame: columns, dtypes and non-null counts.
df.info()

In [None]:
# (number of rows, number of columns) of the raw frame.
df.shape

In [None]:
# List every column name available in the raw frame.
df.columns

In [None]:
# Build a new DataFrame holding only the columns of interest.
# The explicit .copy() makes df_covid independent of `df`, so the later
# cells that modify df_covid do not raise SettingWithCopyWarning and never
# touch the raw frame.

df_covid = df[['iso_code', 'continent', 'location', 'date', 'total_cases', 'new_cases',
               'total_deaths', 'new_deaths', 'new_tests', 'total_tests', 'tests_per_case',
               'positive_rate', 'tests_units', 'stringency_index', 'population',
               'population_density', 'median_age', 'aged_65_older', 'aged_70_older',
               'gdp_per_capita', 'extreme_poverty', 'cardiovasc_death_rate',
               'diabetes_prevalence', 'female_smokers', 'male_smokers',
               'handwashing_facilities', 'hospital_beds_per_thousand', 'life_expectancy']].copy()
df_covid.head()

In [None]:
# Identify missing values: a boolean frame shaped like df_covid that is
# True wherever the corresponding cell is null.

missing_data = df_covid.isna()
missing_data.head()

In [None]:
# for column in missing_data.columns.values.tolist():
#   print(column)
#   print(missing_data[column].value_counts())
#   print("")

In [None]:
# Keep only the country rows and ignore the aggregate entries, which are
# the rows whose 'continent' is null.
# Selecting with a boolean mask and rebinding df_covid (instead of an
# in-place drop) avoids mutating a frame derived from `df` and keeps the
# cell safe to re-run.
is_aggregate = df_covid['continent'].isna()
to_drop = df_covid[is_aggregate]
df_covid = df_covid[~is_aggregate].copy()

# Show the rows that were removed.
to_drop

In [None]:
# Basic clean-up of the data:
#   * parse "date" into datetime values (expected format YYYY-MM-DD);
#   * replace nulls with 0 in the daily columns "new_cases", "new_deaths"
#     and "new_tests".
# assign() returns a new frame, so nothing is written into a frame derived
# from `df` (no SettingWithCopyWarning) and the cell can be re-run safely.
daily_columns = ["new_cases", "new_deaths", "new_tests"]
df_covid = df_covid.assign(
    date=pd.to_datetime(df_covid["date"], format="%Y-%m-%d"),
    **{column: df_covid[column].fillna(0) for column in daily_columns},
)


¿Tenemos nulos en `total_cases`, `total_deaths` o `total_tests`?

In [None]:
# Fill the missing cumulative values by rebuilding each total as the
# running sum of its daily series, country by country.
# The per-country frames are collected in a list and concatenated once:
# calling pd.concat inside the loop re-copies every accumulated row on each
# iteration (quadratic cost) and starts from an empty DataFrame.
country_frames = []
# sort=False keeps the countries in order of first appearance in df_covid.
for _, df_country in df_covid.groupby("iso_code", sort=False):
  # The running sums below are only meaningful in chronological order.
  df_country = df_country.sort_values("date").copy()

  df_country["total_cases"] = df_country["new_cases"].cumsum()
  df_country["total_deaths"] = df_country["new_deaths"].cumsum()
  df_country["total_tests"] = df_country["new_tests"].cumsum()

  country_frames.append(df_country)

# pd.concat raises on an empty list, so fall back to an empty frame.
if country_frames:
  df_covid_cleaned = pd.concat(country_frames, ignore_index=True)
else:
  df_covid_cleaned = pd.DataFrame()


In [None]:
# Continue the analysis with an independent copy of the cleaned frame.
df_covid = df_covid_cleaned.copy()

In [None]:
# Look at Bolivia's data: keep the rows whose ISO code is "BOL" and show
# the last ten of them.
is_bolivia = df_covid["iso_code"].eq("BOL")
df_bolivia = df_covid.loc[is_bolivia]
df_bolivia.tail(n=10)

In [None]:
# NOTE(review): consider moving this import to the import cell at the top.
import plotly.express as px

# Interactive line chart of Bolivia's total_cases against date.
px.line(data_frame=df_bolivia, x="date", y="total_cases")

In [None]:
# Look at the South America data: reshape to one column of daily new cases
# per country (ISO code), indexed by date.
in_south_america = df_covid['continent'] == "South America"
df_sa = df_covid.loc[in_south_america, ['date', 'iso_code', 'new_cases']].pivot(
    index="date", columns="iso_code", values="new_cases"
)

# Unstacked area chart: the countries' areas overlap instead of adding up.
ax = df_sa.plot.area(stacked=False, figsize=(20, 10))

ax.set_title('Casos nuevos por pais')
ax.set_ylabel('Casos nuevos')
ax.set_xlabel('Fecha')

plt.show()


Tarea
- Genera el line plot para Ecuador

In [None]:
# Bar chart of Bolivia's total_deaths against date.
px.bar(data_frame=df_bolivia, x="date", y="total_deaths")

In [None]:
# Preview the first rows of the Bolivia subset.
df_bolivia.head()

In [None]:
# Bolivia's daily series (cases, deaths, tests) indexed by date.
bolivia_daily_columns = ['date', 'new_cases', 'new_deaths', 'new_tests']
df_BO = df_bolivia.loc[:, bolivia_daily_columns].set_index('date')
df_BO.head()

In [None]:
# One line per column of df_BO, coloured in column order.
series_colors = ['red', 'darkblue', 'mediumseagreen']
df_BO.plot.line(figsize=(10, 6), color=series_colors)
plt.show()

In [None]:
# For each country, keep the row of its most recent date: sort descending
# so the newest row of every ISO code comes first, then keep only that
# first occurrence.
newest_first = df_covid.sort_values(by=["iso_code", "date"], ascending=False)
df_latest_day = newest_first.drop_duplicates(subset=["iso_code"], keep='first')
df_latest_day

In [None]:
# The 15 countries with the highest total_cases, sorted ascending so the
# largest bar ends up at the top of the horizontal bar chart.
cases_by_country = df_latest_day[["location", "total_cases"]].set_index("location")
top_countries = cases_by_country.sort_values('total_cases', ascending=True).tail(15)
ax = top_countries.plot.barh(figsize=(16, 12), color='steelblue')

ax.set_title('Top 15 de paises con mayor cantidad de casos')
ax.set_xlabel('Numero de infectados')

# Write the count, with thousands separators, at the end of each bar.
for bar_position, case_count in enumerate(top_countries["total_cases"]):
    ax.annotate(f"{int(case_count):,}", xy=(case_count, bar_position - 0.1), color='black')

plt.show()

In [None]:
# Per-continent totals taken from each country's latest row.
# numeric_only=True restricts the sum to numeric columns; without it the
# text columns (iso_code, location, tests_units) are also passed to sum().
# NOTE(review): only additive columns such as total_cases are meaningful
# after summing; per-country indicators (e.g. median_age) are summed too
# and should not be interpreted.
df_latest_day_continent = (
    df_latest_day.drop(columns=['date'])
    .groupby("continent")
    .sum(numeric_only=True)
)

In [None]:
# Display the per-continent aggregate.
df_latest_day_continent

In [None]:
# One colour and one offset per wedge.
# NOTE(review): both lists have six entries, so this assumes the grouped
# frame has exactly six continents - confirm.
colors_list = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue', 'lightgreen', 'pink']
explode_list = [0, 0, 0, 0.1, 0, 0.1]  # offset ratio per wedge; non-zero entries are pulled out

ax = df_latest_day_continent['total_cases'].plot.pie(
    figsize=(15, 6),
    autopct='%1.1f%%',       # percentage text with one decimal
    startangle=90,
    shadow=True,
    labels=None,             # no labels on the wedges; the legend names them
    pctdistance=1.12,        # radial position of the percentage text (beyond 1 is outside the wedge)
    colors=colors_list,      # custom colours
    explode=explode_list,    # offsets defined in explode_list
)

# Raise the title by the same 12% used for pctdistance.
ax.set_title('Porcentaje de infectados por continente', y=1.12)

# Equal aspect ratio so the pie is drawn as a circle.
ax.axis('equal')

ax.legend(labels=df_latest_day_continent.index, loc='upper left')

plt.show()

In [None]:
# Daily new tests for the South American countries.
in_south_america = df_covid['continent'].eq("South America")
df_tests = df_covid.loc[in_south_america, ['date', 'iso_code', 'new_tests']]
df_tests

In [None]:
# Vertical box plot of new_tests, one box per ISO code.
px.box(data_frame=df_tests, x="iso_code", y="new_tests", orientation="v")