# Final Project: 7.1 

This is the final project for Statistical Data Science. It covers the following learning objectives:

- Understanding fundamental data science concepts, data acquisition, cleaning, exploration, and visualization. 
- Applying Python programming and its libraries for data manipulation and analysis. 
- Utilizing basic statistical methods for analyzing and interpreting data effectively. 
- Demonstrating proficiency in data preprocessing techniques, including handling missing values and outliers appropriately. 
- Developing skills in data visualization to communicate insights effectively from data exploration. 
- Applying machine learning techniques for predictive modeling and interpreting patterns to communicate findings to diverse audiences. 

**Student Name:** OLIVIA ANDERSON
<br>
**Student Name:** ROY WILLIAMS


In [1]:
# Import Required Libraries
import pandas as pd
import numpy as np
import os
import pandas as pd
from IPython.display import display, HTML
# Lift pandas' column display limit (None = no limit) so wide DataFrames
# are shown with all of their columns instead of being truncated.
pd.set_option('display.max_columns', None)


## Load Dataset

Load the dataset into a pandas DataFrame. Then display 10 rows from the head or tail of the DataFrame by calling the `display` function.

In [2]:
# Locate the raw CSV relative to the notebook's working directory.
data_path = os.path.join('..', '..', 'data', '01_raw', 'Worldwide Vaccine Data.csv')

# Read the file and label the default integer index so it gets a header
# in the rendered table.
df = pd.read_csv(data_path).rename_axis('Index')

# Render the first ten rows as HTML (no column truncation) and wrap the
# table in a horizontally scrollable container for wide frames.
preview_rows = df.head(10)
html = preview_rows.to_html(max_cols=None)
scrollable_markup = f'<div style="overflow-x:auto">{html}</div>'
display(HTML(scrollable_markup))

Unnamed: 0_level_0,Country,Doses administered per 100 people,Total doses administered,% of population vaccinated,% of population fully vaccinated
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Afghanistan,33,12526397,30.0,28.0
1,Albania,106,3025728,47.0,44.0
2,Algeria,35,15267442,18.0,15.0
3,Angola,74,23701049,47.0,26.0
4,Argentina,252,113272665,92.0,84.0
5,Armenia,73,2150112,38.0,33.0
6,Aruba,164,174215,85.0,79.0
7,Australia,251,63634307,88.0,85.0
8,Austria,228,20263306,78.0,77.0
9,Azerbaijan,138,13857111,54.0,49.0


## Understand the Data

Examine the structure, data types, and summary statistics of the dataset.

In [6]:
import plotly.graph_objects as go
import plotly.express as px


# Present the DataFrame's dimensions as a one-row table.
row_count, column_count = df.shape
shape_df = pd.DataFrame({'Rows': [row_count], 'Columns': [column_count]})

shape_table = go.Table(
    header={
        'values': list(shape_df.columns),
        'fill_color': 'paleturquoise',
        'align': 'left',
    },
    cells={
        'values': [shape_df['Rows'], shape_df['Columns']],
        'fill_color': 'lavender',
        'align': 'left',
    },
)
fig_shape = go.Figure(data=[shape_table])
fig_shape.update_layout(title_text='DataFrame Shape')
fig_shape.show()

# Summary statistics, transposed so each described feature is one row and
# the feature name becomes a regular 'Feature' column.
transposed_stats = df.describe().T
describe_df = transposed_stats.reset_index().rename(columns={'index': 'Feature'})

stat_headers = list(describe_df.columns)
summary_table = go.Table(
    header={
        'values': stat_headers,
        'fill_color': 'lightblue',
        'align': 'left',
    },
    cells={
        'values': [describe_df[name] for name in describe_df.columns],
        'fill_color': 'white',
        'align': 'left',
    },
)
fig_desc = go.Figure(data=[summary_table])
fig_desc.update_layout(title_text='Summary Statistics')
fig_desc.show()

# List every column next to the string form of its dtype.
column_labels = df.columns
dtype_labels = df.dtypes.astype(str).values
dtypes_df = pd.DataFrame({'Column': column_labels, 'Data Type': dtype_labels})

dtypes_table = go.Table(
    header={
        'values': list(dtypes_df.columns),
        'fill_color': 'lightgrey',
        'align': 'left',
    },
    cells={
        'values': [dtypes_df['Column'], dtypes_df['Data Type']],
        'fill_color': 'white',
        'align': 'left',
    },
)
fig_cols = go.Figure(data=[dtypes_table])
fig_cols.update_layout(title_text='Columns and Their Data Types')
fig_cols.show()

# Bar chart of how many columns share each data type.
dtype_counts = df.dtypes.value_counts()
# dtype objects are converted to strings so they can serve as axis/colour labels.
dtype_names = dtype_counts.index.astype(str)

fig_dtype = px.bar(
    x=dtype_names,
    y=dtype_counts.values,
    labels={'x': 'Data Type', 'y': 'Count'},
    title='Data Types Count',
    color=dtype_names,
    color_discrete_sequence=px.colors.qualitative.Pastel,
)
# Each bar is already labelled on the x-axis, so the legend is hidden.
fig_dtype.update_layout(showlegend=False)
fig_dtype.show()