In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from IPython.display import Image, display

#Make the graphs a bit prettier
plt.style.use('ggplot')

In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

# How to pandas

In [None]:
display(Image(filename='data/images/cute_panda.jpg'))

## What is pandas and what is it used for?

This tutorial is heavily based on [link1](https://www.datacamp.com/tutorial/pandas). Other sources include: [link2](https://www.edlitera.com/blog/posts/pandas-vs-excel-comparison#mcetoc_1gcticmbab) and [link3](https://www.datacamp.com/cheat-sheet/pandas-cheat-sheet-data-wrangling-in-python).

* Most important Python package for data analysis (100 million downloads per month)
    * cleaning, aggregating and analyzing data
* works with tabular data (rows and columns - much like an Excel sheet)
* example data manipulations:
    * sorting rows, taking subsets, calculating summary statistics (e.g. mean, median etc.), reshaping data frames, joining data frames
* works well with other popular Python data science packages such as 
    * NumPy (for numerical computing)
    * Matplotlib, Seaborn, Plotly (for data visualization)
    * scikit-learn for machine learning
    
### Used for...
* Importing datasets from databases, spreadsheets, comma-separated values (CSV) files, and more
* Cleaning datasets, for example, by dealing with missing values
* Tidying datasets by reshaping their structure into a suitable format for analysis
* Aggregating data by calculating summary statistics such as the mean of columns, correlation between them, and more.
* Visualizing datasets and uncovering insights
* Performing time series analysis and text analysis

## Why is it better than Excel?

* Scalability -  Pandas is only limited by hardware and can manipulate larger quantities of data.
* Speed -  Pandas is much faster than Excel, which is especially noticeable when working with larger quantities of data.
* Automation -  A lot of the tasks that can be achieved with Pandas are extremely easy to automate, reducing the amount of tedious and repetitive tasks that need to be performed daily.
* Interpretability -  It is very easy to interpret what happens when each task is run, and it is relatively easy to find and fix errors.
* Advanced Functions - Performing advanced statistical analysis and creating complex visualizations is very straightforward, and the transition to machine learning analyses is easy.

## Pandas data structures

In [None]:
display(Image(filename='data/images/pandas-structures.png'))

In [None]:
display(Image(filename='data/images/pandas-df.png'))

# Getting started

In [None]:
display(Image(filename='data/images/cute_panda_working.jpg'))

## Installation

`pip install pandas`

`conda install pandas`

In [None]:
import pandas as pd

## Importing data from csv files

In [None]:
df2019 = pd.read_csv("data/happiness/2019.csv")

In [None]:
df2019

## Importing data from Excel files (single sheet)

In [None]:
df2019 = pd.read_excel("data/happiness/2019.xlsx")

In [None]:
df2019

## Importing data from Excel files (multiple sheets)

In [None]:
# Read both sheets with a single call so the workbook is opened and parsed once.
# Passing a list to `sheet_name` returns a dict of DataFrames keyed by sheet name.
happiness_sheets = pd.read_excel(
    "data/happiness/2018_2019.xlsx", sheet_name=["2018", "2019"]
)
df2018 = happiness_sheets["2018"]
df2019 = happiness_sheets["2019"]

## Saving a dataframe to a csv/tsv file

In [None]:
df2018.to_csv("data/happiness/2018.csv", index=False)

## Saving a dataframe to an Excel file

In [None]:
df2018.to_excel("data/happiness/2018.xlsx", index=False)

In [None]:
# # Multiple sheets
# # NOTE(review): this cell is kept commented out. Its target path is the same
# # workbook ("data/happiness/2018_2019.xlsx", sheets "2018" and "2019") that is
# # read earlier in this notebook, so running it overwrites that input file.

# # Create a Pandas Excel writer using XlsxWriter as the engine.
# writer = pd.ExcelWriter("data/happiness/2018_2019.xlsx", engine="xlsxwriter")

# df2018.to_excel(writer, index=False, sheet_name="2018")
# df2019.to_excel(writer, index=False, sheet_name="2019")

# # Close the Pandas Excel writer and output the Excel file.
# # (pd.ExcelWriter can also be used as a context manager,
# # `with pd.ExcelWriter(...) as writer:`, which closes it automatically.)
# writer.close()

## Other file formats

In [None]:
# NOTE(review): no `sep` is passed here, so the .tsv file is parsed with
# read_csv's default comma delimiter — presumably on purpose, to contrast with
# the sep="\t" read further down. Confirm this is the intended demonstration.
df2019 = pd.read_csv("data/happiness/2019.tsv")

In [None]:
#df2019

In [None]:
df2019 = pd.read_csv("data/happiness/2019.tsv", sep="\t")

In [None]:
# Write the frame back out as tab-separated values.
# index=False keeps the row index out of the file, matching the other
# to_csv/to_excel calls in this notebook. Without it the index is written as an
# extra unnamed column, and because this same file is read back with
# pd.read_csv(..., sep="\t") above, each re-run of the notebook would add
# another "Unnamed: 0"-style column to the data.
df2019.to_csv("data/happiness/2019.tsv", sep="\t", index=False)

# Viewing and understanding data frames

In [None]:
display(Image(filename='data/images/cute_panda_looking.jpg'))

In [None]:
df2019.head()

In [None]:
df2019.tail(10)

In [None]:
# df2019[::-1]

## Descriptive statistics

In [None]:
#pd.set_option('display.precision', 2)

In [None]:
df2019.describe()

In [None]:
df2019.describe(include=[float])

In [None]:
df2019.describe().T

In [None]:
df2019.info(show_counts=True, memory_usage=True, verbose=True)

## Dataframe dimensions

In [None]:
df2019.shape

In [None]:
len(df2019)

## Column names

In [None]:
df2019.columns

## Null values

In [None]:
df2019.isnull()

In [None]:
# Get number of null values per column
df2019.isnull().sum()

In [None]:
## Add some null values to the dataframe
# Work on a copy so that df2019 itself is left unchanged.

df2019_withNulls = df2019.copy()
# .loc slices by label and includes both endpoints, so this blanks "Score" for index labels 2 through 6.
df2019_withNulls.loc[2:6, "Score"] = None

In [None]:
df2019_withNulls.isnull().sum()

In [None]:
# Get number of null values in total
df2019_withNulls.isnull().sum().sum()