# Pandas Tutorial/ Tutorial Pandas

In [6]:
# Importing pandas/importando o pandas

import pandas as pd
import numpy as np

In [3]:
# Check the installed pandas version
pd.__version__

'1.1.3'

In [5]:
# Show the package description (the trailing ? is IPython help syntax, not plain Python)
pd?

Type:        module
String form: <module 'pandas' from '/home/luan/anaconda3/lib/python3.8/site-packages/pandas/__init__.py'>
File:        ~/anaconda3/lib/python3.8/site-packages/pandas/__init__.py
Docstring:
pandas - a powerful data analysis and manipulation library for Python

**pandas** is a Python package providing fast, flexible, and expressive data
structures designed to make working with "relational" or "labeled" data both
easy and intuitive. It aims to be the fundamental high-level building block for
doing practical, **real world** data analysis in Python. Additionally, it has
the broader goal of becoming **the most powerful and flexible open source data
analysis / manipulation tool available in any language**. It is already well on
its way toward this goal.

Main Features
-------------
Here are just a few of the things that pandas does well:

  - Easy handling of missing data in floating point as well as non-floating
    point data

# Introducing Pandas Objects/Introduzindo os objetos do pandas

## Series

In [7]:
# Build a Series holding the integers 1 through 5; no index is passed,
# so pandas labels the entries 0..4.
data = pd.Series(list(range(1, 6)))
data

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [9]:
# Get the underlying values of the Series
print(data.values)

[1 2 3 4 5]


In [13]:
# Get the index of the Series
data.index


RangeIndex(start=0, stop=5, step=1)

In [20]:
# Access a single element of the Series
position = 0
print('Element {} from Series: {}\n'.format(position, data[position]))
# Take a slice of the Series
print('Series Elements:\n')
first_four = data[0:4]
print(first_four)

Element 0 from Series: 1

Series Elements:

0    1
1    2
2    3
3    4
dtype: int64


## Series as a generalization of a NumPy array/Series como uma generalização do array NumPy

In [24]:
# A Series whose entries are labelled with the strings 'a'..'e'
# instead of the default integer index.
data = pd.Series(list(range(5, 30, 5)), index=list('abcde'))
data

a     5
b    10
c    15
d    20
e    25
dtype: int64

In [25]:
# Access a value by its index label
data['a']

5

In [29]:
# Index labels need not be sequential or of a single type; here strings and
# integers are mixed. .loc makes every lookup explicitly label-based: label 2
# sits at position 3, so reading it as a position would give a different value.
data.index = ['a',1,'b',2,'c']
print(data.loc['a'],data.loc[1],data.loc['b'],data.loc[2],data.loc['c'])

5 10 15 20 25


### Series as a specialized dictionary/Series como um dicionário especializado

In [32]:
# State populations keyed by state name, then wrapped in a Series whose
# index is made of the dictionary keys.
population_dict = dict([
    ('California', 38332521),
    ('Texas', 26448193),
    ('New York', 19651127),
    ('Florida', 19552860),
    ('Illinois', 12882135),
])
population = pd.Series(population_dict)

In [40]:
# Look up a single value by its label, as with a dictionary key
population['California']

38332521

In [39]:
# Unlike a dictionary, a Series also supports slicing by label;
# the result shown for this cell includes the end label 'Florida'
population['California':'Florida']

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
dtype: int64

In [44]:
# By default, the index is an integer sequence starting at 0
pd.Series(range(2,7,2))

0    2
1    4
2    6
dtype: int64

In [45]:
# If the data is a single value and several index labels are given,
# the value is repeated for every label
pd.Series(10,index = range(3))

0    10
1    10
2    10
dtype: int64

In [50]:
# As seen before, the data can be a dictionary; the index argument then
# selects which keys are kept and in what order
pd.Series({2:'a',1:'b',3:'c'},index = [3,2])


3    c
2    a
dtype: object

## The Pandas DataFrame Object/ O objeto DataFrame

In [51]:
# State areas keyed by state name
area_dict = dict([
    ('California', 423967),
    ('Texas', 695662),
    ('New York', 141297),
    ('Florida', 170312),
    ('Illinois', 149995),
])

In [52]:
# Wrap the area dictionary in a Series indexed by state name
area = pd.Series(area_dict)

In [53]:
# Display the area Series
area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
dtype: int64

In [57]:
# Combine the two Series into a DataFrame with one column per Series
states = pd.DataFrame(dict(population=population, area=area))
states

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [58]:
# Access the row index of the DataFrame
states.index

Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')

In [59]:
# Access the column labels of the DataFrame
states.columns

Index(['population', 'area'], dtype='object')

### DataFrame as a specialized dictionary/DataFrame como um dicionário especializado

In [62]:
# Selecting a column by its name returns that column as a Series
print('state área: \n')
states['area']

state área: 



California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

### Constructing DataFrame objects

In [63]:
# From a single Series, naming its column through the columns argument
pd.DataFrame(population,columns = ['population'])

Unnamed: 0,population
California,38332521
Texas,26448193
New York,19651127
Florida,19552860
Illinois,12882135


In [65]:
# From a list of dicts: each dict supplies one row

data = [dict(a=k, b=k * 2) for k in range(3)]
pd.DataFrame(data)

Unnamed: 0,a,b
0,0,0
1,1,2
2,2,4


In [67]:
# If some keys are missing from a dictionary, pandas fills the gaps with NaN

pd.DataFrame([{'a':1,'b':2},{'b':3,'c':4}])

Unnamed: 0,a,b,c
0,1.0,2,
1,,3,4.0


In [68]:
# From a dictionary of Series objects: each key becomes a column name
pd.DataFrame({'population': population,
'area': area})

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [69]:
# From a two-dimensional NumPy array, with explicit column and index labels.
# A seeded generator is used so the displayed values are the same on every run.
random_frame = pd.DataFrame(
    np.random.default_rng(42).random((3, 2)),
    columns=['foo', 'bar'],
    index=['a', 'b', 'c'],
)
random_frame

Unnamed: 0,foo,bar
a,0.393785,0.748022
b,0.461537,0.900278
c,0.760372,0.002876


In [72]:
# From a NumPy structured array: field 'A' is an 8-byte integer and
# field 'B' an 8-byte float; all three records start out as zeros.
record_dtype = np.dtype([('A', np.int64), ('B', np.float64)])
A = np.zeros(3, dtype=record_dtype)
A

array([(0, 0.), (0, 0.), (0, 0.)], dtype=[('A', '<i8'), ('B', '<f8')])

In [73]:
# Build a DataFrame from the structured array A; its field names become the columns
pd.DataFrame(A)

Unnamed: 0,A,B
0,0,0.0
1,0,0.0
2,0,0.0


## Pandas Index Object/ O objeto index do pandas

In [75]:
# An Index can be thought of as an immutable array or as an ordered set

In [76]:
# Build an Index directly from a list of integers
ind = pd.Index([2,3,4,5,7,11])

### Index as immutable array

In [79]:
# Elements are read with the same indexing notation as an ordinary list
ind[1]

3

In [81]:
# Slicing is also possible; here a step of -1 returns the Index reversed
ind[::-1]

Int64Index([11, 7, 5, 4, 3, 2], dtype='int64')

In [82]:
# The main difference from a list is that an Index is immutable: item
# assignment raises TypeError. The error is caught and printed so that the
# demonstration does not stop a "Restart & Run All" of the notebook.
try:
    ind[1] = 0
except TypeError as err:
    print(f'{type(err).__name__}: {err}')

TypeError: Index does not support mutable operations

### Index as ordered set/Index como conjunto ordenado

In [97]:
# Set operations between two Index objects.
# The operator spellings (&, |, ^) were deprecated as set operations after
# pandas 1.1 and operate element-wise from pandas 2.0 on, so the explicit
# set methods are used to get the same result on every version.
indA = pd.Index([1,3,5,7,9])
indB = pd.Index([2,3,5,7,11])
ind_intersection = indA.intersection(indB)      # labels present in both
ind_union = indA.union(indB)                    # labels present in either
ind_sym_diff = indA.symmetric_difference(indB)  # labels present in exactly one
print('Intersection: ', ind_intersection)
print('Union: ', ind_union)
print('Symmetric Difference: ', ind_sym_diff)

Intersection:  Int64Index([3, 5, 7], dtype='int64')
Union:  Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')
Symmetric Difference:  Int64Index([1, 2, 9, 11], dtype='int64')


In [103]:
# The same set operations written with the named Index methods
print('Intersection:    ',indA.intersection(indB))
print('Union:   ',indA.union(indB))
print('Symmetric Difference:    ',indA.symmetric_difference(indB))

Intersection:     Int64Index([3, 5, 7], dtype='int64')
Union:    Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')
Symmetric Difference:     Int64Index([1, 2, 9, 11], dtype='int64')
