# Disciplina de Mineração de Dados

## Introdução ao Pandas

Universidade Federal de Sergipe, Campus Prof. Alberto Carvalho - Itabaiana

Professores:

- Raphael Silva Fontes

- Prof. Dr. Methanias Colaço Rodrigues Júnior

--

Neste módulo da disciplina aprenderemos a usar o Pandas para análise de dados. O Pandas é como uma versão extremamente poderosa do Excel, com muito mais recursos.

Aprenderemos:
* Series
* DataFrames
* Dados ausentes
* GroupBy
* Mesclar, Juntar e Concatenar
* Operações
* Entrada e saída de dados

### Series

In [None]:
import pandas as pd

In [None]:
# Sample inputs for the Series examples: index labels, a plain list of
# numbers, and a label -> value mapping.
labels = list('abc')
values = list(range(1, 4))
dict_ = dict(zip(labels, (10, 20, 30)))

In [None]:
pd.Series(labels)

0    a
1    b
2    c
dtype: object

In [None]:
pd.Series(dict_)

a    10
b    20
c    30
dtype: int64

In [None]:
pd.Series(values, labels)

a    1
b    2
c    3
dtype: int64

In [None]:
pd.Series([sum, print, len])

0      <built-in function sum>
1    <built-in function print>
2      <built-in function len>
dtype: object

In [None]:
# Build the Series from a label -> value mapping; the keys become the index.
serie = pd.Series({'EUA': 1, 'Alemanha': 2, 'USSR': 3, 'Japão': 4})

In [None]:
serie

EUA         1
Alemanha    2
USSR        3
Japão       4
dtype: int64

In [None]:
serie['EUA']

1

In [None]:
serie + serie 

EUA         2
Alemanha    4
USSR        6
Japão       8
dtype: int64

In [None]:
# Second Series built from a label -> value mapping; it has 'Italia' where
# the first one has 'USSR'.
serie2 = pd.Series({'EUA': 1, 'Alemanha': 2, 'Italia': 5, 'Japão': 4})

In [None]:
serie + serie2

Alemanha    4.0
EUA         2.0
Italia      NaN
Japão       8.0
USSR        NaN
dtype: float64

### DataFrame

In [None]:
import numpy as np

In [None]:
from numpy.random import randn
np.random.seed(101)

In [None]:
randn(5,4)

array([[ 2.70684984,  0.62813271,  0.90796945,  0.50382575],
       [ 0.65111795, -0.31931804, -0.84807698,  0.60596535],
       [-2.01816824,  0.74012206,  0.52881349, -0.58900053],
       [ 0.18869531, -0.75887206, -0.93323722,  0.95505651],
       [ 0.19079432,  1.97875732,  2.60596728,  0.68350889]])

In [None]:
# 5x4 frame filled by randn, with rows labelled A-E and columns W-Z.
df = pd.DataFrame(
    data=randn(5, 4),
    index=list('ABCDE'),
    columns=list('WXYZ'),
)

In [None]:
df[['W', 'Y']]

Unnamed: 0,W,Y
A,0.302665,-1.706086
B,-0.134841,0.166905
C,0.807706,0.638787
D,-0.497104,-0.943406
E,-0.116773,0.238127


In [None]:
type(df['W'])

pandas.core.series.Series

In [None]:
df['W']

A    0.302665
B   -0.134841
C    0.807706
D   -0.497104
E   -0.116773
Name: W, dtype: float64

In [None]:
df['nova_coluna'] = df['W'] + df['Y']

In [None]:
# drop() hands back a new DataFrame; reassigning to df is what makes the
# column removal stick.
df = df.drop(['nova_coluna'], axis=1)

In [None]:
df

Unnamed: 0,W,X,Y,Z
A,-0.993263,0.1968,-1.136645,0.000366
B,1.025984,-0.156598,-0.031579,0.649826
C,2.154846,-0.610259,-0.755325,-0.346419
D,0.147027,-0.479448,0.558769,1.02481
E,-0.925874,1.862864,-1.133817,0.610478


In [None]:
df['nova_coluna'] = df['W'] + df['Y']

In [None]:
# inplace=True mutates df directly, so no reassignment is needed here.
df.drop(['nova_coluna'], axis=1, inplace=True)

In [None]:
df

Unnamed: 0,W,X,Y,Z
A,-0.993263,0.1968,-1.136645,0.000366
B,1.025984,-0.156598,-0.031579,0.649826
C,2.154846,-0.610259,-0.755325,-0.346419
D,0.147027,-0.479448,0.558769,1.02481
E,-0.925874,1.862864,-1.133817,0.610478


In [None]:
df.drop(['E'])

Unnamed: 0,W,X,Y,Z
A,-0.993263,0.1968,-1.136645,0.000366
B,1.025984,-0.156598,-0.031579,0.649826
C,2.154846,-0.610259,-0.755325,-0.346419
D,0.147027,-0.479448,0.558769,1.02481


In [None]:
df.loc['A']

W   -0.993263
X    0.196800
Y   -1.136645
Z    0.000366
Name: A, dtype: float64

In [None]:
df.iloc[2]

W    2.154846
X   -0.610259
Y   -0.755325
Z   -0.346419
Name: C, dtype: float64

In [None]:
df.loc['B', 'Y']

-0.031579143908112575

In [None]:
# Select the rows whose Y equals the value stored at row 'B'.
# A pasted float literal only matches while df holds exactly the same random
# draws, so read the target from the frame and compare with a tolerance
# instead of exact float equality.
df.loc[np.isclose(df['Y'], df.loc['B', 'Y'])]

Unnamed: 0,W,X,Y,Z
B,1.025984,-0.156598,-0.031579,0.649826


### Seleção Condicional

In [None]:
df

Unnamed: 0,W,X,Y,Z
A,-0.993263,0.1968,-1.136645,0.000366
B,1.025984,-0.156598,-0.031579,0.649826
C,2.154846,-0.610259,-0.755325,-0.346419
D,0.147027,-0.479448,0.558769,1.02481
E,-0.925874,1.862864,-1.133817,0.610478


In [None]:
df > 0

Unnamed: 0,W,X,Y,Z
A,False,True,False,True
B,True,False,False,True
C,True,False,False,False
D,True,False,True,True
E,False,True,False,True


In [None]:
df[df>0]

Unnamed: 0,W,X,Y,Z
A,,0.1968,,0.000366
B,1.025984,,,0.649826
C,2.154846,,,
D,0.147027,,0.558769,1.02481
E,,1.862864,,0.610478


In [None]:
df[df['Y']>0]

Unnamed: 0,W,X,Y,Z
D,0.147027,-0.479448,0.558769,1.02481


In [None]:
df[(df['W'] > 1) & (df['Z'] > 0.4)]

Unnamed: 0,W,X,Y,Z
B,1.025984,-0.156598,-0.031579,0.649826


In [None]:
df

Unnamed: 0,W,X,Y,Z
A,-0.993263,0.1968,-1.136645,0.000366
B,1.025984,-0.156598,-0.031579,0.649826
C,2.154846,-0.610259,-0.755325,-0.346419
D,0.147027,-0.479448,0.558769,1.02481
E,-0.925874,1.862864,-1.133817,0.610478


In [None]:
df.reset_index()

Unnamed: 0,index,W,X,Y,Z
0,A,-0.993263,0.1968,-1.136645,0.000366
1,B,1.025984,-0.156598,-0.031579,0.649826
2,C,2.154846,-0.610259,-0.755325,-0.346419
3,D,0.147027,-0.479448,0.558769,1.02481
4,E,-0.925874,1.862864,-1.133817,0.610478


In [None]:
# One label per row of df, in row order.
estados = 'SE BA AL CE RN'.split()

In [None]:
df['estados'] = estados

In [None]:
df

Unnamed: 0,W,X,Y,Z,estados
A,-0.993263,0.1968,-1.136645,0.000366,SE
B,1.025984,-0.156598,-0.031579,0.649826,BA
C,2.154846,-0.610259,-0.755325,-0.346419,AL
D,0.147027,-0.479448,0.558769,1.02481,CE
E,-0.925874,1.862864,-1.133817,0.610478,RN


In [None]:
# Promote the 'estados' column to the row index; set_index returns a new
# frame, hence the reassignment.
df = df.set_index('estados')

In [None]:
df

Unnamed: 0_level_0,W,X,Y,Z
estados,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
SE,-0.993263,0.1968,-1.136645,0.000366
BA,1.025984,-0.156598,-0.031579,0.649826
AL,2.154846,-0.610259,-0.755325,-0.346419
CE,0.147027,-0.479448,0.558769,1.02481
RN,-0.925874,1.862864,-1.133817,0.610478


In [None]:
df.loc['SE']

W   -0.993263
X    0.196800
Y   -1.136645
Z    0.000366
Name: SE, dtype: float64

### Dados ausentes

In [None]:
# Small frame with gaps: A lacks its last value, B its last two, C is complete.
df = pd.DataFrame(
    data=dict(
        A=[1, 2, np.nan],
        B=[5, np.nan, np.nan],
        C=[1, 2, 3],
    )
)

In [None]:
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [None]:
df.loc[df['B'].isna()]

Unnamed: 0,A,B,C
1,2.0,,2
2,,,3


In [None]:
df.loc[df['B'].notna()]

Unnamed: 0,A,B,C
0,1.0,5.0,1


In [None]:
df.fillna(value=0)

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,0.0,2
2,0.0,0.0,3


In [None]:
df['A'].fillna(df['A'].mean())

0    1.0
1    2.0
2    1.5
Name: A, dtype: float64

### GroupBy

In [None]:
# Column-oriented records (Empresa, Nome, Venda); the next cell turns this
# mapping into a DataFrame.
df = dict(
    Empresa=['GOOG', 'GOOG', 'MSFT', 'MSFT', 'FB', 'FB', 'FB'],
    Nome=['Sam', 'Charlie', 'Amy', 'Vanessa', 'Carl', 'Sarah', 'Alcymar'],
    Venda=[200, 120, 340, 124, 243, 350, 500],
)

In [None]:
df = pd.DataFrame(df)

In [None]:
df

Unnamed: 0,Empresa,Nome,Venda
0,GOOG,Sam,200
1,GOOG,Charlie,120
2,MSFT,Amy,340
3,MSFT,Vanessa,124
4,FB,Carl,243
5,FB,Sarah,350
6,FB,Alcymar,500


In [None]:
df.groupby('Empresa')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000015DD6C556C8>

In [None]:
df.groupby('Empresa').max()

Unnamed: 0_level_0,Nome,Venda
Empresa,Unnamed: 1_level_1,Unnamed: 2_level_1
FB,Sarah,500
GOOG,Sam,200
MSFT,Vanessa,340


In [None]:
(243 + 350) / 2

296.5

In [None]:
df.groupby('Empresa').count()

Unnamed: 0_level_0,Nome,Venda
Empresa,Unnamed: 1_level_1,Unnamed: 2_level_1
FB,3,3
GOOG,2,2
MSFT,2,2


In [None]:
df.describe()

Unnamed: 0,Venda
count,7.0
mean,268.142857
std,137.611081
min,120.0
25%,162.0
50%,243.0
75%,345.0
max,500.0


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Empresa  7 non-null      object
 1   Nome     6 non-null      object
 2   Venda    7 non-null      int64 
dtypes: int64(1), object(2)
memory usage: 296.0+ bytes


In [None]:
# Average of the numeric columns over the rows whose Nome starts with 'A' or 'S'.
# na=False makes a missing Nome count as "no match" instead of putting NaN
# into the boolean mask.
# numeric_only=True restricts the mean to numeric columns; without it,
# pandas >= 2.0 raises TypeError on the text columns.
df.loc[
    df['Nome'].str.startswith('A', na=False)
    | df['Nome'].str.startswith('S', na=False)
].mean(numeric_only=True)

Venda    347.5
dtype: float64

In [None]:
df.query("Nome == 'Alcymar'")

Unnamed: 0,Empresa,Nome,Venda


In [None]:
df

Unnamed: 0,Empresa,Nome,Venda
0,GOOG,Sam,200
1,GOOG,Charlie,120
2,MSFT,Amy,340
3,MSFT,Vanessa,124
4,FB,Carl,243
5,FB,Sarah,350
6,FB,Acymar,500


### Mesclar, Juntar e Concatenar

In [None]:
# Frame whose cells are "<column><row>" tags (A0..D3), indexed 0-3.
df1 = pd.DataFrame(
    {coluna: [f'{coluna}{i}' for i in range(4)] for coluna in 'ABCD'},
    index=list(range(4)),
)

In [None]:
df1

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3


In [None]:
# Frame with columns E-H holding the A4..D7 tags, indexed 4-7.
df2 = pd.DataFrame(
    {
        coluna: [f'{prefixo}{i}' for i in range(4, 8)]
        for coluna, prefixo in zip('EFGH', 'ABCD')
    },
    index=list(range(4, 8)),
)

In [None]:
df2

Unnamed: 0,A,B,C,D
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7


In [None]:
# Third frame for the concatenation examples, indexed 8-11.
# The first column was keyed 'E' although it holds the A8..A11 values; it is
# keyed 'A' here so the frame has the same A-D columns as df1 and its values
# line up with df1's when the frames are concatenated.
df3 = pd.DataFrame({'A': ['A8', 'A9', 'A10', 'A11'],
                        'B': ['B8', 'B9', 'B10', 'B11'],
                        'C': ['C8', 'C9', 'C10', 'C11'],
                        'D': ['D8', 'D9', 'D10', 'D11']},
                        index=[8, 9, 10, 11])

In [None]:
df3

Unnamed: 0,A,B,C,D
8,A8,B8,C8,D8
9,A9,B9,C9,D9
10,A10,B10,C10,D10
11,A11,B11,C11,D11


#### Concatenação

In [None]:
pd.concat([df2, df1, df3])

Unnamed: 0,A,B,C,D
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
8,A8,B8,C8,D8
9,A9,B9,C9,D9


In [None]:
# Left merge: keep every row of df1 and attach the matching rows of df2.
# df2's columns are E-H, so on='B' raises KeyError (there is no 'B' in df2).
# Its 'F' column is the one holding the B-series values, so match df1['B']
# against df2['F'] explicitly.
pd.merge(df1, df2, how='left', left_on='B', right_on='F')

Unnamed: 0,A_x,B,C_x,D_x,A_y,C_y,D_y
0,A0,B0,C0,D0,,,
1,A1,B1,C1,D1,,,
2,A2,B2,C2,D2,,,
3,A3,B3,C3,D3,,,


In [None]:
df1.join(df2, how='outer')

Unnamed: 0,A,B,C,D,E,F,G,H
0,A0,B0,C0,D0,,,,
1,A1,B1,C1,D1,,,,
2,A2,B2,C2,D2,,,,
3,A3,B3,C3,D3,,,,
4,,,,,A4,B4,C4,D4
5,,,,,A5,B5,C5,D5
6,,,,,A6,B6,C6,D6
7,,,,,A7,B7,C7,D7


### Entrada e Saída de Dados

In [None]:
df = pd.read_csv('Pandas/exemplo.csv')

In [None]:
df = df.loc[df['c'] >= 5]

In [None]:
# index=False leaves the row index out of the written file.
df.to_csv('exemplo.csv', index=False)

In [None]:
df = pd.read_csv('exemplo.csv')

In [None]:
df

Unnamed: 0.1,Unnamed: 0,a,b,c,d
0,1,4,5,6,7
1,2,8,9,10,11
2,3,12,13,14,15


In [None]:
# Example sentences, one per entry.
pessoa = [
    'Hoje fui assaltado',
    'Minha nota da escola foi na média',
    'Meu celular quebrou',
    'Meu cachorro ficou doente',
    'Ah hoje dia estava lindo!!',
]