# Reshaping dos Dados

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Row labels: six consecutive calendar days beginning on 2021-01-01.
datas = pd.date_range('20210101', periods=6)

# 6x4 matrix of standard-normal draws (no seed is set, so values differ per run),
# labelled by day on the rows and by variable name on the columns.
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=datas,
    columns=['Var_A', 'Var_B', 'Var_C', 'Var_D'],
)

In [3]:
# Display the 6x4 frame built above (dates on the rows, Var_A..Var_D on the columns).
df

Unnamed: 0,Var_A,Var_B,Var_C,Var_D
2021-01-01,-0.942293,-0.462112,1.643704,-0.903428
2021-01-02,-0.34955,-1.358019,0.994856,2.278816
2021-01-03,0.624007,-1.262884,0.10261,1.20789
2021-01-04,-1.005808,-0.250023,-0.238532,1.223777
2021-01-05,0.163912,1.339345,-1.628145,-0.677282
2021-01-06,-0.527926,-0.81016,-0.259237,-0.100195


In [4]:
# Transpose the frame: the variables become the rows and the dates become the columns.

dft = df.transpose()

dft

Unnamed: 0,2021-01-01,2021-01-02,2021-01-03,2021-01-04,2021-01-05,2021-01-06
Var_A,-0.942293,-0.34955,0.624007,-1.005808,0.163912,-0.527926
Var_B,-0.462112,-1.358019,-1.262884,-0.250023,1.339345,-0.81016
Var_C,1.643704,0.994856,0.10261,-0.238532,-1.628145,-0.259237
Var_D,-0.903428,2.278816,1.20789,1.223777,-0.677282,-0.100195


In [5]:
# Inspect the dimensions of the original (non-transposed) frame as a (rows, columns) tuple.

df.shape

(6, 4)

In [6]:
# Extract only the numeric values of the transposed frame as a NumPy array
# (row and column labels are not part of the result).

dft.values

array([[-0.94229344, -0.34954981,  0.6240067 , -1.00580801,  0.16391207,
        -0.52792599],
       [-0.46211151, -1.35801892, -1.26288432, -0.25002263,  1.33934459,
        -0.81016009],
       [ 1.64370446,  0.99485629,  0.10260962, -0.23853168, -1.62814479,
        -0.25923746],
       [-0.90342755,  2.27881614,  1.20788991,  1.22377704, -0.6772818 ,
        -0.10019528]])

In [7]:
# Count how many individual elements the frame holds (rows x columns).

dft.values.size

24

In [8]:
# Keep the underlying NumPy array in `v` so it can be reshaped with ndarray.reshape.

v = dft.values

v

array([[-0.94229344, -0.34954981,  0.6240067 , -1.00580801,  0.16391207,
        -0.52792599],
       [-0.46211151, -1.35801892, -1.26288432, -0.25002263,  1.33934459,
        -0.81016009],
       [ 1.64370446,  0.99485629,  0.10260962, -0.23853168, -1.62814479,
        -0.25923746],
       [-0.90342755,  2.27881614,  1.20788991,  1.22377704, -0.6772818 ,
        -0.10019528]])

In [9]:
# Rearrange the same 24 values into 2 rows of 12, reading the original array row by row.
np.reshape(v, (2, 12))

array([[-0.94229344, -0.34954981,  0.6240067 , -1.00580801,  0.16391207,
        -0.52792599, -0.46211151, -1.35801892, -1.26288432, -0.25002263,
         1.33934459, -0.81016009],
       [ 1.64370446,  0.99485629,  0.10260962, -0.23853168, -1.62814479,
        -0.25923746, -0.90342755,  2.27881614,  1.20788991,  1.22377704,
        -0.6772818 , -0.10019528]])

# Função Pivot()

In [10]:
# Twelve consecutive days starting on 2019-01-01 (daily frequency spelled out explicitly).
dias = pd.date_range(start='20190101', periods=12, freq='D')

In [11]:
# Show the generated DatetimeIndex.
dias

DatetimeIndex(['2019-01-01', '2019-01-02', '2019-01-03', '2019-01-04',
               '2019-01-05', '2019-01-06', '2019-01-07', '2019-01-08',
               '2019-01-09', '2019-01-10', '2019-01-11', '2019-01-12'],
              dtype='datetime64[ns]', freq='D')

In [12]:
# Pool of names that the following cells sample from at random.
pessoa = ['George', 'Vitor', 'Lucas']

In [13]:
# Pick one name at random from the list of people
# (no seed is set in the visible cells, so the result varies between runs).

np.random.choice(pessoa)

'Lucas'

In [14]:
# Draw 12 (person, expense) pairs. For each pair the name is sampled first and
# then an amount: a uniform draw scaled by 100 and rounded to two decimals.

sorteios = [
    (np.random.choice(pessoa), np.round(np.random.rand() * 100, 2))
    for _ in range(12)
]

# Split the pairs back into the two parallel lists used by the next cells.
nome = [quem for quem, _ in sorteios]
gasto = [valor for _, valor in sorteios]

nome

['George',
 'George',
 'Vitor',
 'Vitor',
 'Lucas',
 'Vitor',
 'Vitor',
 'George',
 'Lucas',
 'Lucas',
 'Lucas',
 'Vitor']

In [15]:
# Show the generated expense values (each one rounded to two decimals).
gasto

[16.28,
 7.52,
 1.75,
 83.11,
 95.97,
 7.14,
 55.04,
 56.45,
 53.62,
 24.88,
 45.34,
 84.96]

In [16]:
# Combine the three parallel sequences into a single frame: one row per day,
# with the person's name and the amount spent.

df = pd.DataFrame(
    {
        'Dia': dias,
        'Nome': nome,
        "Gasto": gasto,
    }
)

df

Unnamed: 0,Dia,Nome,Gasto
0,2019-01-01,George,16.28
1,2019-01-02,George,7.52
2,2019-01-03,Vitor,1.75
3,2019-01-04,Vitor,83.11
4,2019-01-05,Lucas,95.97
5,2019-01-06,Vitor,7.14
6,2019-01-07,Vitor,55.04
7,2019-01-08,George,56.45
8,2019-01-09,Lucas,53.62
9,2019-01-10,Lucas,24.88


In [17]:
# Pivot to wide format: one row per day, one column per person, expenses as the
# cell values. Day/person combinations with no record come out as NaN.

pd.pivot(df, index='Dia', columns='Nome', values='Gasto')

Nome,George,Lucas,Vitor
Dia,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-01-01,16.28,,
2019-01-02,7.52,,
2019-01-03,,,1.75
2019-01-04,,,83.11
2019-01-05,,95.97,
2019-01-06,,,7.14
2019-01-07,,,55.04
2019-01-08,56.45,,
2019-01-09,,53.62,
2019-01-10,,24.88,


# Função Pivot_Table

In [18]:
# Five car-sale records that all share the same date; 'george' appears twice,
# so the (data, vendedor) combination is not unique.
carros = [7, 4, 3, 2, 8]
dias = pd.date_range(start='20190101', end='20190101', periods=5)
vendedor = ['george', 'wagner', 'pedro', 'vagner', 'george']

df = pd.DataFrame(dict(vendas=carros, data=dias, vendedor=vendedor))

df

Unnamed: 0,vendas,data,vendedor
0,7,2019-01-01,george
1,4,2019-01-01,wagner
2,3,2019-01-01,pedro
3,2,2019-01-01,vagner
4,8,2019-01-01,george


In [19]:
# pivot() cannot reshape when the same (index, columns) pair occurs more than
# once. Here 'george' appears twice for the same 'data' value, so pandas raises
# ValueError. The error is caught and printed so the demonstration stays visible
# without aborting a "Restart & Run All" of the notebook.

try:
    pd.pivot(df, index='data', columns='vendedor', values='vendas')
except ValueError as erro:
    print('ValueError:', erro)

ValueError: Index contains duplicate entries, cannot reshape

In [20]:
# pivot_table copes with duplicated (index, columns) pairs by aggregating them.
# The default aggregation is aggfunc='mean', which can be changed via that argument.

df.pivot_table(index='data', columns='vendedor', values='vendas')

vendedor,george,pedro,vagner,wagner
data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-01-01,7.5,3.0,2.0,4.0


# Stack de Dados

In [21]:
# Load the data from 'nba.csv' (path relative to the working directory); this
# frame is the example used to demonstrate stacking.

df = pd.read_csv('nba.csv')

df

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0
...,...,...,...,...,...,...,...,...,...
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0


In [22]:
# Dimensions of the loaded data as a (rows, columns) tuple.
df.shape

(458, 9)

In [23]:
# Reshape for a different view of the data: stack() moves the column labels into
# an inner index level, producing one long Series indexed by (row, column).

stack_df = df.stack()

stack_df

0    Name         Avery Bradley
     Team        Boston Celtics
     Number                   0
     Position                PG
     Age                     25
                      ...      
456  Age                     26
     Height                 7-0
     Weight                 231
     College             Kansas
     Salary              947276
Length: 4018, dtype: object

In [24]:
# Return to the previous layout: unstack() moves the inner index level
# (the former column labels) back to the columns.

stack_df.unstack()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0,PG,25,6-2,180,Texas,7.73034e+06
1,Jae Crowder,Boston Celtics,99,SF,25,6-6,235,Marquette,6.79612e+06
2,John Holland,Boston Celtics,30,SG,27,6-5,205,Boston University,
3,R.J. Hunter,Boston Celtics,28,SG,22,6-5,185,Georgia State,1.14864e+06
4,Jonas Jerebko,Boston Celtics,8,PF,29,6-10,231,,5e+06
...,...,...,...,...,...,...,...,...,...
452,Trey Lyles,Utah Jazz,41,PF,20,6-10,234,Kentucky,2.2398e+06
453,Shelvin Mack,Utah Jazz,8,PG,26,6-3,203,Butler,2.43333e+06
454,Raul Neto,Utah Jazz,25,PG,24,6-1,179,,900000
455,Tibor Pleiss,Utah Jazz,21,C,26,7-3,256,,2.9e+06


# Função Melt()

In [25]:
# Small example frame for melt(): one label column ('a') and two numeric
# columns ('b', 'c'), each given as a {row label: value} mapping.
df = pd.DataFrame(
    data=dict(
        a={0: 'a', 1: 'b', 2: 'c'},
        b={0: 1, 1: 3, 2: 5},
        c={0: 2, 1: 4, 2: 6},
    )
)

In [26]:
# Show the small example frame used by the melt() cells.
df

Unnamed: 0,a,b,c
0,a,1,2
1,b,3,4
2,c,5,6


In [27]:
# Unpivot: keep 'a' as the identifier and turn column 'b' into (variable, value) rows.
df.melt(id_vars=['a'], value_vars=['b'])

Unnamed: 0,a,variable,value
0,a,b,1
1,b,b,3
2,c,b,5


In [28]:
# Same unpivot with two value columns: the rows for 'b' come first, then the rows for 'c'.
df.melt(id_vars=['a'], value_vars=['b', 'c'])

Unnamed: 0,a,variable,value
0,a,b,1
1,b,b,3
2,c,b,5
3,a,c,2
4,b,c,4
5,c,c,6


In [29]:
# Rename the two generated columns: var_name replaces 'variable' and
# value_name replaces 'value'.

df.melt(
    id_vars=['a'],
    value_vars=['b', 'c'],
    var_name='varteste',
    value_name='nome do valor',
)

Unnamed: 0,a,varteste,nome do valor
0,a,b,1
1,b,b,3
2,c,b,5
3,a,c,2
4,b,c,4
5,c,c,6


In [30]:
# Identifier columns: the city and which kind of temperature reading it is.
data = {
    'localizacao': ['cidadea', 'cidadeb'],
    'temperatura': ['prevista', 'atual'],
}

# Monthly readings, one column per month (wide layout), appended in calendar order.
data.update({
    'set-2019': [30, 32],
    'out-2019': [45, 43],
    'nov-2019': [24, 22],
})

print(data)

{'localizacao': ['cidadea', 'cidadeb'], 'temperatura': ['prevista', 'atual'], 'set-2019': [30, 32], 'out-2019': [45, 43], 'nov-2019': [24, 22]}


In [32]:
# Turn the dictionary into a DataFrame, listing the columns explicitly to fix their order.

df = pd.DataFrame(
    data=data,
    columns=['localizacao', 'temperatura', 'set-2019', 'out-2019', 'nov-2019'],
)

print(df)

  localizacao temperatura  set-2019  out-2019  nov-2019
0     cidadea    prevista        30        45        24
1     cidadeb       atual        32        43        22


In [33]:
# Use melt to reshape the wide monthly columns into long form, which is easier
# to read: every column that is not an identifier becomes a (date, value) row.

df2 = df.melt(
    id_vars=['localizacao', 'temperatura'],
    var_name='date',
    value_name='value',
)

print(df2)

  localizacao temperatura      date  value
0     cidadea    prevista  set-2019     30
1     cidadeb       atual  set-2019     32
2     cidadea    prevista  out-2019     45
3     cidadeb       atual  out-2019     43
4     cidadea    prevista  nov-2019     24
5     cidadeb       atual  nov-2019     22
