In [1]:
import pandas as pd
from sklearn import datasets
from tidyframe import gather, spread

+ Data preparation in this tutorial

In [2]:
# Load the iris dataset into a DataFrame with one column per feature.
iris = datasets.load_iris()
df = pd.DataFrame(iris['data'], columns=iris.feature_names)
df['target'] = iris.target
# Binary flag: 1 where target < 1, otherwise 0. The vectorized comparison
# replaces the element-wise map/lambda; int64 matches the dtype pandas
# inferred from the original list of Python ints.
df['target2'] = (df['target'] < 1).astype('int64')
# The first four columns are the measurement columns that the gather
# examples reshape into key-value pairs.
col_gather = df.columns[:4]
print(col_gather)

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')


# Tutorial of **gather** Function

In [3]:
# Work on a five-row sample so the reshaped outputs stay readable.
df_short = df.head(n=5)

In [4]:
# Display the five-row sample used by the gather examples.
df_short

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,target2
0,5.1,3.5,1.4,0.2,0,1
1,4.9,3.0,1.4,0.2,0,1
2,4.7,3.2,1.3,0.2,0,1
3,4.6,3.1,1.5,0.2,0,1
4,5.0,3.6,1.4,0.2,0,1


+ Select the columns to reshape and convert the columns in **col_gather** to key-value pairs; the `index` column created by `reset_index()` identifies each record

In [5]:
# Keep the original row labels as an 'index' column so every key-value row can
# be traced back to its record. The former `.head(8)` was dropped: df_short
# has only five rows, so it never truncated anything.
gather(df_short[col_gather].reset_index(), col_gather).head()

Unnamed: 0,index,key,value
0,0,sepal length (cm),5.1
1,0,sepal width (cm),3.5
2,0,petal length (cm),1.4
3,0,petal width (cm),0.2
4,1,sepal length (cm),4.9


+ Convert all columns to key-value pairs; the `index` column identifies each original record

In [6]:
# Reshape only the four measurement columns; no key columns are passed to
# gather here, so every column of the input becomes key-value pairs.
df_short_gather = gather(df_short[col_gather])
# Preview the first rows of the long-format result.
df_short_gather.head()

Unnamed: 0,index,key,value
0,0,sepal length (cm),5.1
1,0,sepal width (cm),3.5
2,0,petal length (cm),1.4
3,0,petal width (cm),0.2
4,1,sepal length (cm),4.9


+ Convert the columns in **col_gather** to key-value pairs; the other columns are repeated on every row,  
  so they can be used to identify records in the raw DataFrame

In [7]:
# Reshape only the measurement columns; 'target' and 'target2' remain as
# regular columns and are repeated on each key-value row.
df_short_gather2 = gather(df_short, col_gather)
df_short_gather2.head()

Unnamed: 0,target,target2,key,value
0,0,1,sepal length (cm),5.1
1,0,1,sepal width (cm),3.5
2,0,1,petal length (cm),1.4
3,0,1,petal width (cm),0.2
4,0,1,sepal length (cm),4.9


+ If the DataFrame's index has a name, the result of the **gather** function keeps that name for the index column

In [8]:
# Name the index with rename_axis, which returns a new object, instead of
# assigning to `.index.name` on a column selection of df_short. That pattern
# mutates an Index object in place and can leak the name back into df_short
# when the two frames share the same Index.
df_short2 = df_short[col_gather].rename_axis('index_with_name')

In [9]:
# Display the frame to confirm its index carries the name 'index_with_name'.
df_short2

Unnamed: 0_level_0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
index_with_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [10]:
# The first column of the gathered frame must be named after the input
# frame's index.
assert (
    gather(df_short2, col_gather).columns[0]
    == df_short2.index.name
)

# Tutorial of **spread** Function

If the raw DataFrame is in key-value form, the result of the **spread** function is a DataFrame whose index comes from the index column, whose columns come from the key column, and whose values come from the value column.

In [11]:
# Show the first 8 rows of the long-format (index / key / value) frame that
# the spread examples take as input.
df_short_gather.head(8)

Unnamed: 0,index,key,value
0,0,sepal length (cm),5.1
1,0,sepal width (cm),3.5
2,0,petal length (cm),1.4
3,0,petal width (cm),0.2
4,1,sepal length (cm),4.9
5,1,sepal width (cm),3.0
6,1,petal length (cm),1.4
7,1,petal width (cm),0.2


+ Simple Example

In [12]:
# Pivot back to wide form: 'index' (passed as a one-element list) supplies the
# row labels and 'key' supplies the column labels.
spread(df_short_gather, ['index'], 'key')

Unnamed: 0_level_0,value,value,value,value
key,petal length (cm),petal width (cm),sepal length (cm),sepal width (cm)
index,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,1.4,0.2,5.1,3.5
1,1.4,0.2,4.9,3.0
2,1.3,0.2,4.7,3.2
3,1.5,0.2,4.6,3.1
4,1.4,0.2,5.0,3.6


+ Specify the index column and key column (the index column can also be given as a single name instead of a list)

In [13]:
# Here the index column is given as a plain string rather than a list.
spread(df_short_gather, 'index', 'key')

Unnamed: 0_level_0,value,value,value,value
key,petal length (cm),petal width (cm),sepal length (cm),sepal width (cm)
index,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,1.4,0.2,5.1,3.5
1,1.4,0.2,4.9,3.0
2,1.3,0.2,4.7,3.2
3,1.5,0.2,4.6,3.1
4,1.4,0.2,5.0,3.6


+ The spread function supports one key with multiple value columns

In [14]:
# Build the two-value frame with assign(), which returns a new DataFrame.
# A plain `df_short_gather2 = df_short_gather` only aliases the same object,
# so adding 'value2' would also modify df_short_gather and change what the
# earlier spread cells produce when they are re-run.
# NOTE(review): if any later cell relies on df_short_gather itself carrying
# 'value2', point it at df_short_gather2 instead.
df_short_gather2 = df_short_gather.assign(value2=df_short_gather['value'] + 1)

In [15]:
# Preview the frame, which now has both a 'value' and a 'value2' column.
df_short_gather2.head()

Unnamed: 0,index,key,value,value2
0,0,sepal length (cm),5.1,6.1
1,0,sepal width (cm),3.5,4.5
2,0,petal length (cm),1.4,2.4
3,0,petal width (cm),0.2,1.2
4,1,sepal length (cm),4.9,5.9
