In [1]:
import pandas as pd 
import petl as etl

In [2]:
# Finance department roster: the first inner list is the header row,
# every following list is one employee record.
# NOTE(review): 'Alice ' carries a trailing space - presumably deliberate sample data; confirm.
employees_finance_info = [
    ['name', 'designation', 'emp_id', 'rating'],
    ['Alice ', 'manager', 23, 9],
    ['Alvin', 'team lead', 18, 7],
    ['Evan', 'executive', 56, 5],
    ['Ben', 'research analyst', 78, 9],
    ['Bill', 'associate', 3884, 4],
]

In [3]:
# Analytics department roster, same four-field layout as the finance table:
# header row first, then one list per employee.
# NOTE(review): 'Lance ' carries a trailing space - presumably deliberate sample data; confirm.
employees_analytics_info = [
    ['name', 'designation', 'emp_id', 'rating'],
    ['Lance ', 'business analyst', 32, 8],
    ['Ellen', 'junior data scientist', 546, 9],
    ['Ernie', 'senior data scientist', 7763, 6],
    ['Geoff', 'data analyst', 998, 7],
]

#### Choose and/or re-order fields

We also explore the display, look and see methods which are different ways to view table data

In [10]:
# Keep only the 'name' and 'emp_id' fields, selecting them by header name.
employees_name = etl.cut(employees_finance_info, 'name', 'emp_id')

In [11]:
employees_name.display()

name,emp_id
Alice,23
Alvin,18
Evan,56
Ben,78
Bill,3884


In [12]:
# Fields can also be selected by zero-based position: 0 is 'name', 1 is 'designation'.
employee_designation = etl.cut(employees_finance_info, 0, 1)

In [13]:
employee_designation.look()

+----------+--------------------+
| name     | designation        |
| 'Alice ' | 'manager'          |
+----------+--------------------+
| 'Alvin'  | 'team lead'        |
+----------+--------------------+
| 'Evan'   | 'executive'        |
+----------+--------------------+
| 'Ben'    | 'research analyst' |
+----------+--------------------+
| 'Bill'   | 'associate'        |
+----------+--------------------+

In [14]:
# A position (0 -> 'name') and a header name ('rating') can be mixed in one selection.
analytics_emp_ratings = etl.cut(employees_analytics_info, 0, 'rating')

In [15]:
analytics_emp_ratings.see()

name: 'Lance ', 'Ellen', 'Ernie', 'Geoff'
rating: 8, 9, 6, 7

Select a range of fields by unpacking a `range` of field positions

In [16]:
# range(0, 2) unpacks to the positions 0 and 1, i.e. the first two fields.
analytics_emp_designation = etl.cut(employees_analytics_info, *range(0, 2))

analytics_emp_designation

name,designation
Lance,business analyst
Ellen,junior data scientist
Ernie,senior data scientist
Geoff,data analyst


In [17]:
# range(1, 4) unpacks to positions 1, 2 and 3: every field except 'name'.
etl.cut(employees_analytics_info, *range(1, 4))

designation,emp_id,rating
business analyst,32,8
junior data scientist,546,9
senior data scientist,7763,6
data analyst,998,7


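#### Make sure the range specified is not out of bounds
The table has only four fields (positions 0 to 3), so the selection below fails with a FieldSelectionError as soon as it reaches position 4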

In [18]:
# The table has four fields (positions 0-3), so positions 1-6 are out of bounds.
# The error is the point of this cell, so it is caught and printed rather than
# left to abort a "Restart & Run All".
# NOTE(review): petl builds tables lazily, so the error is expected to surface
# when the table is rendered - hence the explicit display() inside the try.
try:
    etl.cut(employees_analytics_info, *range(1, 7)).display()
except etl.errors.FieldSelectionError as err:
    print('FieldSelectionError:', err)

FieldSelectionError: selection is not a field or valid field index: 4

FieldSelectionError: selection is not a field or valid field index: 4

In [19]:
# cutout is the complement of cut: drop the named field ('rating') and keep the rest.
emp_designation_id = etl.cutout(employees_analytics_info, 'rating')

emp_designation_id

name,designation,emp_id
Lance,business analyst,32
Ellen,junior data scientist,546
Ernie,senior data scientist,7763
Geoff,data analyst,998


#### Horizontal concatenation
The annex method joins two or more tables side by side, pairing rows by their position ("row order"), i.e. horizontal concatenation

In [25]:
# Both inputs were cut from employees_analytics_info, so their rows line up;
# annex places their fields side by side, row by row.
emp_full_info = etl.annex(emp_designation_id, analytics_emp_ratings)

emp_full_info

name,designation,emp_id,name.1,rating
Lance,business analyst,32,Lance,8
Ellen,junior data scientist,546,Ellen,9
Ernie,senior data scientist,7763,Ernie,6
Geoff,data analyst,998,Geoff,7


#### Concatenate tables vertically

In [26]:
employees_finance_info

[['name', 'designation', 'emp_id', 'rating'],
 ['Alice ', 'manager', 23, 9],
 ['Alvin', 'team lead', 18, 7],
 ['Evan', 'executive', 56, 5],
 ['Ben', 'research analyst', 78, 9],
 ['Bill', 'associate', 3884, 4]]

In [27]:
employees_analytics_info

[['name', 'designation', 'emp_id', 'rating'],
 ['Lance ', 'business analyst', 32, 8],
 ['Ellen', 'junior data scientist', 546, 9],
 ['Ernie', 'senior data scientist', 7763, 6],
 ['Geoff', 'data analyst', 998, 7]]

In [28]:
# Append the analytics rows below the finance rows; both tables share the same header.
employees_info = etl.cat(employees_finance_info, employees_analytics_info)

In [29]:
employees_info.displayall()

name,designation,emp_id,rating
Alice,manager,23,9
Alvin,team lead,18,7
Evan,executive,56,5
Ben,research analyst,78,9
Bill,associate,3884,4
Lance,business analyst,32,8
Ellen,junior data scientist,546,9
Ernie,senior data scientist,7763,6
Geoff,data analyst,998,7


#### Re-order the columns of a table

In [30]:
# Move 'emp_id' to position 0. The name employees_info is rebound here, so every
# later cell sees the reordered table rather than the original concatenation.
employees_info = etl.movefield(employees_info, 'emp_id', 0)

In [31]:
employees_info

emp_id,name,designation,rating
23,Alice,manager,9
18,Alvin,team lead,7
56,Evan,executive,5
78,Ben,research analyst,9
3884,Bill,associate,4


In [32]:
# New hires described by only two fields; 'department' does not exist in the
# finance/analytics tables defined earlier.
new_employees = [
    ['name', 'department'],
    ['Dorothy ', 'operations'],
    ['Libby', 'sales'],
    ['Gary', 'marketing'],
]

#### Concatenate the table with the array
- Notice how the first row of the 2-D array is used as the column headers and the values are assigned to the corresponding columns
- The order of tables supplied to the cat method determines the order in the concatenated table

In [33]:
# new_employees lacks 'emp_id', 'designation' and 'rating' and adds a 'department' field;
# cat matches fields by header name and leaves the missing ones empty.
employees_info_new = etl.cat(employees_info, new_employees)

In [34]:
employees_info_new.displayall()

emp_id,name,designation,rating,department
23.0,Alice,manager,9.0,
18.0,Alvin,team lead,7.0,
56.0,Evan,executive,5.0,
78.0,Ben,research analyst,9.0,
3884.0,Bill,associate,4.0,
32.0,Lance,business analyst,8.0,
546.0,Ellen,junior data scientist,9.0,
7763.0,Ernie,senior data scientist,6.0,
998.0,Geoff,data analyst,7.0,
,Dorothy,,,operations


#### Concatenating with a table which includes a different column
The city column is only present in more_employees. Its value is set to None for every employee that comes from one of the other tables

In [40]:
# A third table whose only extra field is 'city'.
more_employees = [
    ['name', 'city'],
    ['Russel', 'Baltimore'],
    ['Walter', 'Albuquerque'],
]

In [42]:
# Three inputs: the result carries the union of their fields, in order of first appearance.
employees_info_new = etl.cat(employees_info, new_employees, more_employees)

employees_info_new.displayall()

emp_id,name,designation,rating,department,city
23.0,Alice,manager,9.0,,
18.0,Alvin,team lead,7.0,,
56.0,Evan,executive,5.0,,
78.0,Ben,research analyst,9.0,,
3884.0,Bill,associate,4.0,,
32.0,Lance,business analyst,8.0,,
546.0,Ellen,junior data scientist,9.0,,
7763.0,Ernie,senior data scientist,6.0,,
998.0,Geoff,data analyst,7.0,,
,Dorothy,,,operations,


Using the header keyword argument with three input tables: only the listed fields are kept, in the order given

In [44]:
# Concatenate all three tables but keep only the fields named in `header`,
# in exactly that order.
employees_info_new = etl.cat(
    employees_info,
    new_employees,
    more_employees,
    header=['name', 'designation', 'department'],
)

In [45]:
employees_info_new.displayall()

name,designation,department
Alice,manager,
Alvin,team lead,
Evan,executive,
Ben,research analyst,
Bill,associate,
Lance,business analyst,
Ellen,junior data scientist,
Ernie,senior data scientist,
Geoff,data analyst,
Dorothy,,operations


#### Concatenate tables, without trying to match headers.

In [51]:
employees_analytics_info

[['name', 'designation', 'emp_id', 'rating'],
 ['Lance ', 'business analyst', 32, 8],
 ['Ellen', 'junior data scientist', 546, 9],
 ['Ernie', 'senior data scientist', 7763, 6],
 ['Geoff', 'data analyst', 998, 7]]

In [52]:
# Operations roster. Same field order as the analytics table, but the first
# three header names differ ('emp_name', 'position', 'id').
employees_operations_info = [
    ['emp_name', 'position', 'id', 'rating'],
    ['Heather ', 'manager', 4322, 6],
    ['Mike', 'assistant manager', 674, 7],
    ['Grace', 'supervisor', 873, 9],
    ['Sam', 'program manager', 727, 8],
]

In [53]:
# stack lines the tables up by field position; the differing header names in
# employees_operations_info are not used for matching.
emp_analytics_ops = etl.stack(employees_analytics_info, employees_operations_info)

#### The column headers from the first array are used

In [54]:
emp_analytics_ops.displayall()

name,designation,emp_id,rating
Lance,business analyst,32,8
Ellen,junior data scientist,546,9
Ernie,senior data scientist,7763,6
Geoff,data analyst,998,7
Heather,manager,4322,6
Mike,assistant manager,674,7
Grace,supervisor,873,9
Sam,program manager,727,8


#### When using the stack method, the ordering of fields becomes important
The stack method matches fields by position, not by name. In the HR employees array the id and position columns are flipped, so in the combined table the HR ids end up under designation and the HR positions under emp_id

In [55]:
# HR roster. Note the field order: 'id' comes before 'position', the reverse
# of the operations table.
employees_hr_info = [
    ['emp_name', 'id', 'position', 'rating'],
    ['Beatrice', 945, 'supervisor', 8],
    ['Spyros', 79, 'recruiter', 9],
]

In [56]:
# Stack all three rosters on top of each other; fields are paired by position,
# so the HR table's swapped columns are not corrected.
emp_all = etl.stack(
    employees_analytics_info,
    employees_operations_info,
    employees_hr_info,
)

emp_all.displayall()

name,designation,emp_id,rating
Lance,business analyst,32,8
Ellen,junior data scientist,546,9
Ernie,senior data scientist,7763,6
Geoff,data analyst,998,7
Heather,manager,4322,6
Mike,assistant manager,674,7
Grace,supervisor,873,9
Sam,program manager,727,8
Beatrice,945,supervisor,8
Spyros,79,recruiter,9


#### Add a field with a fixed or calculated value.

In [61]:
employees_analytics_info

[['name', 'designation', 'emp_id', 'rating'],
 ['Lance ', 'business analyst', 32, 8],
 ['Ellen', 'junior data scientist', 546, 9],
 ['Ernie', 'senior data scientist', 7763, 6],
 ['Geoff', 'data analyst', 998, 7]]

In [62]:
# Add a 'department' field that holds the same fixed value on every row.
updated_employees_analytics_info = etl.addfield(
    employees_analytics_info, 'department', 'analytics'
)

In [63]:
updated_employees_analytics_info

name,designation,emp_id,rating,department
Lance,business analyst,32,8,analytics
Ellen,junior data scientist,546,9,analytics
Ernie,senior data scientist,7763,6,analytics
Geoff,data analyst,998,7,analytics


In [64]:
# A callable value is evaluated once per row; here the bonus is 200 times the rating.
# The table is built from employees_analytics_info again, so this rebinding
# replaces the previous value of updated_employees_analytics_info.
updated_employees_analytics_info = etl.addfield(
    employees_analytics_info,
    'bonus',
    lambda rec: rec['rating'] * 200,
)

In [65]:
updated_employees_analytics_info

name,designation,emp_id,rating,bonus
Lance,business analyst,32,8,1600
Ellen,junior data scientist,546,9,1800
Ernie,senior data scientist,7763,6,1200
Geoff,data analyst,998,7,1400


#### You can specify an index for the calculated or fixed fields
Here the department field is inserted at index 2; bonus has no index and is appended at the end

In [66]:
employees_finance_info

[['name', 'designation', 'emp_id', 'rating'],
 ['Alice ', 'manager', 23, 9],
 ['Alvin', 'team lead', 18, 7],
 ['Evan', 'executive', 56, 5],
 ['Ben', 'research analyst', 78, 9],
 ['Bill', 'associate', 3884, 4]]

In [67]:
# Each tuple is (field name, fixed value or callable[, index]). 'department' is
# inserted at position 2; 'bonus' gives no index and lands at the end.
updated_employees_finance_info = etl.addfields(employees_finance_info,
                                               [('department', 'finance', 2),
                                                ('bonus', lambda rec: rec['rating'] * 250)])

In [68]:
updated_employees_finance_info

name,designation,department,emp_id,rating,bonus
Alice,manager,finance,23,9,2250
Alvin,team lead,finance,18,7,1750
Evan,executive,finance,56,5,1250
Ben,research analyst,finance,78,9,2250
Bill,associate,finance,3884,4,1000


#### Add a column of data to the table

In [69]:
# One value per finance employee, listed in the same order as the table rows.
gender = ['F', 'M', 'M', 'M', 'M']

In [70]:
# Insert the gender values as a new 'gender' field at position 1, right after 'name'.
updated_employees_finance_info = etl.addcolumn(
    updated_employees_finance_info, 'gender', gender, index=1
)

In [71]:
updated_employees_finance_info

name,gender,designation,department,emp_id,rating,bonus
Alice,F,manager,finance,23,9,2250
Alvin,M,team lead,finance,18,7,1750
Evan,M,executive,finance,56,5,1250
Ben,M,research analyst,finance,78,9,2250
Bill,M,associate,finance,3884,4,1000


#### Add a field of row numbers

In [72]:
# Prepend a 'row' field that numbers the data rows starting from 1.
# NOTE(review): the table is rebound to its own name, so re-running this cell
# presumably adds a second 'row' field - run it once per kernel session.
updated_employees_finance_info = etl.addrownumbers(updated_employees_finance_info)

updated_employees_finance_info

row,name,gender,designation,department,emp_id,rating,bonus
1,Alice,F,manager,finance,23,9,2250
2,Alvin,M,team lead,finance,18,7,1750
3,Evan,M,executive,finance,56,5,1250
4,Ben,M,research analyst,finance,78,9,2250
5,Bill,M,associate,finance,3884,4,1000


#### Replace one or more values in the table’s header row

In [73]:
# Rename a single header: the 'row' field becomes 's.no.'; the data rows are unchanged.
updated_employees_finance_info = etl.rename(updated_employees_finance_info, 'row', 's.no.')

updated_employees_finance_info

s.no.,name,gender,designation,department,emp_id,rating,bonus
1,Alice,F,manager,finance,23,9,2250
2,Alvin,M,team lead,finance,18,7,1750
3,Evan,M,executive,finance,56,5,1250
4,Ben,M,research analyst,finance,78,9,2250
5,Bill,M,associate,finance,3884,4,1000


In [74]:
updated_employees_analytics_info

name,designation,emp_id,rating,bonus
Lance,business analyst,32,8,1600
Ellen,junior data scientist,546,9,1800
Ernie,senior data scientist,7763,6,1200
Geoff,data analyst,998,7,1400


In [75]:
# A dict renames several headers in one call, mapping old name -> new name.
updated_employees_analytics_info = etl.rename(updated_employees_analytics_info,
                                              {'designation': 'job_position',
                                               'emp_id': 'id'})

In [76]:
updated_employees_analytics_info

name,job_position,id,rating,bonus
Lance,business analyst,32,8,1600
Ellen,junior data scientist,546,9,1800
Ernie,senior data scientist,7763,6,1200
Geoff,data analyst,998,7,1400
