In [1]:
import pandas as pd
import petl as etl
from collections import OrderedDict

Dataset source: [Students Performance in Exams (Kaggle)](https://www.kaggle.com/spscientist/students-performance-in-exams)

In [2]:
# Load the raw student records; the path is relative to the notebook's working directory.
data = pd.read_csv('./datasets/StudentsPerformance.csv')

In [3]:
# Turn the DataFrame into a petl table; the student-data transforms below all start from it.
students_performance = etl.fromdataframe(data)

In [4]:
students_performance.display()

gender,group,parental level of education,testpreparation,math_score,reading_score,writing_score
female,group B,bachelor's degree,none,72,72,74
female,group C,some college,completed,69,90,88
female,group B,master's degree,none,90,95,93
male,group A,associate's degree,none,47,57,44
male,group C,some college,none,76,78,75


#### Transform a table, mapping fields arbitrarily between input and output

In [5]:
# Output field name -> mapping spec, filled in one entry per cell below.
# The columns of the resulting table appear in the order the entries are added.
mappings = OrderedDict()

rename a field

In [6]:
# A plain string spec copies the input field 'group' unchanged under the new name.
mappings['grade_group'] = 'group'

translate a field

In [7]:
# (source field, translation dict): each 'gender' value is recoded through the dict.
mappings['sex'] = ('gender', {'male': 'M', 'female': 'F'})

apply a calculation to a field

In [8]:
# (source field, function): each reading_score value is passed through the lambda.
mappings['reading_score'] = ('reading_score', lambda v: v / 2)

apply a calculation to a combination of fields

In [9]:
# A bare callable is given the whole input record, so it can combine several fields.
mappings['total_score'] = lambda rec: (
    rec['math_score'] + rec['reading_score'] / 2 + rec['writing_score']
)

In [10]:
# Build a new table whose fields are exactly the entries of `mappings`.
# NOTE(review): the double underscore in this variable name looks accidental; if it is
# renamed, the display cell that follows must be renamed with it.
students__updated_reading_scores = etl.fieldmap(students_performance, mappings)

In [11]:
students__updated_reading_scores.display()

grade_group,sex,reading_score,total_score
group B,F,36.0,182.0
group C,F,45.0,202.0
group B,F,47.5,230.5
group A,M,28.5,119.5
group C,M,39.0,190.0


#### Transform rows via an arbitrary function

In [12]:
def rowmapper(row):
    """Build one output row from one input row.

    ``row`` must support lookup by field name (``row['gender']``), by position
    (``row[1]``) and by attribute (``row.reading_score``).

    Returns a three-element list:
    the gender abbreviated to 'M'/'F' (``None`` for any other value),
    the second field of the input row (the group column in this dataset),
    and the reading score divided by two.
    """
    sex_codes = {'male': 'M', 'female': 'F'}

    sex = sex_codes.get(row['gender'])
    second_field = row[1]
    halved_reading = row.reading_score / 2

    return [sex, second_field, halved_reading]

In [13]:
# Run rowmapper over every row; the new table uses the header supplied here.
updated_scores = etl.rowmap(
    students_performance,
    rowmapper,
    header=['sex', 'group', 'updated_reading_score'],
)

In [14]:
updated_scores

sex,group,updated_reading_score
F,group B,36.0
F,group C,45.0
F,group B,47.5
M,group A,28.5
M,group C,39.0


#### Map each input row to any number of output rows via an arbitrary function

In [15]:
def rowgenerator(row):
    """Yield two ``[variable, value]`` rows for each input row.

    ``row`` must support lookup by field name (``row['group']``) and by
    attribute (``row.math_score``, ``row.reading_score``, ``row.writing_score``).

    Yields:
        ``['group', <letter>]`` - the group label reduced to its letter, or
        ``None`` when the label is not one of 'group A' .. 'group E'.
        ``['total', <sum>]`` - the sum of the math, reading and writing scores.
    """
    # The dataset contains groups A through E; every one of them needs an entry,
    # otherwise rows of the missing group would silently get a None value.
    group_letters = {
        'group A': 'A',
        'group B': 'B',
        'group C': 'C',
        'group D': 'D',
        'group E': 'E',
    }

    yield ['group', group_letters.get(row['group'])]

    yield ['total', row.math_score + row.reading_score + row.writing_score]

In [16]:
# Each input row expands into the rows yielded by rowgenerator, under the given header.
total_scores = etl.rowmapmany(
    students_performance,
    rowgenerator,
    header=['variable', 'value'],
)

In [17]:
total_scores.display(10)

variable,value
group,B
total,218
group,C
total,247
group,B
total,278
group,A
total,148
group,C
total,229


In [18]:
# Show the source table again: the transforms above produced new tables and left it unchanged.
students_performance.display()

gender,group,parental level of education,testpreparation,math_score,reading_score,writing_score
female,group B,bachelor's degree,none,72,72,74
female,group C,some college,completed,69,90,88
female,group B,master's degree,none,90,95,93
male,group A,associate's degree,none,47,57,44
male,group C,some college,none,76,78,75


### SORTING

#### The default is a lexical sort
When no key is given, rows are compared field by field from left to right: first by gender, then by group, then by parental level of education, and so on.


In [19]:
# No key given, so the default sort described above is used.
sort_table = etl.sort(students_performance)

In [20]:
sort_table.display()

gender,group,parental level of education,testpreparation,math_score,reading_score,writing_score
female,group A,associate's degree,completed,55,65,62
female,group A,associate's degree,completed,65,70,74
female,group A,associate's degree,none,37,57,56
female,group A,associate's degree,none,41,51,48
female,group A,associate's degree,none,65,85,76


In [21]:
# Sort on the 'group' field only; the rowgroupby cells below use this table as their
# input, and rowgroupby expects its input to be ordered by the grouping key.
sorted_group = etl.sort(students_performance, 'group')

In [22]:
sorted_group.display(10)

gender,group,parental level of education,testpreparation,math_score,reading_score,writing_score
male,group A,associate's degree,none,47,57,44
male,group A,some college,completed,78,72,70
female,group A,master's degree,none,50,53,58
male,group A,master's degree,none,73,74,72
female,group A,associate's degree,completed,55,65,62
male,group A,some high school,none,39,39,34
male,group A,associate's degree,none,62,61,55
female,group A,associate's degree,none,41,51,48
male,group A,bachelor's degree,completed,80,78,81
male,group A,some college,completed,50,47,54


**Note:** `rowgroupby` assumes the input table is already sorted by the given key.

#### The rowgroupby groups all rows based on a particular field
- All rows with the same value of 'group' will be aggregated
- This creates a generator object

In [25]:
etl.rowgroupby(sorted_group, 'group')

<generator object rowgroupby.<locals>.<genexpr> at 0x11f49c820>

In [27]:
# Only the group keys are printed here; the per-group rows are not used.
for key, rows in etl.rowgroupby(sorted_group, 'group'):
    print(key)

group A
group B
group C
group D
group E


In [29]:
# Will map each group label to the list of its rows.
group_to_rows_dict = {}

In [32]:
# Store each group's rows as a list so they can be looked up by group label afterwards.
for key, rows in etl.rowgroupby(sorted_group, 'group'):
    group_to_rows_dict[key] = list(rows)

In [38]:
group_to_rows_dict['group A']

[('male', 'group A', "associate's degree", 'none', 47, 57, 44),
 ('male', 'group A', 'some college', 'completed', 78, 72, 70),
 ('female', 'group A', "master's degree", 'none', 50, 53, 58),
 ('male', 'group A', "master's degree", 'none', 73, 74, 72),
 ('female', 'group A', "associate's degree", 'completed', 55, 65, 62),
 ('male', 'group A', 'some high school', 'none', 39, 39, 34),
 ('male', 'group A', "associate's degree", 'none', 62, 61, 55),
 ('female', 'group A', "associate's degree", 'none', 41, 51, 48),
 ('male', 'group A', "bachelor's degree", 'completed', 80, 78, 81),
 ('male', 'group A', 'some college', 'completed', 50, 47, 54),
 ('female', 'group A', 'some college', 'none', 58, 70, 67),
 ('male', 'group A', "associate's degree", 'none', 54, 53, 47),
 ('female', 'group A', "bachelor's degree", 'none', 51, 49, 51),
 ('male', 'group A', 'high school', 'none', 57, 43, 47),
 ('male', 'group A', 'some high school', 'completed', 62, 67, 69),
 ('male', 'group A', "bachelor's degree", 

In [39]:
# With a third argument ('reading_score') each group yields only that field's values
# rather than whole rows, so every group prints as a flat list of scores.
for key, group in etl.rowgroupby(sorted_group, 'group', 'reading_score'):
    print(key, '\n', list(group), '\n\n')

group A 
 [57, 72, 53, 74, 65, 39, 61, 51, 78, 47, 70, 53, 49, 43, 67, 67, 73, 72, 49, 41, 78, 67, 23, 61, 58, 64, 83, 64, 43, 45, 43, 65, 85, 59, 59, 73, 82, 58, 67, 88, 52, 82, 85, 70, 50, 92, 100, 96, 55, 51, 73, 66, 62, 54, 93, 96, 74, 80, 70, 58, 72, 84, 72, 46, 57, 60, 79, 81, 31, 47, 59, 90, 60, 60, 82, 68, 67, 48, 84, 57, 62, 58, 68, 50, 63, 87, 51, 45, 63] 


group B 
 [72, 95, 83, 95, 43, 60, 81, 32, 75, 54, 65, 56, 58, 65, 54, 64, 58, 41, 49, 45, 86, 66, 67, 44, 76, 64, 85, 89, 95, 68, 54, 84, 70, 60, 86, 78, 53, 76, 83, 54, 70, 52, 64, 53, 79, 69, 81, 61, 83, 77, 56, 85, 66, 86, 52, 70, 64, 70, 75, 85, 56, 46, 82, 89, 75, 62, 76, 71, 76, 60, 71, 55, 78, 68, 38, 76, 61, 65, 72, 62, 84, 86, 67, 63, 65, 80, 67, 39, 83, 46, 73, 65, 54, 90, 71, 52, 53, 64, 69, 87, 82, 59, 61, 97, 78, 69, 90, 48, 70, 51, 43, 48, 75, 91, 56, 24, 63, 63, 70, 60, 74, 80, 81, 65, 81, 53, 79, 67, 85, 64, 68, 55, 66, 67, 63, 70, 94, 90, 43, 77, 66, 55, 84, 77, 62, 72, 62, 48, 67, 65, 58, 72, 90, 85, 51

#### sorting by compound key is supported

In [42]:
# Compound key: order by group first, then by math_score within each group.
sorted_group_mathscore = etl.sort(
    students_performance,
    key=['group', 'math_score'],
)

sorted_group_mathscore.display(10)

gender,group,parental level of education,testpreparation,math_score,reading_score,writing_score
male,group A,some college,none,28,23,19
female,group A,high school,completed,34,48,41
female,group A,associate's degree,none,37,57,56
female,group A,some high school,none,38,43,43
male,group A,some high school,none,39,39,34
male,group A,associate's degree,completed,40,55,53
female,group A,associate's degree,none,41,51,48
female,group A,some high school,none,44,64,58
female,group A,some high school,none,44,45,45
male,group A,high school,none,45,47,49


#### Set reverse = True for a descending order sort

In [43]:
# Same compound key as before, but in descending order.
# This rebinds sorted_group_mathscore, replacing the ascending table.
sorted_group_mathscore = etl.sort(
    students_performance,
    key=['group', 'math_score'],
    reverse=True,
)

sorted_group_mathscore.display(10)

gender,group,parental level of education,testpreparation,math_score,reading_score,writing_score
male,group E,associate's degree,completed,100,100,93
female,group E,some college,none,100,92,97
female,group E,bachelor's degree,none,100,100,100
male,group E,bachelor's degree,completed,100,100,100
female,group E,associate's degree,none,100,100,100
female,group E,bachelor's degree,completed,99,100,100
female,group E,high school,none,99,93,90
male,group E,some college,completed,99,87,81
male,group E,some college,none,97,87,82
male,group E,associate's degree,completed,97,82,88


### Merge Sort
#### Combine multiple input tables into one sorted output table

In [44]:
# Small hand-written table: the first inner list is the header, the rest are data rows.
# NOTE(review): 'test_prepration' looks like a misspelling of 'test_preparation'. It is
# left untouched because a later mergesort cell selects the column by this exact name.
group_A = [['name', 'gender', 'test_prepration', 'extracurricular', 'GPA'],
           ['Mark', 'male', 'completed', 'art club', 3.4],
           ['Gary', 'male', 'none', 'drama club', 2.98],
           ['Amanda', 'female', 'none', 'photography club', 3],
           ['Faye', 'female', 'completed', 'none', 3.7]]

In [45]:
# Second hand-written table; its header row must match group_A's so the two can be merged.
# NOTE(review): 'test_prepration' looks like a misspelling of 'test_preparation'. It is
# left untouched because a later mergesort cell selects the column by this exact name.
group_B = [['name', 'gender', 'test_prepration', 'extracurricular', 'GPA'],
           ['Libby', 'female', 'completed', 'none', 3.27],
           ['Sam', 'male', 'completed', 'drama club', 3.88],
           ['Barbara', 'female', 'completed', 'film club', 2.7],
           ['Anne', 'female', 'none', 'science club', 2.67]]

In [46]:
# Check whether group_A's rows are already ordered by 'name'.
etl.issorted(group_A, key='name')

False

In [47]:
# Check whether group_B's rows are already ordered by 'name'.
etl.issorted(group_B, key='name')

False

In [48]:
# Combine both tables into a single table ordered by 'name'.
merge_sort = etl.mergesort(group_A, group_B, key='name')

In [49]:
merge_sort.display()

name,gender,test_prepration,extracurricular,GPA
Amanda,female,none,photography club,3.0
Anne,female,none,science club,2.67
Barbara,female,completed,film club,2.7
Faye,female,completed,none,3.7
Gary,male,none,drama club,2.98


In [50]:
# Confirm that the merged table is ordered by 'name'.
etl.issorted(merge_sort, key='name')

True

In [51]:
# Merge by descending GPA; the output keeps only the fields named in `header`, in that order.
sorted_GPA = etl.mergesort(
    group_A,
    group_B,
    key='GPA',
    reverse=True,
    header=['name', 'GPA', 'test_prepration'],
)

In [52]:
sorted_GPA.displayall()

name,GPA,test_prepration
Sam,3.88,completed
Faye,3.7,completed
Mark,3.4,completed
Libby,3.27,completed
Amanda,3.0,none
Gary,2.98,none
Barbara,2.7,completed
Anne,2.67,none


In [53]:
# Without reverse=True the check is for ascending order, which this descending table fails.
etl.issorted(sorted_GPA, key='GPA')

False

In [54]:
# reverse=True checks for descending order, matching how sorted_GPA was built.
etl.issorted(sorted_GPA, key='GPA', reverse=True)

True