In [42]:
import pandas as pd
import os

In [43]:
# Revision-tag CSV files to convert. The conversion loop further down reads
# each path with pandas and writes the result next to the input, with '-v2'
# appended to the file name (before the extension).
# Paths are relative, so they resolve against the notebook's working directory.
files = [
    '../wiki-corpus/revisiontag.csv',
    '../wiki-corpus/created/revisiontag_created_1k.csv',
    '../wiki-corpus/deleted/revisiontag_deleted_1k.csv',
    '../wiki-corpus/modified/revisiontag_modified_1k.csv',
    '../wiki-corpus/modified/revisiontag_modified_total.csv',
    '../wiki-corpus/modified/revisiontag_modified_vandalism_filter.csv'
]

In [44]:
# Hand-curated retags, keyed by revision_id. Each value is
# (tag_id the row must currently carry, tag_id to assign instead).
# Built once at definition time: the function below is applied to every row
# of a frame, so rebuilding this table per call was pure overhead.
_REVISION_TO_TAG = {
    634787607: (4, 5),
    541063202: (4, 2),
    722050938: (4, 3),
    303194983: (4, 2),
    173298304: (4, 2),
    282894309: (4, 2),
    553850696: (4, 2),
    186730349: (4, 3),
    36929129: (4, 2),
    308633558: (4, 2),
    183711405: (4, 3),
    74229710: (4, 5),
    191148479: (4, 2),
    565119644: (4, 3),
    416396896: (4, 2),
    380978900: (4, 3),
    193291165: (4, 2),
    161511632: (4, 5),
    112325689: (4, 2),
    453789687: (4, 5),
    489545491: (4, 2),
    323168532: (4, 3),
    358262551: (4, 2),
    518597494: (4, 2),
    47773679: (4, 3),
    476730971: (4, 2),
    571913925: (4, 2),
    348422444: (4, 2),
    287085560: (4, 2),
    546729113: (4, 2)
}


def change_revision_tag(row):
    """Retag a single revision/tag row according to the manual review table.

    Intended to be used as the callback of ``DataFrame.apply(..., axis=1)``.

    Parameters
    ----------
    row : mapping-like (e.g. ``pandas.Series``)
        Must provide the keys ``'revision_id'`` and ``'tag_id'``.

    Returns
    -------
    The same ``row`` object. If its ``revision_id`` is listed in the review
    table and its ``tag_id`` equals the expected source tag, ``tag_id`` is
    overwritten with the replacement tag and a log line is printed;
    otherwise the row is returned untouched.
    """
    tag_mapping = _REVISION_TO_TAG.get(row['revision_id'])

    # Only rewrite when the row still carries the tag that was reviewed;
    # other tags on the same revision are left alone.
    if tag_mapping is not None and row['tag_id'] == tag_mapping[0]:
        before = row['tag_id']
        row['tag_id'] = tag_mapping[1]
        print('4. Revision', row['revision_id'], 'From', before, 'To', row['tag_id'])

    return row



def remove(df, revision_id, tag_id):
    """Return ``df`` without the rows matching both ``revision_id`` and ``tag_id``.

    A log line naming the pair is printed on every call, whether or not any
    row matches. The input frame is not modified; the original index labels
    of the surviving rows are kept.
    """
    print('Drop revision', revision_id, 'with tag', tag_id)
    same_revision = df['revision_id'] == revision_id
    same_tag = df['tag_id'] == tag_id
    to_drop = same_revision & same_tag
    return df[~to_drop]
    
    
def concurrent_tags_for_revisions_having_tag(df, tag_id):
    """List the *other* tags attached to revisions that carry ``tag_id``.

    Finds every ``revision_id`` that has at least one row tagged ``tag_id``
    and returns all rows of those revisions whose ``tag_id`` is different.
    An empty result means no such revision has any additional tag.
    """
    tagged_revision_ids = df.loc[df['tag_id'] == tag_id, 'revision_id']
    belongs_to_tagged_revision = df['revision_id'].isin(tagged_revision_ids)
    carries_other_tag = df['tag_id'] != tag_id
    return df[belongs_to_tagged_revision & carries_other_tag]
    

In [46]:
# (revision_id, tag_id) rows dropped in step 6.1 so that revisions carrying
# tag 12 keep no other tag. The table is identical for every file, so it is
# defined once instead of being spelled out as one remove() call per row.
# Order is preserved so the 'Drop revision' log lines appear in the same
# sequence as before. (254289420, 12) is listed twice, as in the original
# call sequence; the second occurrence removes nothing.
ROWS_TO_DROP = [
    (86750105, 9),
    (254289420, 12),
    (335837031, 12),
    (346627086, 12),
    (355359368, 12),
    (803735776, 12),
    (605648773, 1),
    (607936709, 12),
    (65859660, 12),
    (4933285, 1),
    (336261331, 1),
    (731347235, 9),
    (614842615, 11),
    (594741263, 12),
    (64801608, 12),
    (183071625, 12),
    (667008025, 12),
    (302288184, 12),
    (254289420, 12),
    (628321938, 12),
    (797350226, 12),
    (666917549, 1),
    (310038235, 12),
    (233691399, 12),
    (666961416, 12),
    (447669434, 1),
    (155930510, 12),
    (614543886, 9),
    (237847062, 12),
    (779897030, 12),
    (625468820, 1),
    (383285785, 12),
    (547806361, 12),
    (212809622, 12),
    (635430902, 12),
    (576533046, 12),
    (614842615, 12),
    (609025214, 12),
    (664058166, 12),
    (457957436, 12),
    (646098072, 12),
    (486170837, 6),
]

# Convert every input file: apply the tag clean-up steps 1-6 below and write
# the result next to the input with a '-v2' suffix.
for file in files:
    print('Converting', file)
    df = pd.read_csv(file)

    # Filters
    # Tag 7: Merge conflict will be dropped completely (only a single edit)
    print('1. Removing single edit with merge conflict tag')
    print('1. Before: Num. merge conflicts', df[df.tag_id == 7]['revision_id'].count())
    # .copy() makes the filtered frame independent of the one read from disk,
    # so the column assignments below do not write into a slice
    # (avoids pandas' SettingWithCopyWarning).
    df = df[df.tag_id != 7].copy()
    print('1. After: Num. merge conflicts', df[df.tag_id == 7]['revision_id'].count())

    # Tag 8 (template) is folded into tag 6 (syntax, per the log message).
    print('2. Add template to syntax category')
    print('2. Before: Num. template', df[df.tag_id == 8]['revision_id'].count())
    df['tag_id'] = df['tag_id'].map(lambda x: 6 if x == 8 else x)
    print('2. After: Num. template', df[df.tag_id == 8]['revision_id'].count())

    # Tag 10 (SEO) is folded into tag 3 (quality issue, per the log message).
    print('3. SEO to quailty issue')
    print('3. Before: Num. SEO', df[df.tag_id == 10]['revision_id'].count())
    df['tag_id'] = df['tag_id'].map(lambda x: 3 if x == 10 else x)
    print('3. After: Num. SEO', df[df.tag_id == 10]['revision_id'].count())

    print('4. Personal story review')
    print('4. Before: Num. personal stories', df[df.tag_id == 4]['revision_id'].count())
    # The result must be assigned: apply() builds a new frame from the rows
    # returned by the callback. Relying on the callback mutating `df` through
    # the row object is not supported by pandas and silently does nothing in
    # current versions.
    df = df.apply(change_revision_tag, axis=1)
    print('4. After: Num. personal stories', df[df.tag_id == 4]['revision_id'].count())

    # Steps 2-4 can turn distinct rows into identical ones.
    print('5. Drop duplicates')
    print('5. Before: Num. revisions', df['revision_id'].count())
    df = df.drop_duplicates()
    print('5. After: Num. revisions', df['revision_id'].count())

    print('6. Sanity checks')
    print('6.1. Before: Constructive revisions have no other tags', df['revision_id'].count())
    additional_tags_to_constructive = concurrent_tags_for_revisions_having_tag(df, 12)
    print(additional_tags_to_constructive)

    for drop_revision_id, drop_tag_id in ROWS_TO_DROP:
        df = remove(df, drop_revision_id, drop_tag_id)

    # Printed again so the remaining conflicts (ideally none) can be inspected.
    print('6.1. After: Constructive revisions have no other tags', df['revision_id'].count())
    additional_tags_to_constructive = concurrent_tags_for_revisions_having_tag(df, 12)
    print(additional_tags_to_constructive)

    # Lists revisions that carry tag 11 together with tag 12.
    print('6.2. Intention tags are always vandalism', df['revision_id'].count())
    additional_tags_to_intention = concurrent_tags_for_revisions_having_tag(df, 11)
    print(additional_tags_to_intention[additional_tags_to_intention['tag_id'] == 12])

    print('6.3. Verify total number of revisions after modification')
    print('6.3. After: Num. revisions', df['revision_id'].count())

    # '<input>.csv' -> '<input>-v2.csv', written without the index column.
    file_wo_extension = os.path.splitext(file)[0]
    output_file = file_wo_extension + '-v2.csv'
    print('Saving to file', output_file)
    df.to_csv(output_file, index=False)
    print()
    
    
    

Converting ../wiki-corpus/revisiontag.csv
1. Removing single edit with merge conflict tag
1. Before: Num. merge conflicts 1
1. After: Num. merge conflicts 0
2. Add template to syntax category
2. Before: Num. template 23
2. After: Num. template 0
3. SEO to quailty issue
3. Before: Num. SEO 17
3. After: Num. SEO 0
4. Personal story review
4. Before: Num. personal stories 30
4. Revision 634787607 From 4 To 5
4. Revision 541063202 From 4 To 2
4. Revision 722050938 From 4 To 3
4. Revision 303194983 From 4 To 2
4. Revision 173298304 From 4 To 2
4. Revision 282894309 From 4 To 2
4. Revision 553850696 From 4 To 2
4. Revision 186730349 From 4 To 3
4. Revision 36929129 From 4 To 2
4. Revision 308633558 From 4 To 2
4. Revision 183711405 From 4 To 3
4. Revision 74229710 From 4 To 5
4. Revision 191148479 From 4 To 2
4. Revision 565119644 From 4 To 3
4. Revision 416396896 From 4 To 2
4. Revision 380978900 From 4 To 3
4. Revision 193291165 From 4 To 2
4. Revision 161511632 From 4 To 5
4. Revision 112

4. Revision 47773679 From 4 To 3
4. Revision 348422444 From 4 To 2
4. After: Num. personal stories 0
5. Drop duplicates
5. Before: Num. revisions 1513
5. After: Num. revisions 1510
6. Sanity checks
6.1. Before: Constructive revisions have no other tags 1510
      revision_id  revision_page_id  tag_id
9       605648773           1944281       1
19      607936709            596566       1
153      65859660            904166       1
174       4933285            494596       1
196     336261331             27439       1
279     731347235          12767523       9
305     614842615          39922676      11
340      64801608           5467649       6
413     183071625             52684      11
441     183071625             52684       2
468     667008025          11573481       1
469     302288184          22605309       1
514     628321938             80777       3
555     797350226           9612684      11
579     666917549          28203294       1
681     310038235             23503   

4. Revision 282894309 From 4 To 2
4. Revision 287085560 From 4 To 2
4. Revision 303194983 From 4 To 2
4. Revision 308633558 From 4 To 2
4. Revision 323168532 From 4 To 3
4. Revision 358262551 From 4 To 2
4. Revision 380978900 From 4 To 3
4. Revision 416396896 From 4 To 2
4. Revision 453789687 From 4 To 5
4. Revision 476730971 From 4 To 2
4. Revision 489545491 From 4 To 2
4. Revision 518597494 From 4 To 2
4. Revision 541063202 From 4 To 2
4. Revision 546729113 From 4 To 2
4. Revision 553850696 From 4 To 2
4. Revision 565119644 From 4 To 3
4. Revision 571913925 From 4 To 2
4. Revision 634787607 From 4 To 5
4. Revision 722050938 From 4 To 3
4. After: Num. personal stories 0
5. Drop duplicates
5. Before: Num. revisions 2671
5. After: Num. revisions 2662
6. Sanity checks
6.1. Before: Constructive revisions have no other tags 2662
      revision_id  revision_page_id  tag_id
107      86750105             31730       9
684     254289420            605581       1
993     335837031          2536