# Alignment of Wikipedia Editors

## Workbench

In [None]:
import csv
import glob
import os
import pickle
import subprocess
from collections import defaultdict, Counter
from functools import reduce
from itertools import groupby
from multiprocessing import Pool
from operator import itemgetter

import pandas as pd

## Data cleaning

Remove pages that appear in both the liberal and the conservative corpora (they are deleted from both).

In [None]:
# Archive file names (directory stripped) present in each corpus:
# `b` for the liberal corpus, `a` for the conservative one.
# os.path.basename is used instead of split('/') so the directory part is
# removed whichever path separator glob returns on the current platform.
b = {os.path.basename(path) for path in glob.glob("Data/liberal_articles/*.7z")}
a = {os.path.basename(path) for path in glob.glob("Data/conservative_articles/*.7z")}

In [None]:
# Number of archive names shared by both corpora, followed by that count as a
# fraction of the liberal set (b) and of the conservative set (a).
(
    len(a.intersection(b)),
    len(a.intersection(b)) / len(b),
    len(a.intersection(b)) / len(a),
)

In [None]:
# Delete every archive whose name occurs in both corpora, from both
# directories (conservative copy first, then the liberal copy).
for shared_name in a.intersection(b):
    for corpus_dir in ('Data/conservative_articles/', 'Data/liberal_articles/'):
        os.remove(corpus_dir + shared_name)

## Extract number of bytes changed per editor

In [2]:
def processFile(filename):
    """Tally the absolute byte change attributed to each editor of one page.

    The ``.7z`` archive at ``filename`` is extracted with the ``7z`` command
    line tool, the extracted text file is read and then deleted again.

    Each line is split on ``'|'``; field 1 is used as the editor key and
    field 2 is parsed as a page size such as ``(1,234 bytes)``. Lines are
    walked from the last to the first, and the absolute difference between
    consecutive parsed sizes is credited to the editor of the current line
    (the first parsed line is compared against a size of 0).
    Lines that cannot be parsed are printed and skipped.

    Args:
        filename: Path to a ``.7z`` archive. The extracted file is expected
            at the same path without the ``.7z`` suffix -- this assumes the
            archive holds a single file named after the archive (not
            verifiable from this code).

    Returns:
        collections.Counter mapping editor -> summed absolute byte delta.
    """
    extracted = filename[:-3]  # drop the trailing ".7z"
    # `7z e` extracts into the current working directory unless -o is given,
    # so point it at the archive's own directory, where the file is read from.
    # -y answers overwrite prompts automatically (workers have no terminal).
    out_dir = os.path.dirname(filename) or '.'
    subprocess.call(["7z", "e", "-y", "-o" + out_dir, filename],
                    stdout=subprocess.DEVNULL)

    # Read everything, then always delete the extracted copy, even if the
    # read itself fails part-way.
    try:
        with open(extracted) as infile:
            lines = infile.readlines()
    finally:
        if os.path.exists(extracted):
            os.remove(extracted)

    editor = defaultdict(int)
    prev = 0
    for line in reversed(lines):
        row = line.split('|')
        try:
            curr = int(row[2].strip('()').split(' ')[0].replace(',', ''))
        except (IndexError, ValueError):
            # Too few fields or a non-numeric size: report and move on.
            print(filename, row)
            continue
        editor[row[1]] += abs(curr - prev)
        prev = curr
    return Counter(editor)

def reducer(x, y):
    """Combine two partial results with ``+`` and return the sum.

    Used as the folding function for ``reduce`` over per-page results.
    """
    combined = x + y
    return combined

### Liberal pages

In [None]:
# Per-editor byte totals over every liberal page archive.
pages = list(glob.glob("Data/liberal_articles/*.7z"))
# The context manager shuts the 10 worker processes down once map() returns,
# instead of leaving them alive for the rest of the session.
with Pool(10) as pool:
    res = pool.map(processFile, pages)
# Starting from an empty Counter keeps this working when no archive matched
# (reduce over an empty list without an initial value raises TypeError).
# Counter addition drops entries whose total is not positive.
liberal_editor = reduce(reducer, res, Counter())
liberal_editor = pd.DataFrame(list(liberal_editor.items()), columns=['editor', 'liberal'])

### Conservative pages

In [None]:
# Per-editor byte totals over every conservative page archive.
pages = list(glob.glob("Data/conservative_articles/*.7z"))
# The context manager shuts the 10 worker processes down once map() returns,
# instead of leaving them alive for the rest of the session.
with Pool(10) as pool:
    res = pool.map(processFile, pages)
# Starting from an empty Counter keeps this working when no archive matched
# (reduce over an empty list without an initial value raises TypeError).
# Counter addition drops entries whose total is not positive.
conservative_editor = reduce(reducer, res, Counter())
conservative_editor = pd.DataFrame(list(conservative_editor.items()), columns=['editor', 'conservative'])

## Alignment

In [None]:
# Outer join on editor: keep editors seen in either corpus; the column for a
# corpus the editor never touched is left as NaN.
df = liberal_editor.merge(conservative_editor, how='outer', on='editor')

In [None]:
# Missing values left by the outer join become 0.
df = df.fillna(0)
# Total bytes per editor across both corpora.
df['political'] = df['conservative'].add(df['liberal'])

In [None]:
# Mean of the per-editor byte totals.
avePolitical = df['political'].mean()
# Conservative bytes as a fraction of all bytes, summed over all editors.
aveAlign = df['conservative'].sum() / df['political'].sum()

In [None]:
# Smoothed conservative share per editor: avePolitical extra bytes, split in
# the overall aveAlign proportion, are added to every editor's totals before
# taking the ratio.
shrunk = (df['conservative'] + avePolitical * aveAlign) / (df['political'] + avePolitical)
at_or_below = shrunk <= aveAlign
above = shrunk > aveAlign

df['alignment'] = shrunk
# Shares at or below the overall share are rescaled as share/aveAlign - 1,
# so a share equal to aveAlign becomes 0 and smaller shares go negative.
df.loc[at_or_below, 'alignment'] = shrunk[at_or_below] / aveAlign - 1
# Shares above the overall share are rescaled as
# (share - aveAlign)/(1 - aveAlign), which is positive.
df.loc[above, 'alignment'] = (shrunk[above] - aveAlign) / (1 - aveAlign)

In [None]:
# Persist an {editor: alignment} dictionary as a pickle file.
alignment_by_editor = dict(zip(df['editor'].tolist(), df['alignment'].tolist()))
with open('Data/user_alignments.pkl', 'wb') as outfile:
    pickle.dump(alignment_by_editor, outfile)