# Data Wrangling

In [1]:
import glob
import os
import pandas as pd
import pickle
import tarfile
import numpy as np
import subprocess
from pandas.parser import CParserError
import sys
import csv
import scipy.sparse as ssp

  


## Article Pages

### Basic edit information

Combine the edit information extracted by the parsers into a single table in which each row corresponds to one edit.
The table has 4 columns:
1. page title 
2. time
3. editor name
4. current page length

#### Liberal

In [77]:
# Unpack the bundle of per-page edit-history archives into the scratch directory.
with tarfile.open('liberal.tar.gz') as tar:
    tar.extractall('temp/')

# Decompress and parse one 7z archive at a time; every parsed table is
# collected in dfs and stacked into a single frame after the loop.
dfs=[]
for i in glob.glob('temp/*.7z'):
    subprocess.call(['7z','e',i,'-otemp'])
    raw_path=i[:-3]    # archive path minus its '.7z' extension
    clean_path=i[:-7]  # raw_path minus its last four characters; receives the CR-free copy
    try:
        df=pd.read_csv(raw_path, sep='|', parse_dates=[0], names=['time','user','byte'],usecols=[0,1,2])
        # Parsing check: a failure here (or in read_csv above) falls through
        # to the carriage-return clean-up path below.
        pd.to_datetime(df['time'])
    except Exception:
        # Strip carriage returns in Python instead of piping through `tr` in a
        # shell: the file name was interpolated unquoted into the command, so
        # any name containing whitespace or shell metacharacters would break
        # it or be interpreted by the shell.
        with open(raw_path,'rb') as src:
            payload=src.read()
        with open(clean_path,'wb') as dst:
            dst.write(payload.replace(b'\r',b''))
        df=pd.read_csv(clean_path, sep='|', parse_dates=[0], names=['time','user','byte'],usecols=[0,1,2])
    # Reduce the size field to a bare number: drop the 'bytes'/'byte' unit,
    # the surrounding parentheses and spaces, the thousands separators and the
    # word 'empty' (leaving an empty string for to_numeric to handle).
    size_text=df['byte'].str.replace('bytes','').str.strip('() ')
    size_text=size_text.str.replace(',','').str.replace('empty','').str.replace('byte','')
    df['byte']=pd.to_numeric(size_text)
    # Title = file name without its last 15 characters, underscores turned into spaces.
    df['title']=i.split('/')[-1][:-15].replace('_',' ')
    dfs.append(df)

df=pd.concat(dfs,ignore_index=True)

# Save the combined edit table as a tab-separated file without the index column.
df.to_csv('Data/liberal_pages.tsv',sep='\t',index=False)

#### Conservative

In [88]:
# Unpack the bundle of per-page edit-history archives into the scratch directory.
with tarfile.open('conservative.tar.gz') as tar:
    tar.extractall('temp/')

# Decompress and parse one 7z archive at a time; every parsed table is
# collected in dfs and stacked into a single frame after the loop.
dfs=[]
for i in glob.glob('temp/*.7z'):
    # The archives are globbed from temp/ and the extracted text is read back
    # from temp/ below, so 7z must extract into that same directory
    # ('-o..temp' would create and fill a separate directory named '..temp').
    subprocess.call(['7z','e',i,'-otemp'])
    raw_path=i[:-3]    # archive path minus its '.7z' extension
    clean_path=i[:-7]  # raw_path minus its last four characters; receives the CR-free copy
    try:
        df=pd.read_csv(raw_path, sep='|', parse_dates=[0], names=['time','user','byte'],usecols=[0,1,2])
        # Parsing check: a failure here (or in read_csv above) falls through
        # to the carriage-return clean-up path below.
        pd.to_datetime(df['time'])
    except Exception:
        # Strip carriage returns in Python instead of piping through `tr` in a
        # shell: the file name was interpolated unquoted into the command, so
        # any name containing whitespace or shell metacharacters would break
        # it or be interpreted by the shell.
        with open(raw_path,'rb') as src:
            payload=src.read()
        with open(clean_path,'wb') as dst:
            dst.write(payload.replace(b'\r',b''))
        df=pd.read_csv(clean_path, sep='|', parse_dates=[0], names=['time','user','byte'],usecols=[0,1,2])
    # Reduce the size field to a bare number: drop the 'bytes'/'byte' unit,
    # the surrounding parentheses and spaces, the thousands separators and the
    # word 'empty' (leaving an empty string for to_numeric to handle).
    size_text=df['byte'].str.replace('bytes','').str.strip('() ')
    size_text=size_text.str.replace(',','').str.replace('empty','').str.replace('byte','')
    df['byte']=pd.to_numeric(size_text)
    # Title = file name without its last 15 characters, underscores turned into spaces.
    df['title']=i.split('/')[-1][:-15].replace('_',' ')
    dfs.append(df)

df=pd.concat(dfs,ignore_index=True)

# Save the combined edit table as a tab-separated file without the index column.
df.to_csv('Data/conservative_pages.tsv',sep='\t',index=False)

#### Social issues

In [121]:
# Unpack the social-issue page tables into the scratch directory.
with tarfile.open('social_issue_pages.tar.gz') as tar:
    tar.extractall('temp/')

# Load every file found in the scratch directory as a tab-separated table.
dfs=[]
for path in glob.glob('temp/*'):
    dfs.append(pd.read_csv(path, sep='\t'))

# Stack the tables under a fresh index and save the result without the index column.
df=pd.concat(dfs,ignore_index=True)
df.to_csv('Data/social_issue_pages2.tsv',sep='\t',index=False)

#### Science

In [3]:
# Unpack the science page tables into the scratch directory.
with tarfile.open('science_pages.tar.gz') as tar:
    tar.extractall('temp/')

# Read each extracted file as a tab-separated table, parsing the second
# column as dates.
paths=glob.glob('temp/*')
dfs=[]
dfs.extend(pd.read_csv(path, sep='\t', parse_dates=[1]) for path in paths)

# Combine everything into one frame with a fresh index and write it out
# without the index column.
df=pd.concat(dfs,ignore_index=True)
df.to_csv('Data/science_pages.tsv',sep='\t',index=False)

### Quality

In [142]:
# Unpack the social-issue quality archive into the scratch directory under ../Data.
with tarfile.open('../Data/social_issue_quality.tar.gz') as tar:
    tar.extractall('../Data/temp/')

In [143]:
# Load every file in the scratch directory as a tab-separated table.
dfs=[]
for path in glob.glob('../Data/temp/*'):
    dfs.append(pd.read_csv(path, sep='\t'))

In [144]:
# Stack the tables held in dfs into one frame with a fresh 0..n-1 index.
df=pd.concat(dfs,ignore_index=True)

In [146]:
# Write the combined table as a tab-separated file, leaving out the index column.
df.to_csv('../Data/social_issue_quality3.tsv',sep='\t',index=False)

### TF-IDF

The combined table covers all 3 corpora.

In [2]:
# Unpack the per-article word-frequency tables into the scratch directory.
with tarfile.open('article_tfidf.tar.gz') as tar:
    tar.extractall('temp/')

dfs=[pd.read_csv(i, sep='\t') for i in glob.glob('temp/*')]
df=pd.concat(dfs,ignore_index=True)

# Drop incomplete rows and make sure the counts are numeric.
df=df.dropna()
df['freq']=pd.to_numeric(df['freq'])

# Give every distinct title and word a dense integer id.  enumerate() keeps
# this portable across Python 2 and 3 (xrange exists only on Python 2).
titles=list(set(df['title']))
title2id=dict((title,k) for k,title in enumerate(titles))
id2title=np.array(titles)

words=list(set(df['word']))
word2id=dict((word,k) for k,word in enumerate(words))

# Dict-based Series.map performs the id lookups without calling a Python
# lambda once per row.
row=df['title'].map(title2id)
col=df['word'].map(word2id)

# Title-by-word matrix of frequencies.
A=ssp.csr_matrix((df['freq'],(row,col)), shape=[len(title2id), len(word2id)])

# Inverse document frequency per word:
# log(number of titles) - log(number of titles with a positive count).
doc_count=np.asarray((A>0).sum(axis=0)).ravel()
d=np.log(A.shape[0])-np.log(doc_count)

# Scale each word column by its idf weight, then sum the weighted
# frequencies of every title into a single score.
A=A*ssp.diags(d)
r=np.asarray(A.sum(axis=1)).ravel()

# Ids are 0..n-1 in the same order as id2title, so indexing id2title by the
# id column attaches the matching title to every score.
tfidf=pd.DataFrame({'id':np.arange(A.shape[0]),'tfidf':r})
tfidf['title']=id2title[tfidf['id'].values]

tfidf.to_csv('Data/article_tfidf.tsv',sep='\t',index=False)

###  Word radius

In [10]:
# Unpack the article word-radius tables into the scratch directory.
with tarfile.open('article_radius.tar.gz') as tar:
    tar.extractall('temp/')

# Read every file in the scratch directory as a tab-separated table.
paths=glob.glob('temp/*')
dfs=list(map(lambda path: pd.read_csv(path, sep='\t'), paths))

# Stack the tables under a fresh index and write them out without the index column.
df=pd.concat(dfs,ignore_index=True)
df.to_csv('Data/article_radius.tsv',sep='\t',index=False)

## Talk Pages

Each combined table contains information for all 3 corpora.

### Basic edit information

In [29]:
# Unpack the talk-page edit tables into the scratch directory.
with tarfile.open('Data/talk_pages.tar.gz') as tar:
    tar.extractall('temp/')

# Read each extracted file as a tab-separated table, parsing the second
# column as dates.
dfs=[]
for path in glob.glob('temp/*'):
    table=pd.read_csv(path, sep='\t', parse_dates=[1])
    dfs.append(table)

# Combine into one frame with a fresh index and save without the index column.
df=pd.concat(dfs,ignore_index=True)
df.to_csv('../Data/talk_pages3.tsv',sep='\t',index=False)

### TF-IDF

In [9]:
# Unpack the per-talk-page word-frequency tables into the scratch directory.
with tarfile.open('talk_page_tfidf3.tar.gz') as tar:
    tar.extractall('temp/')

dfs=[pd.read_csv(i, sep='\t') for i in glob.glob('temp/*')]
df=pd.concat(dfs,ignore_index=True)

# Drop incomplete rows and make sure the counts are numeric.
df=df.dropna()
df['freq']=pd.to_numeric(df['freq'])

# Give every distinct title and word a dense integer id.  enumerate() keeps
# this portable across Python 2 and 3 (xrange exists only on Python 2).
titles=list(set(df['title']))
title2id=dict((title,k) for k,title in enumerate(titles))
id2title=np.array(titles)

words=list(set(df['word']))
word2id=dict((word,k) for k,word in enumerate(words))

# Dict-based Series.map performs the id lookups without calling a Python
# lambda once per row.
row=df['title'].map(title2id)
col=df['word'].map(word2id)

# Title-by-word matrix of frequencies.
A=ssp.csr_matrix((df['freq'],(row,col)), shape=[len(title2id), len(word2id)])

# Inverse document frequency per word:
# log(number of titles) - log(number of titles with a positive count).
doc_count=np.asarray((A>0).sum(axis=0)).ravel()
d=np.log(A.shape[0])-np.log(doc_count)

# Scale each word column by its idf weight, then sum the weighted
# frequencies of every title into a single score.
A=A*ssp.diags(d)
r=np.asarray(A.sum(axis=1)).ravel()

# Ids are 0..n-1 in the same order as id2title, so indexing id2title by the
# id column attaches the matching title to every score.
tfidf=pd.DataFrame({'id':np.arange(A.shape[0]),'tfidf':r})
tfidf['title']=id2title[tfidf['id'].values]
# The numeric id is only a construction aid; it is not written to the output.
del tfidf['id']

tfidf.to_csv('Data/talk_page_tfidf3.tsv',sep='\t',index=False)

### Policy

In [47]:
# Unpack the talk-page policy tables into the scratch directory.
with tarfile.open('talk_page_policy3.tar.gz') as tar:
    tar.extractall('temp/')

# Load every file in the scratch directory as a tab-separated table.
dfs=[]
for path in glob.glob('temp/*'):
    dfs.append(pd.read_csv(path, sep='\t'))

# Stack the tables under a fresh index and save them without the index column.
df=pd.concat(dfs,ignore_index=True)
df.to_csv('Data/talk_page_policy.tsv',sep='\t',index=False)

### Attack score

In [3]:
# Unpack the talk-page attack-score tables into the scratch directory.
with tarfile.open('talk_page_attack.tar.gz') as tar:
    tar.extractall('temp/')

# Load every file in the scratch directory as a tab-separated table and
# stack them under a fresh index.
dfs=[]
for path in glob.glob('temp/*'):
    dfs.append(pd.read_csv(path, sep='\t'))
df=pd.concat(dfs,ignore_index=True)

# Take the median of the remaining columns within each title, then turn the
# title back into an ordinary column.
df2=df.groupby('title').median().reset_index()

df2.to_csv('Data/talk_page_attack3.tsv', sep='\t', index=False)

###  Word radius

In [15]:
# Unpack the talk-page word-radius tables into the scratch directory.
with tarfile.open('talk_page_radius4.tar.gz') as tar:
    tar.extractall('temp/')

# Read every file in the scratch directory as a tab-separated table.
paths=glob.glob('temp/*')
dfs=[]
dfs.extend(pd.read_csv(path, sep='\t') for path in paths)

# Combine into one frame with a fresh index and write it out without the index column.
df=pd.concat(dfs,ignore_index=True)
df.to_csv('Data/talk_page_radius4.tsv',sep='\t',index=False)