# Code from the original Sumi, Yasseri et al. (2011) paper

## Part I: Making the dataset

In [1]:
#helpers for building the dataset: reading XML dumps and reading/writing lightdump files
import pandas as pd
from bs4 import BeautifulSoup

def read_local_xml(fp):
    '''
    Loads an XML file from disk both as raw lines and as a parsed document
    :param fp: input filepath
    :return: tuple of (list of the file's lines, soup object built with the 'xml' parser)
    '''
    with open(fp, encoding='utf8') as xml_file:
        # Iterating the handle yields the same lines readlines() would
        lines = list(xml_file)
        # The parser is given the whole document as a single string
        document = BeautifulSoup("".join(lines), 'xml')
    return lines, document

def lightdump_one_article(fp, article_name):
    '''
    Reads in lightdump data and returns a list of all the lines of a single article
    :param fp: input filepath
    :param article_name: name of the article to return
    :return: list of the article's edit lines in reverse file order, or the
        string 'Article not found' when no line equals article_name
    '''
    article = []
    found = False
    # Explicit UTF-8 (as in read_local_xml) so titles and usernames decode
    # the same way regardless of the platform's default encoding
    with open(fp, encoding='utf8') as fh:
        for line in fh:
            if found:
                # Edit lines start with '^'; the first line that does not ends the article
                if not line.startswith('^'):
                    break
                article.append(line.strip())
            elif line.strip() == article_name:
                found = True
    if not found:
        return 'Article not found'

    return article[::-1]

def xml_to_dfs(fp):
    '''
    Reads in an XML file and writes the data into DataFrames
    :param fp: input filepath
    :return: list of article titles (spaces replaced by underscores), list of
        corresponding DataFrames with columns timestamp, revert, editNumber, contributor
    '''
    columns = ['timestamp', 'revert', 'editNumber', 'contributor']

    with open(fp, encoding='utf8') as file:
        soup = BeautifulSoup(file.read(), 'xml')

    titles = []
    dfs = []

    for page in soup.findAll('page'):
        # NOTE(review): only <text> elements that have a string are kept, while
        # timestamps/contributors are not filtered; if a revision has empty text
        # the three lists may fall out of step - confirm against real dumps.
        page_text = page.findAll('text', string = True)
        text_hash = list(map(hash, page_text))
        contributor_list = page.findAll('contributor')
        timestamps = page.findAll('timestamp')

        # hash -> edit number given to the first revision with that content
        first_seen = {}
        next_edit_number = 1
        rows = []

        for i, digest in enumerate(text_hash):
            if digest in first_seen:
                # Same content as an earlier revision: a revert to that edit
                revert = 1
                edit_number = first_seen[digest]
            else:
                revert = 0
                edit_number = next_edit_number
                first_seen[digest] = edit_number
                next_edit_number += 1

            # Registered users carry <username>; anonymous edits carry <ip>
            contributor = contributor_list[i]
            name_tag = contributor.find('username')
            if name_tag is None:
                name_tag = contributor.find('ip')

            rows.append({
                'timestamp': timestamps[i].text,
                'revert': revert,
                'editNumber': edit_number,
                'contributor': name_tag.text,
            })

        # Build the frame once per page; DataFrame.append no longer exists in
        # pandas 2.x and was quadratic when called per row.
        dfs.append(pd.DataFrame(rows, columns = columns, dtype = object))
        titles.append(page.find('title').text.replace(' ', '_'))

    return titles, dfs

def create_line(row):
    '''
    Formats one edit record as a lightdump line
    :param row: mapping or Series with 'timestamp', 'revert', 'editNumber' and 'contributor' entries
    :return: "^^^_" followed by the four values, each converted with str() and separated by single spaces
    '''
    ordered_fields = ('timestamp', 'revert', 'editNumber', 'contributor')
    return "^^^_" + " ".join(str(row[field]) for field in ordered_fields)

def write_lightdump(titles, dfs, fp):
    '''
    Reads in a list of titles and corresponding DataFrames
    and appends the data to a file in lightdump txt format
    :param titles: list of titles
    :param dfs: list of corresponding DataFrames
    :param fp: output txt file path (opened in append mode)
    '''
    # One handle for the whole run. Lines are written directly rather than
    # through to_csv, which would wrap any line containing a comma or quote
    # in CSV quoting and so break the lightdump format.
    with open(fp, 'a', encoding='utf8') as file:
        for title, df in zip(titles, dfs):
            file.write(title + '\n')

            # Rows are emitted in reverse DataFrame order
            for _, row in df.iloc[::-1].iterrows():
                file.write(create_line(row) + '\n')

            # Blank line separates consecutive articles
            file.write('\n')
        
#         print("Writing page completed.")

            
def lightdump_read_n(fp, n = 100):
    '''
    Reads in up to n lightdump pages and returns a list of all titles
    read and their corresponding data as a DataFrame
    :param fp: input filepath
    :param n: maximum number of articles to read
    :return: list of article titles, list of corresponding article lightdump data
        as DataFrame (columns timestamp, revert, revision_id, user); both lists
        always have the same length
    '''
    columns = ['timestamp', 'revert', 'revision_id', 'user']
    prefix = '^^^_'
    titles = []
    dataframes = []

    def build_frame(rows):
        # Build the frame once per article; DataFrame.append no longer exists in pandas 2.x
        df = pd.DataFrame(rows, columns = columns)
        df['timestamp'] = pd.to_datetime(df['timestamp'])
        return df

    rows = None  # edits of the article being read; None before the first title

    with open(fp, encoding='utf8') as file:
        for raw_line in file:
            line = raw_line.strip()
            # Blank lines only separate articles; they are not titles
            if not line:
                continue

            if not line.startswith(prefix):
                # A new title closes the article read so far
                if rows is not None:
                    dataframes.append(build_frame(rows))
                    rows = None
                if len(titles) >= n:
                    break
                titles.append(line)
                rows = []
            elif rows is not None:
                # Edit lines seen before any title are ignored
                data = line[len(prefix):].split()
                rows.append([data[0], int(data[1]), int(data[2]), data[3]])

    # The last article has no following title line to close it
    if rows is not None:
        dataframes.append(build_frame(rows))

    return titles, dataframes


## Part II: Build features

In [2]:
#to turn raw data into features for modeling
from bs4 import BeautifulSoup as bs
import pandas as pd
import numpy as np
import sqlite3

def mstat(article):
    '''
    Calculates the M-statistic for a list of edits from an article
    :param article: list of edits, each a sequence whose index 2 is the revert
        flag (the string '1' marks a revert), index 3 the edit label and index 4
        the author; rows with fewer than 5 entries are ignored. Edits are replayed
        in list order (ld_to_sql reverses the dump order before calling).
    :return: sum, over every pair of authors who reverted each other, of the
        smaller of the two authors' edit counts, multiplied by the number of
        distinct authors involved in such pairs
    '''
    separator = "~!~"

    # author -> number of edits by that author
    edits_per_user = {}
    # labels and authors of the recorded edits, index-aligned
    labels = []
    authors = []
    # distinct "reverted~!~reverting" keys in first-seen order
    directed_pairs = []

    def latest_index(label):
        # Position of the most recently recorded edit carrying this label,
        # or None when the label was never recorded
        for position in range(len(labels) - 1, -1, -1):
            if labels[position] == label:
                return position
        return None

    ### Replay the edits ###
    for fields in article:
        if len(fields) < 5:
            continue

        author = fields[4]
        edits_per_user[author] = edits_per_user.get(author, 0) + 1

        if fields[2] == '1':
            # The revert restores the version at `restored`; the reverted
            # author is whoever wrote the version right after it.
            # NOTE(review): if the label was never recorded, `restored` is None
            # and the comparison below raises TypeError.
            restored = latest_index(int(fields[3]))
            # A revert to the latest recorded version is skipped entirely
            # (it is not recorded in labels/authors either)
            if restored >= len(labels) - 1:
                continue
            reverted_author = authors[restored + 1]
            # Self-reverts are skipped the same way
            if reverted_author == author:
                continue
            key = reverted_author + separator + author
            if key not in directed_pairs:
                directed_pairs.append(key)

        labels.append(int(fields[3]))
        authors.append(author)

    ### Keep the pairs that reverted each other ###
    mutual_pairs = set()
    mutual_users = set()
    for key in directed_pairs:
        halves = key.split(separator)
        first, second = halves[0], halves[1]
        if second + separator + first not in directed_pairs:
            continue
        # Both directions are visited; the pair is stored once, smaller name
        # first, when the direction with the larger name first is reached
        if not first < second:
            mutual_pairs.add(second + separator + first)
        mutual_users.add(second)
        mutual_users.add(first)

    ### Score ###
    total = 0
    for key in mutual_pairs:
        halves = key.split(separator)
        total += min(edits_per_user[halves[0]], edits_per_user[halves[1]])

    return total * len(mutual_users)

def ld_to_sql(ld_fp, db_fp, chunksize=5000000):
    '''
    Converts light dump from text file to tables in a SQLite database
    The 'articles' and 'edits' tables are replaced if they already exist.
    Edit lines are stored as split on spaces, so the first token keeps its
    leading '^' marker characters.
    :param ld_fp: input light dump filepath
    :param db_fp: output database filepath
    :param chunksize: number of non-empty lines to read between flushes of
        completed articles to SQL, default 5000000
    :raises ValueError: if an edit line appears before any article name line
    '''
    articles_cols = ['article_id', 'article_name', 'num_edits', 'm']
    edits_cols = ['article_id', 'timestamp', 'revert', 'edit_id', 'username']
    articles_data = []
    edits_data = []

    con = sqlite3.connect(db_fp)
    try:
        pd.DataFrame(columns=articles_cols).to_sql('articles', con, if_exists='replace', index=False)
        pd.DataFrame(columns=edits_cols).to_sql('edits', con, if_exists='replace', index=False)

        def flush():
            # Append everything buffered so far and empty the buffers
            pd.DataFrame(articles_data, columns=articles_cols).to_sql(
                'articles', con, if_exists='append', index=False)
            pd.DataFrame(edits_data, columns=edits_cols).to_sql(
                'edits', con, if_exists='append', index=False)
            articles_data.clear()
            edits_data.clear()

        def finish_article(article_id, article_name, edits):
            # Reverse the dump order, score the article, and buffer its rows
            edits = edits[::-1]
            num_edits = len(edits)
            # Articles with fewer than 3 edits get M = 0 without being scored
            m = mstat(edits) if num_edits >= 3 else 0
            articles_data.append([article_id, article_name, num_edits, m])
            edits_data.extend(edits)

        article_id = 0
        article_name = None            # None until the first article name line
        current_article = []
        num_lines = 0                  # number of non-empty lines read

        with open(ld_fp, encoding='utf8') as fh:
            for line in fh:
                line = line.strip()
                if not line:
                    continue

                # append to sql
                if num_lines and num_lines % chunksize == 0:
                    flush()

                if line[0] != '^':
                    # article name line: close the previous article, if any
                    if article_name is not None:
                        finish_article(article_id, article_name, current_article)
                        article_id += 1
                    article_name = line
                    current_article = []
                else:
                    if article_name is None:
                        raise ValueError('edit line found before any article name: ' + line)
                    # maxsplit keeps a username containing spaces in one field
                    current_article.append([article_id] + line.split(' ', 3))
                num_lines += 1

        # final article (absent when the file has no article name lines)
        if article_name is not None:
            finish_article(article_id, article_name, current_article)

        flush()
        con.commit()
    finally:
        con.close()
        
def query_articles(db_fp, N=None):
    '''
    Queries articles from the SQL database
    :param db_fp: input database filepath
    :param N: Number of articles to query, default all articles (a falsy N also returns all)
    :return: dataframe of articles read in, with 'm' and 'num_edits' cast to int
    '''
    conn = sqlite3.connect(db_fp)
    try:
        if N:
            # Bind the limit as a parameter instead of formatting it into the SQL text
            df = pd.read_sql('select * from articles limit ?', conn, params=(int(N),))
        else:
            df = pd.read_sql('select * from articles', conn)
    finally:
        # The frame is fully loaded by now, so the connection can be released
        conn.close()
    df['m'] = df['m'].astype(int)
    df['num_edits'] = df['num_edits'].astype(int)
    return df

## Part III: Visualize

In [3]:
#creating explanatory and results oriented visualizations
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

def counts_vs_m_distribution_plots(articles_df, outdir):
    '''
    Creates violin plots of the 20 highest-M articles, split by whether each
    is also among the 20 most-edited articles
    :param articles_df: dataframe of articles from sql database
    :param outdir: output directory for plots
    '''
    fig, axes = plt.subplots(1, 2, figsize=(15, 5))
    
    #top 20 edited articles
    top_20_edited = articles_df.sort_values('num_edits', ascending = False).head(20)

    #top 20 m-scores; copied so the flag column below is added to an
    #independent frame rather than to a slice of the sorted data
    top_20_m = articles_df.sort_values('m', ascending = False).head(20).copy()
    
    top_20_m['top 20 edited'] = top_20_m['article_name'].isin(top_20_edited['article_name'].tolist())
    sns.violinplot(data = top_20_m, x = 'top 20 edited', y = 'm', ax=axes[0])
    sns.violinplot(data = top_20_m, x = 'top 20 edited', y = 'num_edits', ax=axes[1])
    
    fig.savefig(os.path.join(outdir, 'counts_vs_m_violin.png'))
    
def nonzero_distribution_plots(articles_df, outdir):
    '''
    Plots the distribution of log(m), as a histogram and as a violin plot, for articles whose M is above zero
    :param articles_df: dataframe of articles from sql database
    :param outdir: output directory for plots
    '''
    fig, (hist_ax, violin_ax) = plt.subplots(1, 2, figsize=(15, 5))

    # keep only articles with a positive M, then add the natural log of M
    positive_m = articles_df['m'] > 0
    nonzero = articles_df.copy().loc[positive_m]
    nonzero['log_m'] = np.log(nonzero['m'])

    # histogram on the left axis, violin on the right
    sns.distplot(nonzero['log_m'], ax=hist_ax)
    sns.violinplot(nonzero['log_m'], ax=violin_ax)

    out_path = os.path.join(outdir, 'nonzero_distribution.png')
    fig.savefig(out_path)
    
def m_div_counts_distribution_plots(articles_df, outdir):
    '''
    Creates a histogram and violin plots of the distributions of (log(m) / edit counts)
    for articles with nonzero M and nonzero edit counts
    :param articles_df: dataframe of articles from sql database
    :param outdir: output directory for plots
    '''
    fig, axes = plt.subplots(1, 2, figsize=(15, 5))
    
    # Filter first so the log is only taken of positive M values; taking it
    # over every row produced -inf and divide-by-zero warnings for M == 0.
    keep = (articles_df['m'] > 0) & (articles_df['num_edits'] > 0)
    non_x2 = articles_df.loc[keep].copy()
    non_x2['log_m'] = np.log(non_x2['m'])
    non_x2['log_m/edits'] = non_x2['log_m']/non_x2['num_edits']
    
    # plot histogram
    sns.distplot(non_x2['log_m/edits'], ax=axes[0])
    
    # plot violin
    sns.violinplot(non_x2['log_m/edits'], ax=axes[1])
    
    fig.savefig(os.path.join(outdir, 'log_m_div_counts_distribution.png'))
    
def counts_vs_m_scatter_plot(articles_df, outdir):
    '''
    Creates a scatterplot with regression fit comparing edit counts to M-stat for articles with nonzero M
    :param articles_df: dataframe of articles from sql database
    :param outdir: output directory for plot
    '''
    nonzero = articles_df.copy().loc[articles_df['m'] > 0]
    # Draw on a dedicated figure. Without an explicit axes, regplot uses the
    # current axes, which may belong to a figure left open by an earlier plot.
    fig, ax = plt.subplots()
    sns.regplot(data = nonzero, x = 'num_edits', y = 'm', ax = ax)
    fig.savefig(os.path.join(outdir, 'counts_vs_m_scatter.png'))
    
def descriptive_stats(articles_df, outdir):
    '''
    Writes describe() summaries of the 20 and 100 highest M-stat articles to CSV
    :param articles_df: dataframe of articles from sql database
    :param outdir: output directory for the CSV files
    '''
    ranked = articles_df.sort_values('m', ascending = False)
    outputs = ((20, 'top_20_stats.csv'), (100, 'top_100_stats.csv'))
    for top_n, filename in outputs:
        summary = ranked.head(top_n).describe()
        summary.to_csv(os.path.join(outdir, filename))

    
def generate_stats(articles_df, outdir):
    '''
    Runs every EDA output step (four plots, then the descriptive statistics tables)
    :param articles_df: dataframe of articles from sql database
    :param outdir: output directory for the generated files
    '''
    steps = (
        counts_vs_m_distribution_plots,
        nonzero_distribution_plots,
        m_div_counts_distribution_plots,
        counts_vs_m_scatter_plot,
        descriptive_stats,
    )
    # Each step takes the same two arguments and writes into outdir
    for step in steps:
        step(articles_df, outdir)

## Part IV: Combining all formerly defined functions [NOT READY TO RUN]

In [None]:
#!/usr/bin/env python

import sys
import json

sys.path.insert(0, 'src/data')
sys.path.insert(0, 'src/features')
sys.path.insert(0, 'src/visualization')

from make_dataset import *
from build_features import *
from visualize import *

def main(targets):
    '''
    Runs the pipeline stages named in targets
    Recognised targets are 'data-db' (build the database), 'eda' (generate
    plots/statistics from an existing database), and 'all' / 'test' (both
    steps, using their respective config files). Each target's config file
    is only read when that target is requested.
    :param targets: list of target names, e.g. from the command line
    '''
    def load_config(path):
        # Read one JSON config file, closing the handle afterwards
        with open(path) as fh:
            return json.load(fh)

    def build_and_report(config):
        # Full run: lightdump -> database -> EDA outputs
        lightdump_fp = config['data_fp']
        db_outfp = config['db_outfp']
        outdir = config['outdir']
        db_infp = config['db_fp']

        ld_to_sql(lightdump_fp, db_outfp)
        print('Created database from lightdump')

        articles_df = query_articles(db_infp)
        generate_stats(articles_df, outdir)
        print('Generated EDA plots on database')

    ran_any = False

    if 'data-db' in targets:
        sql_config = load_config('config/data-db-params.json')
        ld_to_sql(sql_config['wiki_fp'], sql_config['db_outfp'])
        ran_any = True

    if 'eda' in targets:
        eda_config = load_config('config/eda-params.json')
        outdir = eda_config['outdir']
        articles_df = query_articles(eda_config['db_infp'])
        generate_stats(articles_df, outdir)
        ran_any = True

    if 'all' in targets:
        # assumes english wikipedia light dump was downloaded into data/raw as 'en-wiki.txt'
        build_and_report(load_config('config/all-params.json'))
        ran_any = True

    if 'test' in targets:
        build_and_report(load_config('config/test-params.json'))
        ran_any = True

    # Only complain when no recognised target ran; previously this message was
    # printed whenever 'test' was absent, even after other targets had run.
    if not ran_any:
        print('You did not pass in any arguments!')

if __name__ == '__main__':
    # run via (targets handled by main: data-db, eda, all, test), e.g.:
    # python main.py data-db eda
    targets = sys.argv[1:]
    main(targets)


## Part V: Overview statistics of resulting data [Not run yet]

### Base Descriptive Statistics

In [None]:
from IPython.display import HTML

# Embeds a script plus a button in the notebook output: the script hides the
# 'div.input' elements once the document is ready, and the button toggles
# them between hidden and shown.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')

In [None]:
#import necessary libraries
import sys
import json
import seaborn as sns
import numpy as np

#add paths to access custom library features
sys.path.insert(0, '../src/features')

#import custom library features
from build_features import *

#load the EDA parameters
with open('../config/eda-params.json') as config_file:
    eda_config = json.load(config_file)

outdir, db_fp = eda_config['outdir'], eda_config['db_infp']

#load the articles table into a DataFrame
articles_df = query_articles(db_fp)

#make sure the columns we aggregate on are integers
for column in ('num_edits', 'm'):
    articles_df[column] = articles_df[column].astype(int)

In [None]:
#report how many articles were loaded (the message previously lacked the noun)
print("There are", str(len(articles_df)), "articles on English Wikipedia")

In [None]:
#descriptive statistics (count, mean, std, min, quartiles, max) of edits per article
articles_df['num_edits'].describe()

In [None]:
#histogram (100 bins) of edit counts for articles with more than 31 edits
#NOTE(review): 31 is presumably the 75th percentile reported by describe() above - confirm
articles_df.loc[articles_df['num_edits'] > 31]['num_edits'].hist(bins = 100)

In [None]:
#descriptive statistics of the M-statistic over all articles
articles_df['m'].describe()

In [None]:
#descriptive statistics of the M-statistic restricted to articles with M > 0
articles_df.loc[articles_df['m'] > 0]['m'].describe()

In [None]:
#page counts by M-statistic and edit count
total_pages = len(articles_df)
m_zero = articles_df.loc[articles_df['m'] == 0]
_positive_m_and_edits = (articles_df['m'] > 0) & (articles_df['num_edits'] > 0)
non_zero_m_edits = len(articles_df.loc[_positive_m_and_edits])

print("Total English Wikipedia pages:", total_pages)
print("\n")

m_zero_with_edits = m_zero.loc[m_zero['num_edits'] > 0]
print("Pages with M-stat of 0:", len(m_zero))
print("Pages with M-stat of 0 and edit count > 0:",
     len(m_zero_with_edits))
print("\n")

proportion = non_zero_m_edits/total_pages
print("Pages with M-stat and edit count > 0:", non_zero_m_edits)
print("Proportion of pages with M-stat and edit count > 0:",
      proportion)

In [None]:
#natural log of M for the articles with a positive M-statistic
log_m = np.log(articles_df[articles_df['m'] > 0]['m'])

#the plotted quantity is log(M) itself, so the title says so (it previously
#carried the title of the edits/M ratio plot)
hs = sns.distplot(log_m)
hs.set_title('Histogram of (log) M-statistic Distribution',
             fontsize=12)

In [None]:
#violin plot of the same log(M) values
vp = sns.violinplot(log_m)
vp.set_title('Distribution of (log) M-statistic', fontsize=12)

In [None]:
#articles with a positive M-statistic
nonzero = articles_df.loc[articles_df['m'] > 0]

In [None]:
#scatter plot with regression fit of M against edit count, positive-M articles only
sc = sns.regplot(data = nonzero, x = 'num_edits', y = 'm')
sc.set_title('Number of Edits v. M-statistic')

### Top 100 M-statistic Articles

In [None]:
#the 100 articles with the most edits
top_100_edited = articles_df.sort_values('num_edits', ascending = False).head(100)

#the 100 articles with the highest M-statistic
top_100_m = articles_df.sort_values('m', ascending = False).head(100)

#first 100 rows when sorting by ascending M, then descending edit count
top_100_zero = articles_df.sort_values(['m', 'num_edits'], ascending = [True, False]).head(100)

In [None]:
#summary of top-100-M articles that are also among the 100 most edited
top_100_m.loc[top_100_m['article_name'].isin(top_100_edited['article_name'].tolist())].describe()

In [None]:
#summary of top-100-M articles that are NOT among the 100 most edited
top_100_m.loc[~top_100_m['article_name'].isin(top_100_edited['article_name'].tolist())].describe()

In [None]:
#preview the ten most-edited articles; this notebook defines top_100_edited
#(top_20_edited does not exist here and raised NameError)
top_100_edited.head(10)

In [None]:
#the plotted quantity is log(M) divided by edit count, so label it that way
hs = sns.distplot(log_m/nonzero['num_edits'], axlabel = "log(M-stat)/number of edits")
hs.set_title('Histogram of (log) M-statistic/Number of Edits Ratio Distribution',
             fontsize=12);

In [None]:
#violin plot of log(M) divided by edit count; title matches the plotted ratio
vp = sns.violinplot(log_m/nonzero['num_edits'])
vp.set_title('Distribution of (log) M-statistic/Number of Edits Ratio',
             fontsize=12);