Permalink
Browse files

New tool: Article-editor intersection.

  • Loading branch information...
MER-C committed Oct 6, 2017
1 parent e551939 commit 0ffa36065a09517edcbbe3b4fab7e15e0dfd77a1
@@ -0,0 +1,97 @@
<!--
@(#)editorintersection.jsp 0.01 05/10/2017
Copyright (C) 2017 MER-C
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
-->
<%@ include file="header.jsp" %>
<%@ page contentType="text/html" pageEncoding="UTF-8"
trimDirectiveWhitespaces="true" %>
<%
    // Title shown in <title>; stored as a request attribute so that other
    // page fragments can read it.
    request.setAttribute("toolname", "Article/editor intersection (beta)");

    // Newline-separated article list as submitted by the form. An absent
    // parameter (first visit) becomes the empty string; anything else is
    // HTML-sanitized because it is echoed back into the textarea.
    String submitted = request.getParameter("pages");
    String pages = (submitted == null) ? "" : ServletUtils.sanitizeForHTML(submitted);

    // Exclusion checkboxes: a checkbox is "on" whenever its parameter is
    // present in the request, regardless of its value.
    boolean noadmin = request.getParameter("noadmin") != null;
    boolean nobot = request.getParameter("nobot") != null;
    boolean noanon = request.getParameter("noanon") != null;
%>
<!doctype html>
<html>
<head>
<link rel=stylesheet href="styles.css">
<title><%= request.getAttribute("toolname") %></title>
</head>
<body>
<p>
This tool retrieves the common editors of a given set of articles. Limited to
the 1500 most recent revisions in each of 25 articles for now.
<%-- Query form. Posts back to this page. The admin and bot exclusions are
     pre-checked while no articles have been submitted (pages is empty).
     Each checkbox caption is wrapped in a label because input is a void
     element and cannot have an end tag or content. --%>
<form action="./editorintersection.jsp" method=POST>
<table>
<tr>
<td valign=top>Articles:<br>(one per line)
<td>
<textarea name=pages rows=10 required>
<%= pages %>
</textarea>
<tr>
<td>Exclude:
<td><label><input type=checkbox name=noadmin value=1<%= (pages.isEmpty() || noadmin) ? " checked" : "" %>>admins</label>
<label><input type=checkbox name=nobot value=1<%= (pages.isEmpty() || nobot) ? " checked" : "" %>>bots</label>
<label><input type=checkbox name=noanon value=1<%= noanon ? " checked" : "" %>>IPs</label>
</table>
<br>
<input type=submit value=Search>
</form>
<%
    // Results section: only rendered once a list of articles was submitted.
    if (!pages.isEmpty())
    {
        out.println("<hr>");

        // One title per line. Accept CRLF as well as bare LF line endings,
        // ignore blank lines, and cap the list at the 25 articles promised
        // in the page text (blank lines do not count towards the cap).
        // NOTE(review): pages was HTML-sanitized when it was read, so titles
        // containing characters that sanitizing rewrites may not resolve on
        // the wiki - confirm against ServletUtils.sanitizeForHTML.
        String[] pagesarray = Arrays.stream(pages.split("\\r?\\n"))
            .map(String::trim)
            .filter(title -> !title.isEmpty())
            .limit(25)
            .toArray(String[]::new);

        // Nothing but whitespace was submitted: skip the wiki queries.
        if (pagesarray.length > 0)
        {
            Wiki wiki = Wiki.createInstance("en.wikipedia.org");
            // NOTE(review): -1 presumably disables the maxlag check - confirm.
            wiki.setMaxLag(-1);
            // matches the "1500 most recent revisions" stated in the page text
            wiki.setQueryLimit(1500);

            // user => revisions made by that user across the submitted articles
            Map<String, List<Wiki.Revision>> results = ArticleEditorIntersection.articleEditorIntersection(
                wiki, pagesarray, noadmin, nobot, noanon);
            for (Map.Entry<String, List<Wiki.Revision>> entry : results.entrySet())
            {
                // one section per common editor
                out.println("<h2>" + entry.getKey() + "</h2>");
                out.println(ParserUtils.generateUserLinks(wiki, entry.getKey()));

                // group by article
                Map<String, List<Wiki.Revision>> grouppage = entry.getValue()
                    .stream()
                    .collect(Collectors.groupingBy(Wiki.Revision::getPage));
                for (Map.Entry<String, List<Wiki.Revision>> entry2 : grouppage.entrySet())
                {
                    // sub-heading "<article> (<n> edit[s])" followed by the revisions
                    List<Wiki.Revision> revs = entry2.getValue();
                    out.print("<h3>" + entry2.getKey() + " (" + revs.size() + " edit");
                    out.println(revs.size() > 1 ? "s)</h3>" : ")</h3>");
                    out.println(ParserUtils.revisionsToHTML(wiki, revs.toArray(new Wiki.Revision[revs.size()])));
                }
            }
        }
    }
%>
<%@ include file="footer.jsp" %>
@@ -18,7 +18,9 @@
<%@ page import="java.io.*" %>
<%@ page import="java.util.*" %>
<%@ page import="java.util.stream.*" %>
<%@ page import="java.time.*" %>
<%@ page import="org.wikipedia.*" %>
<%@ page import="org.wikipedia.servlets.*" %>
<%@ page import="org.wikipedia.tools.*" %>
@@ -0,0 +1,158 @@
/**
* @(#)ArticleEditorIntersection.java 0.01 05/10/2017
* Copyright (C) 2011 - 2017 MER-C
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 3
* of the License, or (at your option) any later version. Additionally
* this file is subject to the "Classpath" exception.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software Foundation,
* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
package org.wikipedia.tools;
import java.io.*;
import java.util.*;
import java.util.stream.*;
import org.wikipedia.Wiki;
/**
* This tool finds the common set of editors and corresponding revisions of a
* set of wiki pages. Useful for sockpuppet analysis. Servlet version (with
* slightly limited functionality) is available <a
* href="https://wikipediatools.appspot.com/editorintersection.jsp">here</a>.
*
* @version 0.01
* @author MER-C
*/
public class ArticleEditorIntersection
{
    // TODO
    // 1) Make offline mode fully functional, as opposed to just a test
    // 2) Add category option to servlets
    // 3) Add category option to offline mode
    // 4) Collapsible revision lists

    /**
     * Runs this program. Currently a fixed test: it analyzes the main
     * namespace members of one hard-coded English Wikipedia category
     * (excluding admins and bots) and prints, for every common editor, the
     * revision IDs grouped by article to standard output.
     *
     * @param args the command line arguments (not used)
     * @throws IOException if a network error occurs
     */
    public static void main(String[] args) throws IOException
    {
        Wiki enWiki = Wiki.createInstance("en.wikipedia.org");
        String[] articles = enWiki.getCategoryMembers("Category:Indian general election, 2009", Wiki.MAIN_NAMESPACE);
        Map<String, List<Wiki.Revision>> data = articleEditorIntersection(enWiki, articles, true, true, false);
        for (Map.Entry<String, List<Wiki.Revision>> entry : data.entrySet())
        {
            System.out.print(entry.getKey());
            System.out.println(" => {");

            // group by article
            Map<String, List<Wiki.Revision>> grouppage = entry.getValue()
                .stream()
                .collect(Collectors.groupingBy(Wiki.Revision::getPage));
            for (Map.Entry<String, List<Wiki.Revision>> entry2 : grouppage.entrySet())
            {
                System.out.print("\t" + entry2.getKey());
                System.out.print(" => ");
                for (Wiki.Revision rev : entry2.getValue())
                    System.out.print(rev.getRevid() + " ");
                System.out.println();
            }
            System.out.println("}");
        }
    }

    /**
     * Finds the set of common editors for a given set of <tt>articles</tt> on
     * <tt>wiki</tt>. A user is a common editor if they have revisions in at
     * least two different articles. Articles whose history cannot be fetched
     * because of an <tt>IOException</tt> are skipped, and revisions without a
     * user name are ignored.
     *
     * @param wiki the wiki to fetch content from
     * @param articles a list of pages to analyze for common editors
     * @param noadmin exclude admins from the analysis
     * @param nobot exclude flagged bots from the analysis
     * @param noanon exclude IPs from the analysis
     * @return a map with user => list of revisions made
     * @throws IOException if a network error occurs while fetching user
     * information (failures to fetch a page history are skipped instead)
     */
    public static Map<String, List<Wiki.Revision>> articleEditorIntersection(Wiki wiki,
        String[] articles, boolean noadmin, boolean nobot, boolean noanon) throws IOException
    {
        // fetch histories and group by user
        Map<String, List<Wiki.Revision>> results = Arrays.stream(articles)
            .flatMap(article ->
            {
                try
                {
                    return Arrays.stream(wiki.getPageHistory(article));
                }
                catch (IOException ignored)
                {
                    // best effort: an article whose history can't be
                    // fetched contributes no revisions
                    return Stream.<Wiki.Revision>empty();
                }
            })
            // Collectors.groupingBy throws NullPointerException on a null
            // key, so drop revisions that carry no user name.
            // NOTE(review): confirm when getUser() can return null.
            .filter(rev -> rev.getUser() != null)
            .collect(Collectors.groupingBy(Wiki.Revision::getUser));

        // throw out any account that appears in only one article (this
        // also covers accounts with a single revision)
        results.values().removeIf(revisions -> revisions.stream()
            .map(Wiki.Revision::getPage)
            .distinct()
            .limit(2)
            .count() < 2);

        // remove admins, bots and anons if necessary; skip the user info
        // lookup when there is nobody left to look up
        if ((noadmin || nobot || noanon) && !results.isEmpty())
        {
            // snapshot of the keys so that entries can be removed from the
            // map while iterating
            String[] users = results.keySet().toArray(new String[0]);
            Map<String, Object>[] userinfo = wiki.getUserInfo(users);
            for (int i = 0; i < users.length; i++)
            {
                if (isExcluded(userinfo[i], noadmin, nobot, noanon))
                    results.remove(users[i]);
            }
        }
        return results;
    }

    /**
     * Decides whether a user should be dropped from the results.
     *
     * @param info the entry returned by <tt>Wiki.getUserInfo</tt> for the
     * user; <tt>null</tt> is treated as an IP
     * @param noadmin exclude members of the "sysop" group
     * @param nobot exclude members of the "bot" group
     * @param noanon exclude IPs
     * @return whether the user is excluded
     */
    private static boolean isExcluded(Map<String, Object> info, boolean noadmin,
        boolean nobot, boolean noanon)
    {
        // getUserInfo returns null for IPs
        if (info == null)
            return noanon;

        String[] groups = (String[])info.get("groups");
        for (String group : groups)
        {
            if (noadmin && group.equals("sysop"))
                return true;
            if (nobot && group.equals("bot"))
                return true;
        }
        return false;
    }
}
@@ -28,6 +28,10 @@
<servlet-name>UserWatchlist</servlet-name>
<jsp-file>/WEB-INF/classes/org/wikipedia/servlets/userwatchlist.jsp</jsp-file>
</servlet>
<servlet>
<servlet-name>Article-editor intersection</servlet-name>
<jsp-file>/WEB-INF/classes/org/wikipedia/servlets/editorintersection.jsp</jsp-file>
</servlet>
<servlet-mapping>
<servlet-name>Cross-wiki linksearch</servlet-name>
@@ -53,6 +57,11 @@
<servlet-name>UserWatchlist</servlet-name>
<url-pattern>/userwatchlist.jsp</url-pattern>
</servlet-mapping>
<servlet-mapping>
<servlet-name>Article-editor intersection</servlet-name>
<url-pattern>/editorintersection.jsp</url-pattern>
</servlet-mapping>
<session-config>
<session-timeout>30</session-timeout>
</session-config>
@@ -4,7 +4,7 @@ ol { margin: 15px }
/* fix input sizes */
input[type=text] { width: 300px }
textarea { width: 500px }
textarea { width: 700px }
/* error messages */
.error { color: red; size: 24pt }

0 comments on commit 0ffa360

Please sign in to comment.