Skip to content

Commit

Permalink
Attempting to resolve conflicts
Browse files Browse the repository at this point in the history
  • Loading branch information
drewconway committed Nov 6, 2011
2 parents 34223b0 + 9d4412c commit 065914b
Show file tree
Hide file tree
Showing 11 changed files with 356 additions and 84 deletions.
11 changes: 10 additions & 1 deletion SF_2011/Benetech/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,13 @@
.RData
.RHistory
*.tex
*.log
*.log
*.aux
*.out
*.nav
*.pdf
*.pdfsync
*.snm
*.synctex.gz
*.toc
*.swp
36 changes: 32 additions & 4 deletions SF_2011/Benetech/R/benetech_report.rnw
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
% benetech_report.rnw
%
% Created on 2011-11-05
%
%
%
\documentclass[xcolor=dvipsnames, 9pt,handout]{beamer}

Expand Down Expand Up @@ -57,16 +57,44 @@
\institute{\includegraphics[width = 3cm]{Benetech-Logo-tag.jpg}}

\begin{document}

% The data frame is called 'benetech'
<<echo=false>>=
source('load_data.R')
@

\begin{frame}[plain]
\titlepage
\titlepage
\end{frame}

\begin{frame}[plain]
\frametitle{Average account longevity}
\begin{block}{Question}
How does (date.last.saved - date.authorized) vary as a function of date.authorized?
\end{block}
Some notes about the dataset:
\begin{itemize}
\item Many authorized dates are NA:
% FIX: the chunk delimiter had three '<' ("<<<echo=..."), which Sweave does
% not recognise as a code chunk; also tabulate the logical vector and print()
% the xtable with results=tex so the LaTeX table actually reaches the output.
% NOTE(review): 'wdf' is built in q5.R, not in load_data.R sourced above --
% confirm it is in scope when this report is woven.
<<echo=TRUE, eval=TRUE, results=tex>>=
print(xtable(table(is.na(wdf$authorized_date))))
@
\item Some last saved at dates are before the authorized dates:
\item $\Sexpr{round(3/2)}$ % placeholder \Sexpr{} sanity check -- remove before release?
\end{itemize}
\end{frame}

% FIX: removed unresolved git conflict markers ("<<<<<<< HEAD" / "=======" /
% ">>>>>>> 9d4412c...") that would break the LaTeX/Sweave build. The HEAD side
% of the conflict was empty, so the incoming frame is kept as-is.
\begin{frame}
\frametitle{Global Martus Use}
<<echo=false, include=false>>=
source('11_map_report.R')
@
\includegraphics[width=11cm]{benetech_report-map_plot.pdf} \\
As of \today
\end{frame}
\begin{frame}{Database Usage Statistics}

\begin{table}[ht]
Expand Down Expand Up @@ -155,4 +183,4 @@ As of \today
\end{frame}


\end{document}
\end{document}
7 changes: 7 additions & 0 deletions SF_2011/Benetech/R/common_summaries.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Extract the non-empty, English-language summaries from `benetech` (the main
# data frame, loaded elsewhere, e.g. via load_data.R) and dump them one per
# line -- no header, no quoting -- for the shell/python word-count pipeline.
with_summaries <- subset(benetech, nchar(summary) > 0)
with_english_summaries <- subset(with_summaries, language == "en")

# FIX: write.csv() ignores a user-supplied col.names (it always emits a
# header row and warns), so the output started with a spurious "x" line;
# write.table() honours col.names = FALSE.
write.table(with_english_summaries$summary, "with-summaries.csv",
            col.names = FALSE, row.names = FALSE, quote = FALSE, sep = ",")

# then use
# cat with-summaries.csv | tr " " "\n" | sort | uniq -c
# (FIX: was plain `uniq`, but common_words.py parses "COUNT WORD" pairs,
# which only `uniq -c` produces)
# then invoke common_words.py
4 changes: 2 additions & 2 deletions SF_2011/Benetech/R/load_data.R
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
# File-Name: load_data.R
# Date: 2011-11-05
# Author: Drew Conway
# Email: drew.conway@nyu.edu
# Email: drew.conway@nyu.edu
# Purpose: Load and clean the Benetech data
# Data Used: ../data/martus-bullacct-4datadive-2011-11-03.csv
# Machine: Drew Conway's MacBook Pro

# Copyright (c) 2011, under the Simplified BSD License.
# Copyright (c) 2011, under the Simplified BSD License.
# For more information on FreeBSD see: http://www.opensource.org/licenses/bsd-license.php
# All rights reserved.

Expand Down
Empty file.
2 changes: 2 additions & 0 deletions SF_2011/Benetech/R/q12.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# count of accounts that were only active for 1 day
# by organization / magic word group / server and grand total
100 changes: 100 additions & 0 deletions SF_2011/Benetech/R/q5.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
# average account longevity
# look at the distribution of date.authorized vs. (date.last.saved - date.authorized)

# loading in a small dataset rather than the whole thing for now
require("plyr")
require("lattice")
require("xtable")
require("ggplot2")  # FIX: ggplot()/opts()/theme_text()/round_any() plots below need ggplot2

source("load_data.R")

# 'working data frame' -- just the columns this analysis needs, renamed to
# underscore style
wdf <- with(benetech, data.frame(
    public_code = public.code,
    authorized_date = date.authorized,
    last_saved_date = date.last.saved))

# getting time differences ("longevity" = last save minus authorization)
# NOTE(review): storing a POSIXlt in a data-frame column is fragile (it is a
# list underneath); it works for the $mon/$year extraction below but consider
# keeping it as a separate vector.
wdf$authorized_date_lt <- as.POSIXlt(wdf$authorized_date)
wdf$time_diff <- with(wdf, last_saved_date - authorized_date)
wdf$time_diff_int <- as.integer(wdf$time_diff)

# getting the month, year, quarter and halves
# NOTE: POSIXlt$mon is 0-based (Jan = 0 ... Dec = 11)
wdf <- transform(wdf,
    authorized_date_month = authorized_date_lt$mon,
    authorized_date_year = authorized_date_lt$year + 1900)

# FIX: ceiling(mon / 4) on the 0-based month labelled January "Q0" and carved
# the year into 4-month "quarters"; mon %/% 3 + 1 gives the usual Q1-Q4
wdf$time_group_quarters <- paste(wdf$authorized_date_year,
    "Q", wdf$authorized_date_month %/% 3 + 1,
    sep="")

# FIX: same off-by-one for halves -- ceiling(mon / 6) labelled January "H0";
# mon %/% 6 + 1 gives H1/H2
wdf$time_group_halves <- paste(wdf$authorized_date_year,
    "H", wdf$authorized_date_month %/% 6 + 1,
    sep="")

# there are NA's in date.authorized
wdf_na_df <- data.frame(table(is.na(wdf$authorized_date)))
colnames(wdf_na_df) <- c("is.na?", "count")

# there are cases where date.last.saved - date.authorized is very negative
# some show date.authorized as epoch… discarding those for now
wdf_neg_time_diff <- data.frame(table(wdf$time_diff < 0))
colnames(wdf_neg_time_diff) <- c("negative time diff", "count")

# working set excludes NA authorized dates and negative time diffs
# FIX: use >= 0 so zero-length (saved exactly at authorization) accounts are
# kept, matching the "negative time diffs" exclusion stated above; the old
# `> 0` silently dropped them as well
working_set <- wdf[!is.na(wdf$authorized_date) & wdf$time_diff >= 0, ]
working_set$time_group_quarters <- as.factor(working_set$time_group_quarters)
working_set$time_group_halves <- as.factor(working_set$time_group_halves)

# plots: longevity in days, grouped by half-year of authorization
ggplot(data=working_set, aes(x=time_group_halves, y=time_diff_int/60/60/24)) +
    geom_point() + geom_boxplot() + opts(axis.text.x=theme_text(angle=-90))

# one-row summary (min/quantiles/mean/median/max, in seconds) of df[[colname]];
# used as the ddply worker below and for the csv export
summary_function <- function(df, colname) {
    column <- df[[colname]]
    return(
        data.frame(
            time_diff_min_seconds = min(column),
            time_diff_q2_seconds = as.numeric(quantile(column, 0.2)),
            time_diff_mean_seconds = mean(column),
            time_diff_median_seconds = median(column),
            time_diff_q8_seconds = as.numeric(quantile(column, 0.8)),
            time_diff_max_seconds = max(column)
        ))
}


# without summarizing by account
by_quarter <- ddply(working_set, .(time_group_quarters), summary_function, "time_diff_int")
by_half_year <- ddply(working_set, .(time_group_halves), summary_function, "time_diff_int")
by_year <- ddply(working_set, .(authorized_date_year), summary_function, "time_diff_int")

write.csv(by_half_year, file="longevity_summary_by_half_year.csv")

# let's examine by accounts. Do some account have much higher longevity?
# what's the count by account? we should remove the ones with low counts…
by_account <- ddply(working_set, .(public_code), nrow)
colnames(by_account) <- c("public_code", "count")

by_account_quarter <- ddply(working_set, .(public_code, time_group_quarters), summary_function, "time_diff_int")
by_account_quarter <- join(by_account, by_account_quarter)

ggplot(data=by_account_quarter, aes(x=count, y=time_diff_median_seconds/60/60)) +
    geom_point() + coord_trans(x="log10") + opts(axis.text.x=theme_text(angle=-90))

ggplot(data=by_account_quarter, aes(x=count, y=time_diff_mean_seconds/60/60)) +
    geom_point() + geom_boxplot(aes(group=round_any(log10(count), 1))) +
    opts(axis.text.x=theme_text(angle=-90))

# huge range for accounts who use this once, pretty steady for people who use this more

# group=round_any(log10(by_account_quarter$count), 1)
# with(working_set, plot(time_diff ~ authorized_date))
# xyplot(time_diff ~ authorized_date, data=working_set, scales="free")
#
# ggplot(data=working_set, aes(x=authorized_date, y=time_diff, group=round_a)) + geom_boxplot()


# graveyard
# wdf$authorized_date_group <- apply(wdf$authorized_date_lt, get_date_group_2)
23 changes: 23 additions & 0 deletions SF_2011/Benetech/python/common_words.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Build a {word: count} map from `sort | uniq -c` style lines ("  COUNT WORD")
# and show the 50 most frequent words.
import numpy

# NOTE(review): `body` is not defined in this file -- presumably an iterable
# of lines from with-summaries.csv after `tr " " "\n" | sort | uniq -c`;
# confirm how it is supplied before running non-interactively.
word_to_count = {}
for line in body:
    count_and_word = line.lstrip().rstrip().split(" ")
    # FIX: the count was kept as a string, so argsort() below compared counts
    # lexicographically ("9" > "10") and corrupted the ranking; parse it.
    count = int(count_and_word[0])
    if len(count_and_word) == 2:
        word = count_and_word[1]
    else:
        word = ""  # a count with no trailing word (e.g. blank token)
    word_to_count[word] = count

# 50 most common words, least- to most-frequent. list() keeps keys/values in
# matching order and works on both Python 2 lists and Python 3 dict views.
# (removed a bare `word_to_count.values` expression here -- it was a no-op)
numpy.array(list(word_to_count.keys()))[numpy.argsort(list(word_to_count.values()))][-50:]
# Output of a previous run (pre-fix, lexicographic count sort):
# array(['formed', 'Thanpyuzayart', 'ya', 'fruit', '20', 'following',
#        'Division', 'Time)', 'what', 'Operation', '2007', 'endure', 'been',
#        'leader', 'most', 'demanded', 'participation', 'physical', 'arrest',
#        'should', 'tried', 'Kaw', 'when', 'as', 'Nam', '2011', 'taking',
#        'place', 'sent', 'Zaw', 'Tun', 'over', 'Namkham', 'Tin', 'health',
#        'outside', 'I', 'While', 'Moe', 'Lay', 'prisons', 'Three',
#        'further', 'reported', '(First', 'according', "don't", 'If', 'rice',
#        'and'],
#       dtype='|S78')
37 changes: 37 additions & 0 deletions SF_2011/Benetech/python/parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# parses the files into (file_name, public_id, bulletin_id)
# to be used after
# wget -l 1 -m -c "https://martus.ceu.hu/servlet/DoSearch?typeOfSearch=quickSearchAll"
import re
import csv
import os

out_file = file("url_public_id.csv", "w")

for file_name in os.listdir("."):
if (re.compile("^FoundBulletin").match(file_name)):
id = file_name.split("FoundBulletin?index=")[1].split("&")[0]
try:
body = file(file_name, "r").read()
title = body.split("<strong>")[1].split("</strong>")[0]
public_id = body.split("<!--Account Public Code = ")[1].split(" -->")[0]
bulletin_id = body.split("<!--Bulletin Local Id = ")[1].split(" -->")[0]
out_file.write("%s,%s,%s\n" % (title, public_id, bulletin_id))
except:
print "Cannot parse %s" % file_name

out_file.close()


for link in br.links(url_regex="FoundBulletin*"):
response = br.follow_link(link)
html_response = response.read()
title = html_response.split("<strong>")[1].split("</strong>")[0]
public_id = html_response.split("<!--Account Public Code = ")[1].split(" -->")[0]
bulletin_id = html_response.split("<!--Bulletin Local Id = ")[1].split(" -->")[0]
print (title, public_id, bulletin_id)
print(html_response)
out_file.write("%s,%s,%s\n" % (title, public_id, bulletin_id))


out_file.close

Loading

0 comments on commit 065914b

Please sign in to comment.