Skip to content

Commit

Permalink
Attempting to resolve conflicts
Browse files Browse the repository at this point in the history
  • Loading branch information
drewconway committed Nov 6, 2011
2 parents 34223b0 + 9d4412c commit 065914b
Show file tree
Hide file tree
Showing 11 changed files with 356 additions and 84 deletions.
11 changes: 10 additions & 1 deletion SF_2011/Benetech/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,13 @@
.RData
.RHistory
*.tex
*.log
*.log
*.aux
*.out
*.nav
*.pdf
*.pdfsync
*.snm
*.synctex.gz
*.toc
*.swp
36 changes: 32 additions & 4 deletions SF_2011/Benetech/R/benetech_report.rnw
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
% benetech_report.rnw
%
% Created on 2011-11-05
%
%
%
\documentclass[xcolor=dvipsnames, 9pt,handout]{beamer}

Expand Down Expand Up @@ -57,16 +57,44 @@
\institute{\includegraphics[width = 3cm]{Benetech-Logo-tag.jpg}}

\begin{document}

% The data frame is called 'benetech'
<<echo=false>>=
source('load_data.R')
@

\begin{frame}[plain]
\titlepage
\titlepage
\end{frame}

\begin{frame}[plain]
\frametitle{Average account longevity}
\begin{block}{Question}
How does (date.last.saved - date.authorized) vary as a function of date.authorized?
\end{block}
Some notes about the dataset:
\begin{itemize}
\item Many authorized dates are NA:
% FIX: the chunk delimiter had three '<' ("<<<echo=..."), which Sweave does
% not recognise as a code chunk; also tabulate the logical vector and print()
% the xtable with results=tex so the LaTeX table actually reaches the output.
% NOTE(review): 'wdf' is built in q5.R, not in load_data.R sourced above --
% confirm it is in scope when this report is woven.
<<echo=TRUE, eval=TRUE, results=tex>>=
print(xtable(table(is.na(wdf$authorized_date))))
@
\item Some last saved at dates are before the authorized dates:
\item $\Sexpr{round(3/2)}$ % placeholder \Sexpr{} sanity check -- remove before release?
\end{itemize}
\end{frame}

% FIX: removed unresolved git conflict markers ("<<<<<<< HEAD" / "=======" /
% ">>>>>>> 9d4412c...") that would break the LaTeX/Sweave build. The HEAD side
% of the conflict was empty, so the incoming frame is kept as-is.
\begin{frame}
\frametitle{Global Martus Use}
<<echo=false, include=false>>=
source('11_map_report.R')
@
\includegraphics[width=11cm]{benetech_report-map_plot.pdf} \\
As of \today
\end{frame}
\begin{frame}{Database Usage Statistics}

\begin{table}[ht]
Expand Down Expand Up @@ -155,4 +183,4 @@ As of \today
\end{frame}


\end{document}
\end{document}
7 changes: 7 additions & 0 deletions SF_2011/Benetech/R/common_summaries.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Extract the non-empty, English-language summaries from `benetech` (the main
# data frame, loaded elsewhere, e.g. via load_data.R) and dump them one per
# line -- no header, no quoting -- for the shell/python word-count pipeline.
with_summaries <- subset(benetech, nchar(summary) > 0)
with_english_summaries <- subset(with_summaries, language == "en")

# FIX: write.csv() ignores a user-supplied col.names (it always emits a
# header row and warns), so the output started with a spurious "x" line;
# write.table() honours col.names = FALSE.
write.table(with_english_summaries$summary, "with-summaries.csv",
            col.names = FALSE, row.names = FALSE, quote = FALSE, sep = ",")

# then use
# cat with-summaries.csv | tr " " "\n" | sort | uniq -c
# (FIX: was plain `uniq`, but common_words.py parses "COUNT WORD" pairs,
# which only `uniq -c` produces)
# then invoke common_words.py
4 changes: 2 additions & 2 deletions SF_2011/Benetech/R/load_data.R
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
# File-Name: load_data.R
# Date: 2011-11-05
# Author: Drew Conway
# Email: drew.conway@nyu.edu
# Email: drew.conway@nyu.edu
# Purpose: Load and clean the Benetech data
# Data Used: ../data/martus-bullacct-4datadive-2011-11-03.csv
# Machine: Drew Conway's MacBook Pro

# Copyright (c) 2011, under the Simplified BSD License.
# Copyright (c) 2011, under the Simplified BSD License.
# For more information on FreeBSD see: http://www.opensource.org/licenses/bsd-license.php
# All rights reserved.

Expand Down
Empty file.
2 changes: 2 additions & 0 deletions SF_2011/Benetech/R/q12.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# count of accounts that were only active for 1 day
# by organization / magic word group / server and grand total
100 changes: 100 additions & 0 deletions SF_2011/Benetech/R/q5.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
# average account longevity
# look at the distribution of date.authorized vs. (date.last.saved - date.authorized)

# loading in a small dataset rather than the whole thing for now
require("plyr")
require("lattice")
require("xtable")
require("ggplot2")  # FIX: ggplot()/opts()/theme_text()/round_any() plots below need ggplot2

source("load_data.R")

# 'working data frame' -- just the columns this analysis needs, renamed to
# underscore style
wdf <- with(benetech, data.frame(
    public_code = public.code,
    authorized_date = date.authorized,
    last_saved_date = date.last.saved))

# getting time differences ("longevity" = last save minus authorization)
# NOTE(review): storing a POSIXlt in a data-frame column is fragile (it is a
# list underneath); it works for the $mon/$year extraction below but consider
# keeping it as a separate vector.
wdf$authorized_date_lt <- as.POSIXlt(wdf$authorized_date)
wdf$time_diff <- with(wdf, last_saved_date - authorized_date)
wdf$time_diff_int <- as.integer(wdf$time_diff)

# getting the month, year, quarter and halves
# NOTE: POSIXlt$mon is 0-based (Jan = 0 ... Dec = 11)
wdf <- transform(wdf,
    authorized_date_month = authorized_date_lt$mon,
    authorized_date_year = authorized_date_lt$year + 1900)

# FIX: ceiling(mon / 4) on the 0-based month labelled January "Q0" and carved
# the year into 4-month "quarters"; mon %/% 3 + 1 gives the usual Q1-Q4
wdf$time_group_quarters <- paste(wdf$authorized_date_year,
    "Q", wdf$authorized_date_month %/% 3 + 1,
    sep="")

# FIX: same off-by-one for halves -- ceiling(mon / 6) labelled January "H0";
# mon %/% 6 + 1 gives H1/H2
wdf$time_group_halves <- paste(wdf$authorized_date_year,
    "H", wdf$authorized_date_month %/% 6 + 1,
    sep="")

# there are NA's in date.authorized
wdf_na_df <- data.frame(table(is.na(wdf$authorized_date)))
colnames(wdf_na_df) <- c("is.na?", "count")

# there are cases where date.last.saved - date.authorized is very negative
# some show date.authorized as epoch… discarding those for now
wdf_neg_time_diff <- data.frame(table(wdf$time_diff < 0))
colnames(wdf_neg_time_diff) <- c("negative time diff", "count")

# working set excludes NA authorized dates and negative time diffs
# FIX: use >= 0 so zero-length (saved exactly at authorization) accounts are
# kept, matching the "negative time diffs" exclusion stated above; the old
# `> 0` silently dropped them as well
working_set <- wdf[!is.na(wdf$authorized_date) & wdf$time_diff >= 0, ]
working_set$time_group_quarters <- as.factor(working_set$time_group_quarters)
working_set$time_group_halves <- as.factor(working_set$time_group_halves)

# plots: longevity in days, grouped by half-year of authorization
ggplot(data=working_set, aes(x=time_group_halves, y=time_diff_int/60/60/24)) +
    geom_point() + geom_boxplot() + opts(axis.text.x=theme_text(angle=-90))

# one-row summary (min/quantiles/mean/median/max, in seconds) of df[[colname]];
# used as the ddply worker below and for the csv export
summary_function <- function(df, colname) {
    column <- df[[colname]]
    return(
        data.frame(
            time_diff_min_seconds = min(column),
            time_diff_q2_seconds = as.numeric(quantile(column, 0.2)),
            time_diff_mean_seconds = mean(column),
            time_diff_median_seconds = median(column),
            time_diff_q8_seconds = as.numeric(quantile(column, 0.8)),
            time_diff_max_seconds = max(column)
        ))
}


# without summarizing by account
by_quarter <- ddply(working_set, .(time_group_quarters), summary_function, "time_diff_int")
by_half_year <- ddply(working_set, .(time_group_halves), summary_function, "time_diff_int")
by_year <- ddply(working_set, .(authorized_date_year), summary_function, "time_diff_int")

write.csv(by_half_year, file="longevity_summary_by_half_year.csv")

# let's examine by accounts. Do some account have much higher longevity?
# what's the count by account? we should remove the ones with low counts…
by_account <- ddply(working_set, .(public_code), nrow)
colnames(by_account) <- c("public_code", "count")

by_account_quarter <- ddply(working_set, .(public_code, time_group_quarters), summary_function, "time_diff_int")
by_account_quarter <- join(by_account, by_account_quarter)

ggplot(data=by_account_quarter, aes(x=count, y=time_diff_median_seconds/60/60)) +
    geom_point() + coord_trans(x="log10") + opts(axis.text.x=theme_text(angle=-90))

ggplot(data=by_account_quarter, aes(x=count, y=time_diff_mean_seconds/60/60)) +
    geom_point() + geom_boxplot(aes(group=round_any(log10(count), 1))) +
    opts(axis.text.x=theme_text(angle=-90))

# huge range for accounts who use this once, pretty steady for people who use this more

# group=round_any(log10(by_account_quarter$count), 1)
# with(working_set, plot(time_diff ~ authorized_date))
# xyplot(time_diff ~ authorized_date, data=working_set, scales="free")
#
# ggplot(data=working_set, aes(x=authorized_date, y=time_diff, group=round_a)) + geom_boxplot()


# graveyard
# wdf$authorized_date_group <- apply(wdf$authorized_date_lt, get_date_group_2)
23 changes: 23 additions & 0 deletions SF_2011/Benetech/python/common_words.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Build a {word: count} map from `sort | uniq -c` style lines ("  COUNT WORD")
# and show the 50 most frequent words.
import numpy

# NOTE(review): `body` is not defined in this file -- presumably an iterable
# of lines from with-summaries.csv after `tr " " "\n" | sort | uniq -c`;
# confirm how it is supplied before running non-interactively.
word_to_count = {}
for line in body:
    count_and_word = line.lstrip().rstrip().split(" ")
    # FIX: the count was kept as a string, so argsort() below compared counts
    # lexicographically ("9" > "10") and corrupted the ranking; parse it.
    count = int(count_and_word[0])
    if len(count_and_word) == 2:
        word = count_and_word[1]
    else:
        word = ""  # a count with no trailing word (e.g. blank token)
    word_to_count[word] = count

# 50 most common words, least- to most-frequent. list() keeps keys/values in
# matching order and works on both Python 2 lists and Python 3 dict views.
# (removed a bare `word_to_count.values` expression here -- it was a no-op)
numpy.array(list(word_to_count.keys()))[numpy.argsort(list(word_to_count.values()))][-50:]
# Output of a previous run (pre-fix, lexicographic count sort):
# array(['formed', 'Thanpyuzayart', 'ya', 'fruit', '20', 'following',
#        'Division', 'Time)', 'what', 'Operation', '2007', 'endure', 'been',
#        'leader', 'most', 'demanded', 'participation', 'physical', 'arrest',
#        'should', 'tried', 'Kaw', 'when', 'as', 'Nam', '2011', 'taking',
#        'place', 'sent', 'Zaw', 'Tun', 'over', 'Namkham', 'Tin', 'health',
#        'outside', 'I', 'While', 'Moe', 'Lay', 'prisons', 'Three',
#        'further', 'reported', '(First', 'according', "don't", 'If', 'rice',
#        'and'],
#       dtype='|S78')
37 changes: 37 additions & 0 deletions SF_2011/Benetech/python/parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# parses the files into (file_name, public_id, bulletin_id)
# to be used after
# wget -l 1 -m -c "https://martus.ceu.hu/servlet/DoSearch?typeOfSearch=quickSearchAll"
import re
import csv
import os

out_file = file("url_public_id.csv", "w")

for file_name in os.listdir("."):
if (re.compile("^FoundBulletin").match(file_name)):
id = file_name.split("FoundBulletin?index=")[1].split("&")[0]
try:
body = file(file_name, "r").read()
title = body.split("<strong>")[1].split("</strong>")[0]
public_id = body.split("<!--Account Public Code = ")[1].split(" -->")[0]
bulletin_id = body.split("<!--Bulletin Local Id = ")[1].split(" -->")[0]
out_file.write("%s,%s,%s\n" % (title, public_id, bulletin_id))
except:
print "Cannot parse %s" % file_name

out_file.close()


for link in br.links(url_regex="FoundBulletin*"):
response = br.follow_link(link)
html_response = response.read()
title = html_response.split("<strong>")[1].split("</strong>")[0]
public_id = html_response.split("<!--Account Public Code = ")[1].split(" -->")[0]
bulletin_id = html_response.split("<!--Bulletin Local Id = ")[1].split(" -->")[0]
print (title, public_id, bulletin_id)
print(html_response)
out_file.write("%s,%s,%s\n" % (title, public_id, bulletin_id))


out_file.close

Loading

0 comments on commit 065914b

Please sign in to comment.