forked from datakind/Datadives
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
11 changed files
with
356 additions
and
84 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,4 +2,13 @@ | |
.RData | ||
.RHistory | ||
*.tex | ||
*.log | ||
*.log | ||
*.aux | ||
*.out | ||
*.nav | ||
*.pdfsync | ||
*.snm | ||
*.synctex.gz | ||
*.toc | ||
*.swp |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
# Extract the English-language bulletin summaries from the `benetech` data
# frame (loaded elsewhere, e.g. by load_data.R) and write them, one summary
# per line with no header, to with-summaries.csv for shell-based word counting.
with_summaries <- subset(benetech, nchar(summary) > 0)
with_english_summaries <- subset(with_summaries, language == "en")
# write.csv() silently ignores col.names and always emits a header row;
# write.table() actually honors col.names = FALSE.
write.table(with_english_summaries$summary, "with-summaries.csv",
            sep = ",", col.names = FALSE, row.names = FALSE, quote = FALSE)

# then use
# cat with-summaries.csv | tr " " "\n" | sort | uniq -c
# (uniq -c, not plain uniq: common_words.py parses "count word" pairs)
# then invoke common_words.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
# count of accounts that were only active for 1 day | ||
# by organization / magic word group / server and grand total |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
# Average account longevity: examine the distribution of date.authorized vs.
# (date.last.saved - date.authorized), summarized by quarter / half-year /
# year and by account.

# loading in a small dataset rather than the whole thing for now
require("plyr")
require("lattice")
require("xtable")
require("ggplot2")  # was missing: ggplot()/geom_*/theme() below need it

source("load_data.R")

# 'working data frame'
wdf <- with(benetech, data.frame(
    public_code = public.code,
    authorized_date = date.authorized,
    last_saved_date = date.last.saved))

# getting time differences (difftime, plus an integer-seconds copy)
wdf$authorized_date_lt <- as.POSIXlt(wdf$authorized_date)
wdf$time_diff <- with(wdf, last_saved_date - authorized_date)
wdf$time_diff_int <- as.integer(wdf$time_diff)

# getting the month, year, quarter and halves.
# NB: POSIXlt$mon is 0-based (January = 0), so add 1 to get calendar months.
wdf <- transform(wdf,
    authorized_date_month = authorized_date_lt$mon + 1,
    authorized_date_year = authorized_date_lt$year + 1900)

# Quarter = 3-month group, 1..4. The previous ceiling(mon / 4) on the
# 0-based month produced a bogus "Q0" for January and 4-month-wide quarters.
wdf$time_group_quarters <- paste(wdf$authorized_date_year,
    "Q", ceiling(wdf$authorized_date_month / 3),
    sep="")

# Half = 6-month group, 1..2; likewise fixed for the 0-based month ("H0" bug).
wdf$time_group_halves <- paste(wdf$authorized_date_year,
    "H", ceiling(wdf$authorized_date_month / 6),
    sep="")

# there are NA's in date.authorized
wdf_na_df <- data.frame(table(is.na(wdf$authorized_date)))
colnames(wdf_na_df) <- c("is.na?", "count")

# there are cases where date.last.saved - date.authorized is very negative
# some show date.authorized as epoch... discarding those for now
wdf_neg_time_diff <- data.frame(table(wdf$time_diff < 0))
colnames(wdf_neg_time_diff) <- c("negative time diff", "count")

# working set excludes NA authorized dates and non-positive time diffs
# (note: time_diff > 0 also drops exactly-zero diffs, not just negatives)
working_set <- wdf[!is.na(wdf$authorized_date) & wdf$time_diff > 0, ]
working_set$time_group_quarters <- as.factor(working_set$time_group_quarters)
working_set$time_group_halves <- as.factor(working_set$time_group_halves)

# plots -- longevity in days per half-year cohort.
# opts()/theme_text() were removed from ggplot2; theme()/element_text()
# are the direct replacements.
ggplot(data=working_set, aes(x=time_group_halves, y=time_diff_int/60/60/24)) +
    geom_point() + geom_boxplot() + theme(axis.text.x=element_text(angle=-90))

# One-row summary (min / 20% / mean / median / 80% / max, in seconds) of a
# numeric column of df; used as the ddply() aggregation function below.
summary_function <- function(df, colname) {
    column <- df[[colname]]
    return(
        data.frame(
            time_diff_min_seconds = min(column),
            time_diff_q2_seconds = as.numeric(quantile(column, 0.2)),
            time_diff_mean_seconds = mean(column),
            time_diff_median_seconds = median(column),
            time_diff_q8_seconds = as.numeric(quantile(column, 0.8)),
            time_diff_max_seconds = max(column)
        ))
}

# without summarizing by account
by_quarter <- ddply(working_set, .(time_group_quarters), summary_function, "time_diff_int")
by_half_year <- ddply(working_set, .(time_group_halves), summary_function, "time_diff_int")
by_year <- ddply(working_set, .(authorized_date_year), summary_function, "time_diff_int")

write.csv(by_half_year, file="longevity_summary_by_half_year.csv")

# let's examine by accounts. Do some accounts have much higher longevity?
# what's the count by account? we should remove the ones with low counts...
by_account <- ddply(working_set, .(public_code), nrow)
colnames(by_account) <- c("public_code", "count")

by_account_quarter <- ddply(working_set, .(public_code, time_group_quarters), summary_function, "time_diff_int")
by_account_quarter <- join(by_account, by_account_quarter)

# median longevity (hours) vs. how often an account appears
ggplot(data=by_account_quarter, aes(x=count, y=time_diff_median_seconds/60/60)) +
    geom_point() + coord_trans(x="log10") + theme(axis.text.x=element_text(angle=-90))

# mean longevity (hours), boxed by order of magnitude of the account's count
ggplot(data=by_account_quarter, aes(x=count, y=time_diff_mean_seconds/60/60)) +
    geom_point() + geom_boxplot(aes(group=round_any(log10(count), 1))) +
    theme(axis.text.x=element_text(angle=-90))

# huge range for accounts who use this once, pretty steady for people who
# use this more

# graveyard -- earlier exploratory attempts, kept for reference:
# group=round_any(log10(by_account_quarter$count), 1)
# with(working_set, plot(time_diff ~ authorized_date))
# xyplot(time_diff ~ authorized_date, data=working_set, scales="free")
# ggplot(data=working_set, aes(x=authorized_date, y=time_diff, group=round_a)) + geom_boxplot()
# wdf$authorized_date_group <- apply(wdf$authorized_date_lt, get_date_group_2)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
# Build a word -> count table from `sort | uniq -c`-style lines and show the
# 50 most frequent words.
# NOTE(review): `body` (an iterable of "  <count> <word>" lines) must be
# defined before this snippet runs -- confirm against the caller.
import numpy

word_to_count = {}
for line in body:
    count_and_word = line.strip().split(" ")
    # The count must be numeric: the original kept it as a string, so the
    # argsort below ordered counts lexicographically ("9" > "10").
    count = int(count_and_word[0])
    # Lines without a second field (e.g. a bare count from a blank word)
    # collapse onto the empty-string key, as before.
    word = count_and_word[1] if len(count_and_word) == 2 else ""
    word_to_count[word] = count

# Top 50 words, ascending by count. keys()/values() are wrapped in list()
# so this also works on Python 3, where they are views. (A stray no-op
# statement `word_to_count.values` was removed.)
top_50 = numpy.array(list(word_to_count.keys()))[
    numpy.argsort(list(word_to_count.values()))][-50:]
print(top_50)
# sample output (from the original run, string-sorted):
# array(['formed', 'Thanpyuzayart', ..., 'If', 'rice', 'and'], dtype='|S78')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
# parses the files into (file_name, public_id, bulletin_id) | ||
# to be used after | ||
# wget -l 1 -m -c "https://martus.ceu.hu/servlet/DoSearch?typeOfSearch=quickSearchAll" | ||
import re | ||
import csv | ||
import os | ||
|
||
out_file = file("url_public_id.csv", "w") | ||
|
||
for file_name in os.listdir("."): | ||
if (re.compile("^FoundBulletin").match(file_name)): | ||
id = file_name.split("FoundBulletin?index=")[1].split("&")[0] | ||
try: | ||
body = file(file_name, "r").read() | ||
title = body.split("<strong>")[1].split("</strong>")[0] | ||
public_id = body.split("<!--Account Public Code = ")[1].split(" -->")[0] | ||
bulletin_id = body.split("<!--Bulletin Local Id = ")[1].split(" -->")[0] | ||
out_file.write("%s,%s,%s\n" % (title, public_id, bulletin_id)) | ||
except: | ||
print "Cannot parse %s" % file_name | ||
|
||
out_file.close() | ||
|
||
|
||
for link in br.links(url_regex="FoundBulletin*"): | ||
response = br.follow_link(link) | ||
html_response = response.read() | ||
title = html_response.split("<strong>")[1].split("</strong>")[0] | ||
public_id = html_response.split("<!--Account Public Code = ")[1].split(" -->")[0] | ||
bulletin_id = html_response.split("<!--Bulletin Local Id = ")[1].split(" -->")[0] | ||
print (title, public_id, bulletin_id) | ||
print(html_response) | ||
out_file.write("%s,%s,%s\n" % (title, public_id, bulletin_id)) | ||
|
||
|
||
out_file.close | ||
|
Oops, something went wrong.