Skip to content
This repository has been archived by the owner on Mar 25, 2019. It is now read-only.

Commit

Permalink
Initial
Browse files Browse the repository at this point in the history
  • Loading branch information
Andrew Warner committed Jan 2, 2014
0 parents commit 0c86123
Show file tree
Hide file tree
Showing 82 changed files with 1,734 additions and 0 deletions.
19 changes: 19 additions & 0 deletions .gitignore
@@ -0,0 +1,19 @@
# See https://help.github.com/articles/ignoring-files for more about ignoring files.
#
# If you find yourself ignoring temporary files generated by your text editor
# or operating system, you probably want to add a global ignore instead:
# git config --global core.excludesfile '~/.gitignore_global'

# Ignore bundler config.
/.bundle

# Ignore the default SQLite database.
/db/*.sqlite3
/db/*.sqlite3-journal

# Ignore all logfiles and tempfiles.
/log/*.log
/tmp
vendor/all_urls.txt
vendor/domain_lookups.csv
.env
1 change: 1 addition & 0 deletions .rspec
@@ -0,0 +1 @@
--color
1 change: 1 addition & 0 deletions .ruby-gemset
@@ -0,0 +1 @@
link_scraper
1 change: 1 addition & 0 deletions .ruby-version
@@ -0,0 +1 @@
ruby-2.0.0-p195
35 changes: 35 additions & 0 deletions Gemfile
@@ -0,0 +1,35 @@
source 'https://rubygems.org'

# Gems available in every environment. Declaration order is kept as-is.
gem 'decent_exposure'
gem 'decent_generators'
gem 'dotenv-rails'
gem 'haml'
gem 'haml-rails'
gem 'librato-logreporter'
gem 'pg'
gem 'pry'
gem 'pry-rails'
gem 'twitter-bootstrap-rails'
gem 'addressable', require: 'addressable/uri'
gem 'coffee-rails', '~> 4.0.0'
gem 'jbuilder', '~> 1.2'
gem 'jquery-rails'
gem 'nokogiri'
gem 'rails'
gem 'sass-rails', '~> 4.0.0'
gem 'stringex'
gem 'turbolinks'
gem 'typhoeus'
gem 'uglifier', '>= 1.3.0'
gem 'whois'

# Gems shared by the test and development groups.
group :test, :development do
  gem 'factory_girl'
  gem 'fivemat'
  gem 'rspec-rails'
  gem 'rspec'
end

# Gems used only by the test group.
group :test do
  gem 'shoulda-matchers'
end
193 changes: 193 additions & 0 deletions Gemfile.lock
@@ -0,0 +1,193 @@
GEM
remote: https://rubygems.org/
specs:
actionmailer (4.0.2)
actionpack (= 4.0.2)
mail (~> 2.5.4)
actionpack (4.0.2)
activesupport (= 4.0.2)
builder (~> 3.1.0)
erubis (~> 2.7.0)
rack (~> 1.5.2)
rack-test (~> 0.6.2)
activemodel (4.0.2)
activesupport (= 4.0.2)
builder (~> 3.1.0)
activerecord (4.0.2)
activemodel (= 4.0.2)
activerecord-deprecated_finders (~> 1.0.2)
activesupport (= 4.0.2)
arel (~> 4.0.0)
activerecord-deprecated_finders (1.0.3)
activesupport (4.0.2)
i18n (~> 0.6, >= 0.6.4)
minitest (~> 4.2)
multi_json (~> 1.3)
thread_safe (~> 0.1)
tzinfo (~> 0.3.37)
addressable (2.3.5)
arel (4.0.1)
atomic (1.1.14)
builder (3.1.4)
coderay (1.1.0)
coffee-rails (4.0.1)
coffee-script (>= 2.2.0)
railties (>= 4.0.0, < 5.0)
coffee-script (2.2.0)
coffee-script-source
execjs
coffee-script-source (1.6.3)
decent_exposure (2.3.0)
decent_generators (0.0.1)
rails (~> 4.0.0)
diff-lcs (1.2.5)
dotenv (0.9.0)
dotenv-rails (0.9.0)
dotenv (= 0.9.0)
erubis (2.7.0)
ethon (0.6.2)
ffi (>= 1.3.0)
mime-types (~> 1.18)
execjs (2.0.2)
factory_girl (4.3.0)
activesupport (>= 3.0.0)
ffi (1.9.3)
fivemat (1.2.1)
haml (4.0.4)
tilt
haml-rails (0.5.3)
actionpack (>= 4.0.1)
activesupport (>= 4.0.1)
haml (>= 3.1, < 5.0)
railties (>= 4.0.1)
hike (1.2.3)
i18n (0.6.9)
jbuilder (1.5.3)
activesupport (>= 3.0.0)
multi_json (>= 1.2.0)
jquery-rails (3.0.4)
railties (>= 3.0, < 5.0)
thor (>= 0.14, < 2.0)
json (1.8.1)
librato-logreporter (0.2.1)
mail (2.5.4)
mime-types (~> 1.16)
treetop (~> 1.4.8)
method_source (0.8.2)
mime-types (1.25.1)
mini_portile (0.5.2)
minitest (4.7.5)
multi_json (1.8.2)
nokogiri (1.6.1)
mini_portile (~> 0.5.0)
pg (0.17.1)
polyglot (0.3.3)
pry (0.9.12.4)
coderay (~> 1.0)
method_source (~> 0.8)
slop (~> 3.4)
pry-rails (0.3.2)
pry (>= 0.9.10)
rack (1.5.2)
rack-test (0.6.2)
rack (>= 1.0)
rails (4.0.2)
actionmailer (= 4.0.2)
actionpack (= 4.0.2)
activerecord (= 4.0.2)
activesupport (= 4.0.2)
bundler (>= 1.3.0, < 2.0)
railties (= 4.0.2)
sprockets-rails (~> 2.0.0)
railties (4.0.2)
actionpack (= 4.0.2)
activesupport (= 4.0.2)
rake (>= 0.8.7)
thor (>= 0.18.1, < 2.0)
rake (10.1.1)
rspec (2.14.1)
rspec-core (~> 2.14.0)
rspec-expectations (~> 2.14.0)
rspec-mocks (~> 2.14.0)
rspec-core (2.14.7)
rspec-expectations (2.14.4)
diff-lcs (>= 1.1.3, < 2.0)
rspec-mocks (2.14.4)
rspec-rails (2.14.0)
actionpack (>= 3.0)
activesupport (>= 3.0)
railties (>= 3.0)
rspec-core (~> 2.14.0)
rspec-expectations (~> 2.14.0)
rspec-mocks (~> 2.14.0)
sass (3.2.13)
sass-rails (4.0.1)
railties (>= 4.0.0, < 5.0)
sass (>= 3.1.10)
sprockets-rails (~> 2.0.0)
shoulda-matchers (2.4.0)
activesupport (>= 3.0.0)
slop (3.4.7)
sprockets (2.10.1)
hike (~> 1.2)
multi_json (~> 1.0)
rack (~> 1.0)
tilt (~> 1.1, != 1.3.0)
sprockets-rails (2.0.1)
actionpack (>= 3.0)
activesupport (>= 3.0)
sprockets (~> 2.8)
stringex (2.1.2)
thor (0.18.1)
thread_safe (0.1.3)
atomic
tilt (1.4.1)
treetop (1.4.15)
polyglot
polyglot (>= 0.3.1)
turbolinks (2.1.0)
coffee-rails
twitter-bootstrap-rails (2.2.8)
actionpack (>= 3.1)
execjs
rails (>= 3.1)
railties (>= 3.1)
typhoeus (0.6.7)
ethon (~> 0.6.2)
tzinfo (0.3.38)
uglifier (2.4.0)
execjs (>= 0.3.0)
json (>= 1.8.0)
whois (3.4.2)

PLATFORMS
ruby

DEPENDENCIES
addressable
coffee-rails (~> 4.0.0)
decent_exposure
decent_generators
dotenv-rails
factory_girl
fivemat
haml
haml-rails
jbuilder (~> 1.2)
jquery-rails
librato-logreporter
nokogiri
pg
pry
pry-rails
rails
rspec
rspec-rails
sass-rails (~> 4.0.0)
shoulda-matchers
stringex
turbolinks
twitter-bootstrap-rails
typhoeus
uglifier (>= 1.3.0)
whois
1 change: 1 addition & 0 deletions Procfile
@@ -0,0 +1 @@
worker: bundle exec rake pages:work
26 changes: 26 additions & 0 deletions README.md
@@ -0,0 +1,26 @@
### Rap Genius Trackback Scraper

This is the tool we used to scrape 178k URLs in 15 minutes in order to find which pages were hosting potentially spammy Rap Genius links. Given a list of URLs to scrape, it creates aggregate information that identifies the spammiest sites for manual review.

For more details on the motivation and background for this repository, check out [the blog post on Rap Genius](http://news.rapgenius.com/Rap-genius-founders-rap-genius-is-back-on-google-lyrics).

### Setup

You can run the scrape process using a set of sample data in vendor/urls.txt. To get started:

```sh
$ bundle install && rake db:create db:migrate urls:import
$ gem install foreman
$ mkdir tmp
$ foreman start worker
```

Then, once all of the pages have been scraped (i.e., `Page.unscraped.count == 0`), generate the report:

```ruby
# from the console
Page.write_report!
```

### License
MIT
6 changes: 6 additions & 0 deletions Rakefile
@@ -0,0 +1,6 @@
# Add your own tasks in files placed in lib/tasks ending in .rake,
# for example lib/tasks/capistrano.rake, and they will automatically be available to Rake.

# Load the application definition (config/application.rb, resolved relative to
# this Rakefile), then ask the application to load its Rake tasks.
require File.expand_path('config/application', File.dirname(__FILE__))

TrackbackScraper::Application.load_tasks
Empty file added app/assets/images/.keep
Empty file.
17 changes: 17 additions & 0 deletions app/assets/javascripts/application.js
@@ -0,0 +1,17 @@
// This is a manifest file that'll be compiled into application.js, which will include all the files
// listed below.
//
// Any JavaScript/Coffee file within this directory, lib/assets/javascripts, vendor/assets/javascripts,
// or vendor/assets/javascripts of plugins, if any, can be referenced here using a relative path.
//
// It's not advisable to add code directly here, but if you do, it'll appear at the bottom of the
// compiled file.
//
// Read Sprockets README (https://github.com/sstephenson/sprockets#sprockets-directives) for details
// about supported directives.
//
//= require jquery
//= require jquery_ujs
//= require twitter/bootstrap
//= require turbolinks
//= require_tree .
3 changes: 3 additions & 0 deletions app/assets/javascripts/bootstrap.js.coffee
@@ -0,0 +1,3 @@
# Once jQuery reports the document ready, enable popovers and tooltips on
# links that opt in via `rel` and on elements carrying the marker classes.
jQuery ->
  popoverTargets = "a[rel~=popover], .has-popover"
  tooltipTargets = "a[rel~=tooltip], .has-tooltip"

  $(popoverTargets).popover()
  $(tooltipTargets).tooltip()
3 changes: 3 additions & 0 deletions app/assets/javascripts/static.js.coffee
@@ -0,0 +1,3 @@
# Place all the behaviors and hooks related to the matching controller here.
# All this logic will automatically be available in application.js.
# You can use CoffeeScript in this file: http://coffeescript.org/
13 changes: 13 additions & 0 deletions app/assets/stylesheets/application.css
@@ -0,0 +1,13 @@
/*
* This is a manifest file that'll be compiled into application.css, which will include all the files
* listed below.
*
* Any CSS and SCSS file within this directory, lib/assets/stylesheets, vendor/assets/stylesheets,
* or vendor/assets/stylesheets of plugins, if any, can be referenced here using a relative path.
*
* You're free to add application-wide styles to this file and they'll appear at the top of the
* compiled file, but it's generally better to create a new file per style scope.
*
*= require_self
*= require_tree .
*/
7 changes: 7 additions & 0 deletions app/assets/stylesheets/bootstrap_and_overrides.css
@@ -0,0 +1,7 @@
/*
=require twitter-bootstrap-static/bootstrap
Use Font Awesome icons (default)
To use Glyphicons sprites instead of Font Awesome, replace with "require twitter-bootstrap-static/sprites"
=require twitter-bootstrap-static/fontawesome
*/
2 changes: 2 additions & 0 deletions app/assets/stylesheets/global.sass
@@ -0,0 +1,2 @@
.container-fluid
margin-top: 50px
3 changes: 3 additions & 0 deletions app/assets/stylesheets/static.css.scss
@@ -0,0 +1,3 @@
// Place all the styles related to the static controller here.
// They will automatically be included in application.css.
// You can use Sass (SCSS) here: http://sass-lang.com/
5 changes: 5 additions & 0 deletions app/controllers/application_controller.rb
@@ -0,0 +1,5 @@
# Root controller class for the application.
class ApplicationController < ActionController::Base
  # Guard against CSRF: a request that fails verification raises an exception.
  # API-style endpoints may want `with: :null_session` instead.
  protect_from_forgery(with: :exception)
end
Empty file added app/controllers/concerns/.keep
Empty file.
2 changes: 2 additions & 0 deletions app/controllers/static_controller.rb
@@ -0,0 +1,2 @@
# Controller with no actions of its own; everything it has comes from
# ApplicationController.
class StaticController < ApplicationController; end
2 changes: 2 additions & 0 deletions app/helpers/application_helper.rb
@@ -0,0 +1,2 @@
# Application-wide helper module; it currently defines no methods.
module ApplicationHelper; end
2 changes: 2 additions & 0 deletions app/helpers/static_helper.rb
@@ -0,0 +1,2 @@
# Helper module paired with the static controller; it currently defines no methods.
module StaticHelper; end
12 changes: 12 additions & 0 deletions app/helpers/string_helper.rb
@@ -0,0 +1,12 @@
# String utilities, callable either as StringHelper.coerce_to_utf8(...) or as
# mixed-in instance methods (via `extend self`).
module StringHelper
  extend self

  # Returns a copy of +input+ that is tagged as UTF-8 and guaranteed to be
  # validly encoded. The argument itself is never modified.
  #
  # If the bytes already form valid UTF-8 they are returned unchanged.
  # Otherwise only the invalid byte sequences are replaced with U+FFFD; valid
  # multibyte characters elsewhere in the string are preserved. (Previously
  # the fallback reinterpreted the whole string as BINARY, which turned every
  # non-ASCII byte -- including valid characters -- into a replacement char.)
  #
  # @param input [String] bytes in any (possibly mislabelled) encoding
  # @return [String] a new, valid UTF-8 string
  def coerce_to_utf8(input)
    output = input.dup.force_encoding("UTF-8")

    return output if output.valid_encoding?

    # Round-trip through UTF-16BE rather than encoding UTF-8 -> UTF-8: a
    # same-encoding transcode may be skipped entirely on older Rubies (the
    # project pins 2.0.0, which also lacks String#scrub), leaving the invalid
    # bytes in place. UTF-16BE is used so no byte-order mark is introduced.
    output
      .encode("UTF-16BE", invalid: :replace, undef: :replace)
      .encode("UTF-8")
  end
end
Empty file added app/mailers/.keep
Empty file.

0 comments on commit 0c86123

Please sign in to comment.