Add Dockerfile for Flask app #9

Open: wants to merge 24 commits into base: master

Changes from all commits (24 total, all by lockefox on Mar 18, 2018):

b8bc321  remaping top-level of project for packaging
55faa83  adding tests/ and __init__.py for packaging
85f180a  adding requirements and pathing MANIFEST.in
f1298ae  adding requirements for get_process_data
d525a3a  adding CONTRIBUTORS
f9e110d  PEP review: app.py
6539a21  PEP review: helpers.py
bb2b2c6  PEP review: make_map.py
669c7fe  PEP review mongo_ip and mongo_query_results
2702c6b  cleaning up pandas_table and tests.py. Added pandas_table to scripts
cd68d62  PEP review of webserver_get.py
942f80f  moving notebooks into their own folder
65a80fc  PEP review get_process_data
9cf740a  adding gunicorn launcher helpers
11acc9d  writing a generic Dockerfile
642782d  fixing ARGS->ARG
8fa3b2c  patching setup issues and improving requirements list
838dca8  bugfixing boto requirement
2e52f68  fixing launcher namespace
6847fcd  fixing namespaces
7b4a6a1  fixing namespace and global PEP notes
a8fcf68  adding unidecode to requirements
2186da9  fixing namespace and docker recipe
7a16a5b  removing troublesome configs from dockerfile

7 changes: 7 additions & 0 deletions .coveragerc
@@ -0,0 +1,7 @@
[report]
omit =

[paths]
source =
web
get_process_data
3 changes: 3 additions & 0 deletions CONTRIBUTORS
@@ -0,0 +1,3 @@
Zachary A Estela (@N2ITN)
(@Carrie0302)
John Purcell (@lockefox)
23 changes: 23 additions & 0 deletions Dockerfile
@@ -0,0 +1,23 @@
FROM revolutionsystems/python:3.6.3-wee-optimized-lto

# Default starting args
ARG PORT=5000
ENV PORT=$PORT
ENV GUNICORN_WORKERS=16
ENV GUNICORN_BIND=0.0.0.0:$PORT
ENV GUNICORN_TIMEOUT=28
# ENV GUNICORN_PRELOAD_APP=true
# ENV GUNICORN_WORKER_CLASS=gaiohttp

# Build project
COPY "" /opt/fakenews/
RUN pip install /opt/fakenews
RUN pip install gunicorn

# Setup filepaths
WORKDIR /opt/fakenews/web
RUN generate_gunicorn_conf
EXPOSE $PORT

# Start the party
ENTRYPOINT gunicorn --config gunicorn.conf app:APP
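
For context, gunicorn config files are ordinary Python modules, so a plausible sketch of what generate_gunicorn_conf might emit from the GUNICORN_* variables above looks like this (the real generator lives in the gunicorn launcher helpers added in this PR and may differ):

# gunicorn.conf: hypothetical output of generate_gunicorn_conf, mapping
# the GUNICORN_* environment variables onto gunicorn settings
import os

workers = int(os.environ.get('GUNICORN_WORKERS', 16))
bind = os.environ.get('GUNICORN_BIND', '0.0.0.0:5000')
timeout = int(os.environ.get('GUNICORN_TIMEOUT', 28))

Note the split between build time and run time: PORT is a build ARG re-exported as an ENV, so it can be set with docker build --build-arg PORT=8080 ., while the GUNICORN_* values can still be overridden at run time with docker run -e GUNICORN_WORKERS=4.
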
3 changes: 3 additions & 0 deletions MANIFEST.in
@@ -0,0 +1,3 @@
include README.rst
include LICENSE
recursive-include get_process_data/opensources *
46 changes: 26 additions & 20 deletions README.md → README.rst
@@ -1,44 +1,50 @@
-# Fake News Detector
+==================
+Fake News Detector
+==================

In an era increasingly defined by the proliferation of misinformation and polarized politics, it's important for internet users to have context for what's on their screen. This microservice uses natural language processing and deep learning to analyze patterns of bias on any news website in real time. Each time a URL is submitted, dozens of the most recent articles are collected and analyzed for a variety of factors, from political bias to journalistic accuracy.

-# How it works
+How it works
+============

-## Data Collection
----
+Data Collection
+---------------

-[OpenSources](http://www.opensources.co/) maintains a downloadable database of news sites with tags related to journalistic accuracy.
+`OpenSources`_ maintains a downloadable database of news sites with tags related to journalistic accuracy.

-[Media Bias Fact Check](https://mediabiasfactcheck.com/) maintains an online directory of news sites, categorized by the political bias and accuracy.
+`Media Bias Fact Check`_ maintains an online directory of news sites, categorized by political bias and accuracy.

-Using a customized fork of the excellent [Newspaper](https://github.com/codelucas/newspaper) library this project spiders ~3000 labelled websites for new articles to and stores them by their bias tag in MongoDB. Article texts are minmally preprocessed with unicode cleaning.
+Using a customized fork of the excellent `Newspaper`_ library, this project spiders ~3000 labelled websites for new articles and stores them by their bias tag in MongoDB. Article texts are minimally preprocessed with unicode cleaning.
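
As a rough sketch of the collection step described above: the stock Newspaper library plus pymongo and unidecode can reproduce the basic spider-and-store loop. The project itself uses a customized Newspaper fork and its own mongo_driver wrapper, so every name below is illustrative:

# Illustrative sketch only: the project uses a customized Newspaper fork
# and its own mongo_driver module; stock newspaper + pymongo shown here.
import newspaper
from pymongo import MongoClient
from unidecode import unidecode  # unicode cleaning, as in requirements

db = MongoClient()['fake_news']

def crawl(url, bias_tag):
    """Spider one labelled source and store its articles by bias tag."""
    paper = newspaper.build(url, memoize_articles=False)
    for article in paper.articles:
        article.download()
        article.parse()
        db.articles.insert_one({
            'url': article.url,
            'text': unidecode(article.text),  # minimal unicode cleanup
            'flags': [bias_tag],
        })
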
-## Modeling
----
+Modeling
+--------

Using the collected data, a TFIDF vectorizer is fitted on the article collection. A custom-built convolutional neural network is trained in a multi-label classification scheme using a binary crossentropy loss function with a sigmoid output layer. The model is deployed to AWS Lambda.
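
That description maps onto a fairly standard Keras recipe; a minimal sketch, where the layer sizes, vocabulary size, and label count are assumptions rather than the project's actual architecture:

# Hedged sketch of the multi-label CNN described above; sizes are made up.
from keras import layers, models

VOCAB_SIZE = 20000   # TFIDF feature dimension (assumed)
NUM_LABELS = 12      # number of bias/accuracy tags (assumed)

model = models.Sequential([
    layers.Reshape((VOCAB_SIZE, 1), input_shape=(VOCAB_SIZE,)),
    layers.Conv1D(64, 5, activation='relu'),
    layers.GlobalMaxPooling1D(),
    layers.Dense(64, activation='relu'),
    # sigmoid output: each label is an independent yes/no decision
    layers.Dense(NUM_LABELS, activation='sigmoid'),
])
# binary crossentropy scores each of the NUM_LABELS outputs independently,
# which is what makes this multi-label rather than multi-class
model.compile(optimizer='adam', loss='binary_crossentropy')
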
-## Deployment
----
+Deployment
+----------

The website is published via Flask. After a user enters a news site URL, the webserver scans the site for the 150 most recent articles and gathers their URLs. Asynchronously, the text of each URL is downloaded using AWS Lambda. The article text is then sent to another AWS Lambda function holding the trained neural network model. Results are plotted via matplotlib and rendered in the webpage.
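
A hedged sketch of that fan-out, assuming the Lambda functions are invoked through boto3; the function names and payload shapes below are invented for illustration:

# Illustrative only: Lambda function names and payload fields are assumed.
import json
from concurrent.futures import ThreadPoolExecutor

import boto3

lam = boto3.client('lambda')

def fetch_text(url):
    # one synchronous Lambda call that downloads/extracts an article
    resp = lam.invoke(
        FunctionName='fetch-article-text',  # hypothetical name
        Payload=json.dumps({'url': url}).encode(),
    )
    return json.loads(resp['Payload'].read())

def score_articles(urls):
    # fan the downloads out across threads, then score the batch
    with ThreadPoolExecutor(max_workers=32) as pool:
        texts = list(pool.map(fetch_text, urls))
    resp = lam.invoke(
        FunctionName='score-article-batch',  # hypothetical name
        Payload=json.dumps({'texts': texts}).encode(),
    )
    return json.loads(resp['Payload'].read())
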
-## Deeper
----
+Deeper
+------

For a much more detailed discussion of the project, please see this living presentation on Google Slides: https://docs.google.com/presentation/d/1wwnTx0hKB2MJXGPBHbAzElQnCPKH4UFicfnrzsxQG2g/edit?usp=sharing

-## Open Source
+Open Source
+-----------

This is GNU GPL licensed, so anyone can use it as long as it remains open source.
Anyone who is interested in contributing is welcome to head over to the Data For Democracy repo, where issues are being tracked:
https://github.com/Data4Democracy/are-you-fake-news

-## Contact
+Contact
+-------

aracel.io

+.. _`OpenSources`: http://www.opensources.co/
+.. _`Media Bias Fact Check`: https://mediabiasfactcheck.com/
+.. _`Newspaper`: https://github.com/codelucas/newspaper
Empty file added get_process_data/__init__.py
Empty file.
9 changes: 4 additions & 5 deletions get_process_data/helpers.py
@@ -1,14 +1,14 @@
"""
This module contains some auxiliary functions

The most important include:
* lemmaTokenizer: performs NLP preprocessing
* AddDict: allows the values of two dictionaries to be added by matching keys
* timeit: prints function execution time to stdout


TODO: This module is duplicated in 3 different forms:
    A version exists in ./web, an identical version in ./get_process_data
    This was to avoid local imports, but is dangerous
    A third version exists as helpers_nlp and is stripped down for AWS Lambda
@@ -31,7 +31,6 @@
def j_writer(f, silent=False):

    def wrapper(*args, **kwargs):
-
        res = f(*args)
        if not res: return
        _j, name = res
@@ -47,13 +46,13 @@ def wrapper(*args, **kwargs):


class addDict(dict):
-    ''' provides an 'add' method to dictionaries '''
+    """provides an 'add' method to dictionaries"""

    def __iadd__(self, b):
        return self + b

    def __add__(self, b):
-        ''' magic method override'''
+        """magic method override"""
        # Only works if b is a dictionary
        if isinstance(b, dict):
            a_key = set(self.keys())
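
For reference, the module docstring above says addDict lets the values of two dictionaries be added by matching keys; a quick hedged illustration (the exact merge semantics of __add__ are not visible in this hunk):

# Hedged illustration of addDict; merge semantics assumed from the docstring.
flag_counts = addDict({'bias': 3, 'satire': 1})
flag_counts += {'bias': 2, 'clickbait': 5}
# expected: {'bias': 5, 'satire': 1, 'clickbait': 5}
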
8 changes: 4 additions & 4 deletions get_process_data/join_source_lists.py
@@ -3,19 +3,19 @@
from mediabiasfactcheck.com into one Mongo table, merging similar tags.
"""
import json
-import mongo_driver
from pprint import pprint
-from helpers import addDict

+from helpers import addDict
+import mongo_driver

def transform_open_format(x):
-    ''' Original format:
+    """Original format:
    (u'NutritionalAnarchy.com',
     {u'2nd type': u'',
      u'3rd type': u'',
      u'Source Notes (things to know?)': u'',
      u'type': u'unreliable'})
-    '''
+    """

    urls = mongo_driver.get_url('opensources')
    if x[0] in urls:
9 changes: 4 additions & 5 deletions get_process_data/labels_MBFC.py
@@ -1,5 +1,5 @@
"""
Scrapes the website bias labels from mediabiasfactcheck.com
+and puts the results into a mongodb table
"""

@@ -10,6 +10,7 @@
from time import sleep
import string
import httplib2
+
import requests
from bs4 import BeautifulSoup, SoupStrainer
@@ -65,7 +66,6 @@ def get_page(self, link):
        return page

    def get_tag(self):
-
        try:
            tag_ = BeautifulSoup(requests.get(self.page).text, 'html.parser').find_all(
                class_='entry-content')
@@ -104,7 +104,6 @@ def clean(text_, key):
        pprint(results)

    def export_results(self):
-
        self.results.update({'Reference': self.page, 'Category': accumulator.cat})
        print(self.results)
@@ -125,5 +124,5 @@ def cat_json():
'''
TODO:
    Add threadpool
    Make better variables and less hacky error handling
'''
5 changes: 2 additions & 3 deletions get_process_data/lemmatize_articles.py
@@ -1,19 +1,18 @@
""" This cleans all the scraped articles """

+import json
+
from helpers import LemmaTokenizer
import mongo_driver
-import json


def lemma_wrapper(dict_):
-
    dict_['article'] = LemmaTokenizer(dict_['text'])
    dict_.pop('text')
    return dict_


def flags_articles_gen():
-
    for i, _ in enumerate(mongo_driver.get_all('articles')):
        yield _
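
lemma_wrapper above calls LemmaTokenizer from helpers on raw article text. The helper itself is not part of this diff, but a common shape for such a tokenizer (every detail below is an assumption; NLTK's WordNet lemmatizer is one typical choice) is:

# Assumed shape of helpers.LemmaTokenizer; the real implementation is
# not shown in this PR.
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

def LemmaTokenizer(text):
    wnl = WordNetLemmatizer()
    return [wnl.lemmatize(tok) for tok in word_tokenize(text.lower())]
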
24 changes: 13 additions & 11 deletions get_process_data/mongo_driver.py
@@ -1,5 +1,5 @@
"""
Mongo handler used in other code in this directory.
+Contains a set of wrapper functions for interacting with mongo efficiently and consistently.
"""

@@ -26,24 +26,26 @@ def get_url(table_name):


def flag_counts():
+    # pylint: disable=C4001
+    # keep mongo queries copy/pastable to mongo
    db_out = list(
        db.articles.aggregate([{
-            '$unwind': "$flags"
+            "$unwind": "$flags"
        }, {
-            '$group': {
-                '_id': {
-                    '$toLower': '$flags'
+            "$group": {
+                "_id": {
+                    "$toLower": "$flags"
                },
-                'count': {
-                    '$sum': 1
+                "count": {
+                    "$sum": 1
                }
            }
        }, {
-            '$sort': {
-                'count': -1
+            "$sort": {
+                "count": -1
            }
        }, {
-            '$limit': 100
+            "$limit": 100
        }]))

    d = dict()
11 changes: 7 additions & 4 deletions get_process_data/webcrawler.py
@@ -14,11 +14,13 @@
import os
from multiprocessing.dummy import Pool
from time import sleep
-import mongo_driver

import newspaper
from fake_useragent import UserAgent
import requests

+import mongo_driver
+
os.environ['TLDEXTRACT_CACHE'] = '~/tldextract.cache'

config = newspaper.Config()
@@ -121,7 +123,6 @@ def go(source):


def threadpool(batch):
-
    with Pool(batch_size) as pool:

        x = pool.imap_unordered(go, batch)
@@ -164,9 +165,11 @@ def get_batch(batch_size):
#             'size': mongo_driver.db['all_sources'].count()
#         }
#     }], allowDiskUse=True)
+    # pylint: disable=C4001
+    # keep mongo queries copy/pastable to mongo
    news_sources = mongo_driver.db['all_sources'].find({
-        'Category': {
-            "$in": ['extreme left', 'satire', 'hate', 'pro-science', 'very high', 'low', 'right']
+        "Category": {
+            "$in": ["extreme left", "satire", "hate", "pro-science", "very high", "low", "right"]
        }
    })
# news_sources = list(mongo_driver.db['all_sources'].find())
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.