# Indicators for Chronic Disease Surveillance

## Authors

Juan Luis Onieva Zafra

Jesús Gómez Sola

Paloma Domínguez Sánchez

## Abstract

In this project we present an analysis of the chronic disease indicator data provided on the data.gov portal at <https://catalog.data.gov/dataset/u-s-chronic-disease-indicators-cdi>, from which we downloaded the CSV file.

<img src="MMWR.png" width="600">

The objective of the task is to use Spark to run various queries and present their results in table and graph form. For this purpose, we work with two different APIs: RDDs and DataFrames.

Our work is divided into: 

- **Tests**:

    *__init__.py*

    *AnalysisTest.py*

    *ReadCSVTest.py*

- **CDI**:

    *ReadCSV.py*
    
    *Analysis.py*
        
    *cdi.py*
    
    
- LICENSE

- README.md

- requirements.txt

- setup.py


## CDI

#### ReadCSV.py

This module is in charge of reading the CSV file as an RDD or as a DataFrame. For this purpose it creates a Spark session (or Spark context) and uses it to read the CSV file whose path is passed as a parameter.

It defines two functions:


*def read_csv_with_data_frame(file_csv: str) -> DataFrame*

*def read_csv_with_rdd(file_csv: str) -> SparkContext*





In [11]:
import os
import sys

from py4j.protocol import Py4JError
from pyspark import RDD, SparkConf, SparkContext
from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
from pyspark.sql.utils import AnalysisException


def read_csv_with_data_frame(file_csv: str) -> DataFrame:
    """Load a CSV file into a Spark DataFrame.

    The first line of the file is treated as the header and the column types
    are inferred by Spark.

    Args:
        file_csv: Path of the CSV file to load.

    Returns:
        The DataFrame produced by Spark's CSV reader.

    Raises:
        AnalysisException: If loading fails with a ``Py4JError``. Spark may
            also raise it directly, e.g. with "Path does not exist".
    """
    spark_session = SparkSession \
        .builder \
        .getOrCreate()

    # Restrict the JVM-side "org" loggers to WARN and above.
    logger = spark_session._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.WARN)

    try:
        data_frame = spark_session\
            .read\
            .format("csv") \
            .options(header='true', inferschema='true')\
            .load(file_csv)
    except Py4JError as error:
        # Name the path that was requested so the failure can be located,
        # and keep the underlying error as the cause.
        raise AnalysisException('There is no csv file in:' + str(file_csv)) from error

    return data_frame


def read_csv_with_rdd(file_csv: str) -> RDD:
    """Open a CSV file as an RDD of raw text lines.

    The lines are returned untouched: the header line is still included and
    no field splitting is performed.

    Args:
        file_csv: Path of the CSV file to read.

    Returns:
        The RDD created by ``SparkContext.textFile`` for ``file_csv``.
    """
    spark_conf = SparkConf()
    # Reuse the active context when there is one: only a single SparkContext
    # may run per process, so constructing a second one fails (for instance
    # after read_csv_with_data_frame has already started a session).
    spark_context = SparkContext.getOrCreate(conf=spark_conf)

    # Restrict the JVM-side "org" loggers to WARN and above.
    logger = spark_context._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.WARN)

    rdd = spark_context.textFile(file_csv)
    # textFile is lazy; fetching the first line forces a read so that an
    # unreadable path fails here instead of later in the caller.
    rdd.first()
    return rdd


#### Analysis.py
This module defines several functions, each responsible for executing one of the queries:

" def get_data_frame_count_type_of_topic(data_frame: DataFrame) -> DataFrame " returns, for each topic (type of disease), the number of distinct questions.

" def get_data_frame_count_male_gender_by_topic(data_frame: DataFrame) -> DataFrame " returns, for each topic, the number of distinct records whose stratification is "Male".

" def get_data_frame_count_black_ethnicity_by_topic(data_frame: DataFrame) -> DataFrame " returns, for each topic, the number of distinct records whose stratification is "Black, non-Hispanic".


The last function is responsible for plotting the results of the previously defined functions:

def plot_type_of_topic(data_frame: DataFrame) -> None


In [13]:
import matplotlib.pyplot as plt
from pyspark.sql import DataFrame
import pandas


def get_data_frame_count_type_of_topic(data_frame: DataFrame) -> DataFrame:
    """Count the distinct questions of every topic.

    Args:
        data_frame: DataFrame with ``TopicID`` and ``Question`` columns.

    Returns:
        A DataFrame with one row per ``TopicID`` and a ``count`` column,
        sorted by ``TopicID``. The table is also printed.
    """
    # Each (topic, question) pair must be counted once only.
    unique_questions = data_frame.select("TopicID", "Question").distinct()
    questions_per_topic = unique_questions.groupBy("TopicID").count()
    result = questions_per_topic.sort("TopicID")

    print("The following table represent the number of the type of each topic")
    result.show()
    return result

def get_data_frame_count_male_gender_by_topic(data_frame: DataFrame) -> DataFrame:
    """Count, per topic, the distinct rows whose stratification is male.

    Args:
        data_frame: DataFrame with ``Stratification1`` and ``TopicID``
            columns.

    Returns:
        A DataFrame with one row per ``TopicID`` and a ``count`` column,
        sorted by ``TopicID``. The table is also printed.
    """
    is_male = data_frame["Stratification1"].contains("Male")
    # Duplicate rows are dropped before counting.
    male_rows = data_frame.filter(is_male).distinct()
    result = male_rows.groupBy("TopicID").count().sort("TopicID")

    print("The following table represent the number of men group by the topic: ")
    result.show()
    return result

def get_data_frame_count_black_ethnicity_by_topic(data_frame: DataFrame) -> DataFrame:
    """Count, per topic, the distinct rows stratified as "Black, non-Hispanic".

    Args:
        data_frame: DataFrame with ``Stratification1`` and ``TopicID``
            columns.

    Returns:
        A DataFrame with one row per ``TopicID`` and a ``count`` column,
        sorted by ``TopicID``. The table is also printed.
    """
    is_black = data_frame["Stratification1"].contains("Black, non-Hispanic")
    # Duplicate rows are dropped before counting.
    matching_rows = data_frame.filter(is_black).distinct()
    result = matching_rows.groupBy("TopicID").count().sort("TopicID")

    print("The following table represent the number of black ethnicity people group by the topic: ")
    result.show()
    return result


def plot_type_of_topic(data_frame: DataFrame) -> None:
    """Print a per-topic result table and show it as a bar chart.

    Args:
        data_frame: Spark DataFrame with a ``TopicID`` column, used for the
            x axis; presumably the output of one of the count queries, whose
            ``count`` column supplies the bar heights.
    """
    data_frame_pandas = data_frame.toPandas()
    print(data_frame_pandas)

    # Non-interactive mode: plt.show() blocks until the window is closed.
    plt.interactive(False)
    # Draw on an explicit axes so that exactly one figure is created;
    # DataFrame.plot opens its own figure when no axes is supplied.
    _, axes = plt.subplots()
    # ``x`` takes the column label, not the column's values.
    data_frame_pandas.plot(kind='bar', x='TopicID', ax=axes)
    plt.show()

#### cdi.py

This is the main module of the project, which is in charge of tying the other modules together.

We define a data_frame object for each query and for each of them we call the function "plot_type_of_topic" to represent the data.

In [16]:
# Input file: the CDI CSV export, given as a relative path.
file_csv = 'Chronic_Disease_Indicators_CDI.csv'
# Alternative input location, currently disabled:
#file_csv = 'data/cdi.csv'
# Load the CSV, count the distinct questions per topic, then plot that table.
data_frame = read_csv_with_data_frame(file_csv)
data_frame_count_type = get_data_frame_count_type_of_topic(data_frame)
plot_type_of_topic(data_frame_count_type)


AnalysisException: 'Path does not exist: file:/Users/Paloma/Documents/bitbucket/pycdipab2018/Chronic_Disease_Indicators_CDI.csv;'

## Tests

#### AnalysisTest.py

In [17]:
import unittest
from ReadCSV import read_csv_with_data_frame
from Analysis import get_data_frame_count_type_of_topic
from pyspark.sql.utils import AnalysisException


class MyTestCase(unittest.TestCase):
    """Checks get_data_frame_count_type_of_topic against the CSV fixtures in data/."""

    def setUp(self):
        # Both fixtures are reloaded before every test: the regular one and
        # the one named "pruebas-wrong-column.csv".
        self.data_frame = read_csv_with_data_frame('data/pruebas.csv')
        self.data_frame_wrong = read_csv_with_data_frame('data/pruebas-wrong-column.csv')

    def test_when_count_subtopic_data_frame_should_have_at_least_columns_with_topic_and_subtopic(self):
        # The query must be rejected for the wrong-column fixture.
        self.assertRaises(
            AnalysisException,
            get_data_frame_count_type_of_topic,
            self.data_frame_wrong,
        )

    def test_the_number_of_topic_must_be_correct(self):
        # One result row per topic is expected: three in the fixture.
        topic_counts = get_data_frame_count_type_of_topic(self.data_frame)
        self.assertEqual(3, topic_counts.count())

    def test_the_total_number_must_correspond_with_size_of_csv(self):
        # The per-topic counts must add up to six for the fixture.
        counts = get_data_frame_count_type_of_topic(self.data_frame).toPandas()['count']
        self.assertEqual(6, sum(counts))



if __name__ == '__main__':
    unittest.main()

ModuleNotFoundError: No module named 'ReadCSV'

#### ReadCSVTest.py

In [18]:
import unittest
from ReadCSV import read_csv_with_data_frame
from pyspark.sql.utils import AnalysisException


class MyTestCase(unittest.TestCase):
    """Checks read_csv_with_data_frame against the fixtures in data/."""

    def test_read_csv_from_data_frame_read_correctly(self):
        # The fixture is expected to load as twelve rows.
        row_count = read_csv_with_data_frame('data/pruebas.csv').count()
        self.assertEqual(12, row_count)

    def test_raise_exception_when_the_file_is_not_csv(self):
        self.assertRaises(
            AnalysisException,
            read_csv_with_data_frame,
            'data/pruebas.tsv',
        )

    def test_raise_exception_when_the_file_not_exist(self):
        self.assertRaises(
            AnalysisException,
            read_csv_with_data_frame,
            'data/no-file.tsv',
        )


if __name__ == '__main__':
    unittest.main()

ModuleNotFoundError: No module named 'ReadCSV'