# Stats

Connect to the mondo database.

In [1]:
import pymongo

mongo_client = pymongo.MongoClient('mongodb://localhost:27017')

mongo_client.drop_database('stats')

print("MongoDB:", mongo_client.server_info)

MongoDB: <bound method MongoClient.server_info of MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True)>


Create a stats database with a collection named `database` that contains the number of documents for each database's collections and how many attributes.

In [2]:
for database in mongo_client.list_database_names():
    if database not in ('admin','config','local','stats'):
        stats = { 'database': database }
        collections_stats = {}
        for collection in mongo_client[database].list_collection_names():
            collections_stats[collection] = {}
            collections_stats[collection]['records'] = mongo_client[database][collection].count_documents({})
            collections_stats[collection]['attributes'] = len(list(mongo_client[database][collection].find_one({}).keys()))-1
        stats['collections'] = collections_stats
        mongo_client['stats']['database'].insert_one(stats)

In the stats database create a new collection named `collection` that contains all the attributes for each database's collections.

In [3]:
for database in mongo_client.list_database_names():
    if database not in ('admin','config','local','stats'):
        for collection in mongo_client[database].list_collection_names():
            stats = { 
                'database': database,
                'collection': collection
            }
            attributes = mongo_client[database][collection].find_one({}).keys()
            stats['attributes'] = list(attributes)
            stats['attributes'].remove('_id')
            mongo_client['stats']['collection'].insert_one(stats)

For each attribute in all database's collections calculate statistics measures.

In [4]:
import csv
import numpy as np

def get_all_values_from_csv(file):
    values = {}

    with open(file) as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            for attribute,value in row.items():
                if attribute != 'sampleId' and value != 'NA':
                    values.setdefault(attribute, []).append(float(value))
    return values

def insert_values_on_mongo(database, collection, values):
    documents = []
    for a,v in values.items():
        document = {
            'database': database,
            'collection': collection,
            'attribute': a,
            'values': v,
            'max_value': np.amax(v),
            'min_value': np.amin(v),
            'mean_value': np.mean(v),
            'median_value': np.median(v)
        }
        documents.append(document)
    mongo_client['stats']['statistics'].insert_many(documents)
    
# skin_cancer_db
print("Skin Cancer Database:")

print("\tCNA values ...", end="\r")
cna_values = get_all_values_from_csv('data/skin/cna.csv')
insert_values_on_mongo('skin_cancer_db', 'cna', cna_values)
print("\tCNA values (OK);")

print("\tMethylation values ...", end="\r")
methylation_values = get_all_values_from_csv('data/skin/methylation_hm450.csv')
insert_values_on_mongo('skin_cancer_db', 'methylation', methylation_values)
print("\tMethylation values (OK);")

print("\tRNA values ...", end="\r")
rna_values = get_all_values_from_csv('data/skin/rnaZscore.csv')
insert_values_on_mongo('skin_cancer_db', 'rna', rna_values)
print("\tRNA values (OK).")

# stomach_cancer_db
print("Stomach Cancer Database:")

print("\tCNA values ...", end="\r")
cna_values = get_all_values_from_csv('data/stomach/cna.csv')
insert_values_on_mongo('stomach_cancer_db', 'cna', cna_values)
print("\tCNA values (OK);")

print("\tMethylation values ...", end="\r")
methylation_values = get_all_values_from_csv('data/stomach/methylation_hm450.csv')
insert_values_on_mongo('stomach_cancer_db', 'methylation', methylation_values)
print("\tMethylation values (OK);")

print("\tRNA values ...", end="\r")
rna_values = get_all_values_from_csv('data/stomach/rnaZscore.csv')
insert_values_on_mongo('stomach_cancer_db', 'rna', rna_values)
print("\tRNA values (OK).")

# thyroid_cancer_db
print("Thyroid Cancer Database:")

print("\tCNA values ...", end="\r")
cna_values = get_all_values_from_csv('data/thyroid/cna.csv')
insert_values_on_mongo('thyroid_cancer_db', 'cna', cna_values)
print("\tCNA values (OK);")

print("\tMethylation values ...", end="\r")
methylation_values = get_all_values_from_csv('data/thyroid/methylation_hm450.csv')
insert_values_on_mongo('thyroid_cancer_db', 'methylation', methylation_values)
print("\tMethylation values (OK);")

print("\tRNA values ...", end="\r")
rna_values = get_all_values_from_csv('data/thyroid/rnaZscore.csv')
insert_values_on_mongo('thyroid_cancer_db', 'rna', rna_values)
print("\tRNA values (OK).")

Skin Cancer Database:
	CNA values ...
	CNA values (OK);
	Methylation values ...
	Methylation values (OK);
	RNA values ...
	RNA values (OK).
Stomach Cancer Database:
	CNA values (OK);
	Methylation values (OK);
	RNA values (OK).
Thyroid Cancer Database:
	CNA values (OK);
	Methylation values (OK);
	RNA values (OK).
