# Pulling & Loading Data Correctly

## Pulling Data from the Server

In [1]:
import numpy as np
import pandas as pd 
import requests
import json

data = {
    'filters': 'taxnodes:Technology|Information Technology|Artificial Intelligence|Cognitive Science@@semantic-units:arXiv.org',
    'fields': 'concept-tagsConf,cdid,taxnodesConf,modified,authorsRaw,title',
    'sort': 'title_sort asc',
    'limit': 2000,
    'offset': 0
}

response = requests.post('https://aitopics.org/i2kweb/webapi/search', data=data, auth=('aitopics-guest', 'HvGSauJ00COgRnGX'))

data = response.json()
df = pd.DataFrame(data)
df

Unnamed: 0,concept-tagsConf,cdid,taxnodesConf,modified,authorsRaw,title
0,"[diagnostic medicine::175.778, machine learnin...",arxivorg:CBBFE8A2,"[Genre|Personal|Interview::0.477154, Genre|Per...",2021-01-12T00:00:00Z,"[Wang, Dakuo, Wang, Liuping, Zhang, Zhan, Wang...","""Brilliant AI Doctor"" in Rural China: Tensions..."
1,"[machine learning::262.338, reinforcement lear...",arxivorg:3E2F104A,[Industry|Health & Medicine|Therapeutic Area::...,2019-08-15T00:00:00Z,"[Hoey, Jesse, MacKinnon, Neil J.]","""Conservatives Overfit, Liberals Underfit"": Th..."
2,"[machine learning::593.251, natural language::...",arxivorg:70E22CB3,"[Country|North America::1.0, Country|North Ame...",2017-11-14T00:00:00Z,"[Israelsen, Brett W, Ahmed, Nisar R]","""Dave...I can assure you...that it's going to ..."
3,"[machine learning::133.55, artificial intellig...",arxivorg:307AE56B,"[Genre|Research Report::1.0, Genre|Research Re...",2021-07-27T00:00:00Z,"[Lyons, Michael J.]","""Excavating AI"" Re-excavated: Debunking a Fall..."
4,"[machine learning::18.4734, artificial intelli...",arxivorg:07A12138,[Technology|Information Technology|Artificial ...,2019-03-15T00:00:00Z,"[Hu, Baogang, Dong, Weiming]","""Ge Shu Zhi Zhi"": Towards Deep Understanding a..."
...,...,...,...,...,...,...
1995,"[machine learning::77.0321, natural language::...",arxivorg:A1D7D468,"[Country|North America::1.0, Country|Europe::0...",2020-12-25T00:00:00Z,"[Chen, Jiangjie, Bao, Qiaoben, Chen, Jiaze, Su...",LOREN: Logic Enhanced Neural Reasoning for Fac...
1996,"[programming language::191.115, logic & formal...",arxivorg:F751E408,"[Industry|Information Technology::0.671877, In...",2020-08-15T00:00:00Z,"[Warren, David S., Liu, Yanhong A.]",LPOP: Challenges and Advances in Logic and Pra...
1997,"[machine learning::51.7569, artificial intelli...",arxivorg:5A13FBA1,[Technology|Information Technology|Artificial ...,2021-01-30T00:00:00Z,"[Fan, Weiquan, Xu, Xiangmin, Xing, Xiaofen, Ch...",LSSED: a large-scale dataset and benchmark for...
1998,"[machine learning::103.109, reinforcement lear...",arxivorg:4C9942CA,[Technology|Information Technology|Artificial ...,2021-02-12T00:00:00Z,"[Vaezipoor, Pashootan, Li, Andrew, Icarte, Rod...",LTL2Action: Generalizing LTL Instructions for ...


## Creating a MultiIndex of Tags

In [7]:
def format_value(value):
    end = value.find('::')
    return (value[0:end])

def get_MultiIndex(title, array):
    table_values = []
    
    for item in array: 
        item = format_value(item)
        table_values.append((title, item))
    return table_values

result = []
for index in df.index:
    cdid = df.get('cdid')[index]
    value = df.get('concept-tagsConf')[index]

    # print(value)
    if isinstance(value, list):
        print(value)
        for item in value:
            item = format_value(item)
            result.append((cdid, item))

multi = pd.DataFrame(result, columns=["ID", "Tag"])
multi

['diagnostic medicine::175.778', 'machine learning::175.778', 'consumer health::175.778', 'natural language::175.778', 'artificial intelligence::175.778', 'medical record::175.778', 'health & medicine::175.778', 'survey article::175.778', 'cardiology::175.778', 'us government::175.778', 'oncology::175.778', 'neural network::175.778', 'vascular disease::175.778', 'ai-cdss::169.294', 'information::55.1516', 'wang::47.134', 'yokohama::39.5094', 'proceedings::52.7537', 'clinician::175.778', 'workflow::41.5888', 'tension and challenge::39.5788', 'participant::71.9369', 'recommendation::58.0593', 'computing system::47.134', 'china::62.7681', 'ehr system::43.7491', 'diagnosis::57.1278', 'rural clinic::39.7585', 'brilliant ai doctor::38.2777', 'human factor::37.2736', 'ai-cdss system::97.6451']
['machine learning::262.338', 'reinforcement learning::262.338', "alzheimer's disease::262.338", 'artificial intelligence::262.338', 'health & medicine::262.338', 'neurology::262.338', 'survey article::

Unnamed: 0,ID,Tag
0,arxivorg:CBBFE8A2,diagnostic medicine
1,arxivorg:CBBFE8A2,machine learning
2,arxivorg:CBBFE8A2,consumer health
3,arxivorg:CBBFE8A2,natural language
4,arxivorg:CBBFE8A2,artificial intelligence
...,...,...
44642,arxivorg:0779E8F1,time sery
44643,arxivorg:0779E8F1,convergence rate
44644,arxivorg:0779E8F1,euler-maruyama method
44645,arxivorg:0779E8F1,α-stable lévy motion


In [None]:
multi.set_index(['ID', 'Tag'], inplace=True)
multi.sort_index(inplace=True)
multi

In [None]:
type(multi)

In [None]:
df