# Importing the data

First, we load and clean the data.

## HIV dataset

In [43]:
import pandas as pd
import numpy as np

In [44]:
# Load the raw CDC county-level HIV table. skiprows=10 drops the first ten lines of
# the file (presumably export preamble above the header row -- confirm against the raw file).
raw_csv_path = "../Data/Raw/CDC/CDCHIVcountyracesex2017_2020.csv"
data = pd.read_csv(raw_csv_path, skiprows=10)

# Show the dimensions first, then preview the leading rows as the cell output.
print(data.shape)
data.head()

(154644, 8)


Unnamed: 0,Indicator,Year,Geography,FIPS,Sex,Cases,Rate per 100000,Percent
0,Receipt of HIV medical care,2020 (COVID-19 Pandemic),"Abbeville County, SC",45001,Male,39,,92.9
1,Receipt of HIV medical care,2020 (COVID-19 Pandemic),"Abbeville County, SC",45001,Female,7,,63.6
2,Receipt of HIV medical care,2019,"Abbeville County, SC",45001,Male,35,,87.5
3,Receipt of HIV medical care,2019,"Abbeville County, SC",45001,Female,10,,83.3
4,Receipt of HIV medical care,2018,"Abbeville County, SC",45001,Male,Data not available,,Data not available


In [45]:
# Map the original CDC headers to the lowercase / camelCase names used from here on.
column_renames = {
    'Indicator': 'indicator',
    'Year': 'year',
    'Geography': 'geography',
    'FIPS': 'fips',
    'Sex': 'sex',
    'Cases': 'cases',
    'Rate per 100000': 'ratePer100000',
    'Percent': 'percent',
}
data = data.rename(columns=column_renames)
data.head()

Unnamed: 0,indicator,year,geography,fips,sex,cases,ratePer100000,percent
0,Receipt of HIV medical care,2020 (COVID-19 Pandemic),"Abbeville County, SC",45001,Male,39,,92.9
1,Receipt of HIV medical care,2020 (COVID-19 Pandemic),"Abbeville County, SC",45001,Female,7,,63.6
2,Receipt of HIV medical care,2019,"Abbeville County, SC",45001,Male,35,,87.5
3,Receipt of HIV medical care,2019,"Abbeville County, SC",45001,Female,10,,83.3
4,Receipt of HIV medical care,2018,"Abbeville County, SC",45001,Male,Data not available,,Data not available


##### Replacing the "2020 (COVID-19 Pandemic)" label with 2020

In [46]:
# Collapse the annotated label "2020 (COVID-19 Pandemic)" to plain "2020" so the
# year column can be converted to a number later on.
# na=False makes rows with a missing year evaluate to False instead of NaN; a mask
# that contains NaN makes .loc raise. This matches the na=False already used by the
# placeholder clean-up on the measure columns.
is_2020 = data["year"].str.startswith('2020', na=False)
data.loc[is_2020, "year"] = "2020"

##### Removing the "Data suppressed" and "Data not available" placeholders from the cases, rate and percent columns

In [47]:
# Measure columns whose text placeholders and thousands separators get cleaned below.
# Despite the name, these columns are kept; only the placeholder values are blanked.
data_to_be_removed = [
    "cases",
    "ratePer100000",
    "percent",
]

In [48]:
for column in data_to_be_removed:
    # Any cell whose text contains "Data" (e.g. "Data not available") is a placeholder,
    # not a measurement; cells that are already missing count as non-matches.
    is_placeholder = data[column].str.contains("Data", na=False)
    data.loc[is_placeholder, column] = np.nan

    # Strip thousands separators so the remaining text can be parsed as numbers.
    data[column] = data[column].str.replace(',', '')

In [49]:
# Convert the identifier and measure columns to numeric dtypes in one pass;
# pd.to_numeric is applied to each listed column separately.
numeric_columns = ["year", "fips", "cases", "ratePer100000", "percent"]
data[numeric_columns] = data[numeric_columns].apply(pd.to_numeric)

In [50]:
# Verify the conversion: list the dtype of every column after the numeric cast.
data.dtypes
# data.head()

indicator         object
year               int64
geography         object
fips               int64
sex               object
cases            float64
ratePer100000    float64
percent          float64
dtype: object

In [51]:
# Reshape from long to wide: one row per (year, geography, fips), and one column per
# measure / sex / indicator combination.
# NOTE: pivot_table aggregates with its default function (mean), so if a key
# combination ever appears more than once the values are averaged silently.
row_keys = ['year', 'geography', 'fips']
column_keys = ['sex', 'indicator']
wide = data.pivot_table(index=row_keys, columns=column_keys).reset_index()

# Flatten the multi-level header into single strings such as
# "cases_Female_HIV diagnoses". The key columns have empty lower levels, so joining
# leaves trailing underscores ("year__") that strip('_') removes.
flat_names = wide.columns.map('_'.join).str.strip('_')
wide.columns = flat_names

data = wide
data.head()

Unnamed: 0,year,geography,fips,cases_Female_HIV diagnoses,cases_Female_HIV prevalence,cases_Female_HIV viral suppression,cases_Female_Linkage to HIV care,cases_Female_PrEP coverage and number of persons prescribed,cases_Female_Receipt of HIV medical care,cases_Male_HIV diagnoses,...,percent_Female_PrEP coverage and number of persons prescribed,percent_Female_Receipt of HIV medical care,percent_Male_HIV viral suppression,percent_Male_Linkage to HIV care,percent_Male_PrEP coverage and number of persons prescribed,percent_Male_Receipt of HIV medical care,ratePer100000_Female_HIV diagnoses,ratePer100000_Female_HIV prevalence,ratePer100000_Male_HIV diagnoses,ratePer100000_Male_HIV prevalence
0,2017,"Abbeville County, SC",45001,0.0,12.0,,,,,0.0,...,,,,,,,0.0,109.5,0.0,356.1
1,2017,"Acadia Parish, LA",22001,0.0,42.0,,,,,6.0,...,,,,,,,0.0,159.2,24.5,413.2
2,2017,"Accomack County, VA",51001,,34.0,,,,,,...,,,,,,,,236.0,,470.6
3,2017,"Adair County, IA",19001,0.0,,,,,,0.0,...,,,,,,,0.0,,0.0,
4,2017,"Adair County, MO",29001,,8.0,,,,,,...,,,,,,,,68.7,,274.4


In [52]:
# Sanity check: the distinct years present after cleaning and reshaping.
data["year"].unique()

array([2017, 2018, 2019, 2020])

In [53]:
# data.to_csv("..//disagRaceSexHIV2017_2020.csv")

In [None]:
# Export is disabled; uncomment the next line to write the cleaned table to disk.
# data.to_csv("../Data/clean/CDC/disagRaceSexHIV2017_2020_clean.csv", index=False)