# Importing the data

First, we will clean the data.

## HIV dataset

In [112]:
import pandas as pd
import numpy as np

In [113]:
# Read the raw CDC export, skipping the first 10 lines of the file
# (presumably export preamble ahead of the header row — confirm against the raw CSV).
data = pd.read_csv(
    "../Data/Raw/CDC/CDCHIVcountyracesex2017_2020.csv",
    skiprows=10,
)
print(data.shape)
data.head()

(154644, 8)


Unnamed: 0,Indicator,Year,Geography,FIPS,Sex,Cases,Rate per 100000,Percent
0,Receipt of HIV medical care,2020 (COVID-19 Pandemic),"Abbeville County, SC",45001,Male,39,,92.9
1,Receipt of HIV medical care,2020 (COVID-19 Pandemic),"Abbeville County, SC",45001,Female,7,,63.6
2,Receipt of HIV medical care,2019,"Abbeville County, SC",45001,Male,35,,87.5
3,Receipt of HIV medical care,2019,"Abbeville County, SC",45001,Female,10,,83.3
4,Receipt of HIV medical care,2018,"Abbeville County, SC",45001,Male,Data not available,,Data not available


##### Replacing the "2020 (COVID-19 Pandemic)" label with 2020

In [114]:
# Collapse any Year label that begins with "2020" (e.g. "2020 (COVID-19 Pandemic)")
# to the plain string "2020" so the column can later be parsed as a number.
# na=False treats missing Year values as non-matches; without it a NaN in the
# column produces a non-boolean mask that .loc refuses to index with.
data.loc[data["Year"].str.startswith("2020", na=False), "Year"] = "2020"

##### Removing the "Data suppressed" and "Data not available" keywords in Cases, Rate per 100000 and Percent

In [115]:
# Columns that mix numbers with "Data ..." placeholder strings and need cleaning.
data_to_be_removed = [
    "Cases",
    "Rate per 100000",
    "Percent",
]

In [116]:
for col in data_to_be_removed:
    # Blank out every entry containing "Data" (missing values count as
    # non-matches), then drop comma characters from the remaining strings.
    data[col] = (
        data[col]
        .mask(data[col].str.contains("Data", na=False))
        .str.replace(',', '')
    )

In [117]:
# Convert the cleaned columns to numeric dtypes; pd.to_numeric is applied
# column by column, so each column gets its own resulting dtype.
numeric_columns = ["Year", "FIPS", "Cases", "Rate per 100000", "Percent"]
data[numeric_columns] = data[numeric_columns].apply(pd.to_numeric)

In [118]:
# Check the column dtypes after the numeric conversion.
data.dtypes

Indicator           object
Year                 int64
Geography           object
FIPS                 int64
Sex                 object
Cases              float64
Rate per 100000    float64
Percent            float64
dtype: object

In [119]:
# Preview the cleaned rows: Year is now numeric and placeholder strings show as missing values.
data.head()

Unnamed: 0,Indicator,Year,Geography,FIPS,Sex,Cases,Rate per 100000,Percent
0,Receipt of HIV medical care,2020,"Abbeville County, SC",45001,Male,39.0,,92.9
1,Receipt of HIV medical care,2020,"Abbeville County, SC",45001,Female,7.0,,63.6
2,Receipt of HIV medical care,2019,"Abbeville County, SC",45001,Male,35.0,,87.5
3,Receipt of HIV medical care,2019,"Abbeville County, SC",45001,Female,10.0,,83.3
4,Receipt of HIV medical care,2018,"Abbeville County, SC",45001,Male,,,


In [120]:
# Reshape to one row per (Year, Geography, FIPS). No `values`/`aggfunc` are
# given, so pandas aggregates every remaining column with its default (mean).
data = data.pivot_table(
    index=['Year', 'Geography', 'FIPS'],
    columns=['Sex', 'Indicator'],
).reset_index()
# Flatten the three-level column labels into single underscore-joined names,
# e.g. "Cases_Female_HIV diagnoses". The index columns have empty lower
# levels, so the leftover leading/trailing underscores are stripped.
data.columns = ['_'.join(levels).strip('_') for levels in data.columns]
data.head()

Unnamed: 0,Year,Geography,FIPS,Cases_Female_HIV diagnoses,Cases_Female_HIV prevalence,Cases_Female_HIV viral suppression,Cases_Female_Linkage to HIV care,Cases_Female_PrEP coverage and number of persons prescribed,Cases_Female_Receipt of HIV medical care,Cases_Male_HIV diagnoses,...,Percent_Female_PrEP coverage and number of persons prescribed,Percent_Female_Receipt of HIV medical care,Percent_Male_HIV viral suppression,Percent_Male_Linkage to HIV care,Percent_Male_PrEP coverage and number of persons prescribed,Percent_Male_Receipt of HIV medical care,Rate per 100000_Female_HIV diagnoses,Rate per 100000_Female_HIV prevalence,Rate per 100000_Male_HIV diagnoses,Rate per 100000_Male_HIV prevalence
0,2017,"Abbeville County, SC",45001,0.0,12.0,,,,,0.0,...,,,,,,,0.0,109.5,0.0,356.1
1,2017,"Acadia Parish, LA",22001,0.0,42.0,,,,,6.0,...,,,,,,,0.0,159.2,24.5,413.2
2,2017,"Accomack County, VA",51001,,34.0,,,,,,...,,,,,,,,236.0,,470.6
3,2017,"Adair County, IA",19001,0.0,,,,,,0.0,...,,,,,,,0.0,,0.0,
4,2017,"Adair County, MO",29001,,8.0,,,,,,...,,,,,,,,68.7,,274.4


In [127]:
# Persist the wide table; index=False leaves the positional row index out of the file.
data.to_csv(
    "../Data/clean/CDC/disagRaceSexHIV2017_2020_clean.csv",
    index=False,
)

In [128]:
# Read the exported file back to confirm it was written and parses as expected.
pd.read_csv("../Data/clean/CDC/disagRaceSexHIV2017_2020_clean.csv")

Unnamed: 0,Year,Geography,FIPS,Cases_Female_HIV diagnoses,Cases_Female_HIV prevalence,Cases_Female_HIV viral suppression,Cases_Female_Linkage to HIV care,Cases_Female_PrEP coverage and number of persons prescribed,Cases_Female_Receipt of HIV medical care,Cases_Male_HIV diagnoses,...,Percent_Female_PrEP coverage and number of persons prescribed,Percent_Female_Receipt of HIV medical care,Percent_Male_HIV viral suppression,Percent_Male_Linkage to HIV care,Percent_Male_PrEP coverage and number of persons prescribed,Percent_Male_Receipt of HIV medical care,Rate per 100000_Female_HIV diagnoses,Rate per 100000_Female_HIV prevalence,Rate per 100000_Male_HIV diagnoses,Rate per 100000_Male_HIV prevalence
0,2017,"Abbeville County, SC",45001,0.0,12.0,,,,,0.0,...,,,,,,,0.0,109.5,0.0,356.1
1,2017,"Acadia Parish, LA",22001,0.0,42.0,,,,,6.0,...,,,,,,,0.0,159.2,24.5,413.2
2,2017,"Accomack County, VA",51001,,34.0,,,,,,...,,,,,,,,236.0,,470.6
3,2017,"Adair County, IA",19001,0.0,,,,,,0.0,...,,,,,,,0.0,,0.0,
4,2017,"Adair County, MO",29001,,8.0,,,,,,...,,,,,,,,68.7,,274.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9975,2020,"St. Joseph County, MI",26149,0.0,5.0,5.0,0.0,,5.0,0.0,...,,100.0,85.7,0.0,,91.4,0.0,19.8,0.0,139.5
9976,2020,"Vernon County, WI",55123,0.0,,,0.0,,,0.0,...,,,,0.0,,,0.0,,0.0,
9977,2020,"Washington County, AL",1129,0.0,,,0.0,,,0.0,...,,,,0.0,,,0.0,,0.0,
9978,2020,"Weakley County, TN",47183,0.0,0.0,0.0,0.0,,0.0,0.0,...,,0.0,76.5,0.0,,88.2,0.0,0.0,0.0,122.2


#### Experiment with STI

In [None]:
# Exploratory look at the 2016 STD export, skipping the first 4 lines of the
# file (presumably export preamble — confirm against the raw CSV).
pd.read_csv(
    "../Data/Raw/CDC/CDC STD 2016.csv",
    skiprows=4,
)

Unnamed: 0,Indicator,Year,Geography,FIPS,Race/Ethnicity,Sex,Cases,Rate per 100000
0,Primary and Secondary Syphilis,2016,"Abbeville County, SC",45001,White,Male,Data not available,Data not available
1,Primary and Secondary Syphilis,2016,"Abbeville County, SC",45001,White,Female,Data not available,Data not available
2,Primary and Secondary Syphilis,2016,"Abbeville County, SC",45001,Unknown,Male,Data not available,Data not available
3,Primary and Secondary Syphilis,2016,"Abbeville County, SC",45001,Unknown,Female,Data not available,Data not available
4,Primary and Secondary Syphilis,2016,"Abbeville County, SC",45001,Native Hawaiian/Other Pacific Islander,Male,Data not available,Data not available
...,...,...,...,...,...,...,...,...
206395,Chlamydia,2016,"Ziebach County, SD",46137,Black/African American,Female,Data not available,Data not available
206396,Chlamydia,2016,"Ziebach County, SD",46137,Asian,Male,Data not available,Data not available
206397,Chlamydia,2016,"Ziebach County, SD",46137,Asian,Female,Data not available,Data not available
206398,Chlamydia,2016,"Ziebach County, SD",46137,American Indian/Alaska Native,Male,Data not available,Data not available


In [4]:
# Debugging aid: show the kernel's working directory, which the relative
# "../Data/..." paths used in this notebook are resolved against.
import os
os.getcwd()

'/Users/chandramauli/Documents/My Files/bdsi22dataminingsti/notebooks'