<a href="https://colab.research.google.com/github/Kiarro21/Kiarro21/blob/main/DataCleaning1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# modules we'll use
import pandas as pd
import numpy as np

# Fix the NumPy global RNG seed so any random steps are repeatable.
np.random.seed(0)

# Load the web-scraped disease/symptom table from the working directory.
data = pd.read_csv("WebScrapedDiseaseData.csv")

In [32]:
# Preview the first five rows of the disease data.
# Missing values (NaN) already show up in the 'Disease' and
# 'Count of Disease Occurrence' columns.
data.head(5)

Unnamed: 0,Disease,Count of Disease Occurrence,Symptom
0,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0008031_pain chest
1,,,UMLS:C0392680_shortness of breath
2,,,UMLS:C0012833_dizziness
3,,,UMLS:C0004093_asthenia
4,,,UMLS:C0085639_fall


In [12]:
# Count the missing entries in each column.
missing_values_count = data.isna().sum()

# Show the counts for (at most) the first ten columns.
missing_values_count.head(10)

Disease                        1732
Count of Disease Occurrence    1732
Symptom                           1
dtype: int64

In [21]:
# How much of the dataset is missing overall?
# Uses missing_values_count (per-column NaN counts) computed in an earlier cell.
# np.prod replaces np.product, which was deprecated in NumPy 1.25 and
# removed in NumPy 2.0.
total_cells = np.prod(data.shape)  # rows * columns
total_missing = missing_values_count.sum()

print(total_cells)
print(total_missing)

# Share of all cells that are missing, as a percentage.
percent_missing = (total_missing/total_cells) * 100
print(percent_missing)

5598
3465
61.89710610932476


In [30]:
# Drop every row that has at least one missing value.
# dropna returns a new DataFrame; the result is only displayed here,
# so `data` itself is left unchanged.
data.dropna(axis=0, how="any")

Unnamed: 0,Disease,Count of Disease Occurrence,Symptom
0,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0008031_pain chest
12,UMLS:C0011847_diabetes,1421.0,UMLS:C0032617_polyuria
26,UMLS:C0011570_depression mental^UMLS:C0011581...,1337.0,UMLS:C0424000_feeling suicidal
47,UMLS:C0010054_coronary arteriosclerosis^UMLS:...,1284.0,UMLS:C0008031_pain chest
56,UMLS:C0032285_pneumonia,1029.0,UMLS:C0010200_cough
...,...,...,...
1806,UMLS:C1258215_ileus,56.0,UMLS:C0549483_abscess bacterial
1821,UMLS:C0001511_adhesion,57.0,UMLS:C0016204_flatulence
1834,UMLS:C0011253_delusion,56.0,UMLS:C0240233_loose associations
1855,UMLS:C0233472_affect labile,45.0,UMLS:C0277794_extreme exhaustion


In [29]:
# Drop every column that has at least one missing value.
# Every column in this dataset contains a NaN, so no columns are left.
columns_with_na_dropped = data.dropna(axis="columns")
columns_with_na_dropped.head(5)

0
1
2
3
4


In [36]:
# Compare the column counts before and after dropping columns with NaNs.
print("Columns in original dataset: %d \n" % len(data.columns))
print("Columns with na's dropped: %d" % len(columns_with_na_dropped.columns))

Columns in original dataset: 3 

Columns with na's dropped: 0


In [39]:
# Take a small subset of the disease data: the columns from
# 'Disease' through 'Symptom' (label slices are inclusive), first five rows.
subset_data = data.loc[:, "Disease":"Symptom"].head(5)
subset_data

Unnamed: 0,Disease,Count of Disease Occurrence,Symptom
0,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0008031_pain chest
1,,,UMLS:C0392680_shortness of breath
2,,,UMLS:C0012833_dizziness
3,,,UMLS:C0004093_asthenia
4,,,UMLS:C0085639_fall


In [40]:
# Replace every missing value with 0.
# Note: this also puts the integer 0 into the text column 'Disease'.
subset_data.fillna(value=0)

Unnamed: 0,Disease,Count of Disease Occurrence,Symptom
0,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0008031_pain chest
1,0,0.0,UMLS:C0392680_shortness of breath
2,0,0.0,UMLS:C0012833_dizziness
3,0,0.0,UMLS:C0004093_asthenia
4,0,0.0,UMLS:C0085639_fall


In [42]:
# Replace each NA with the value that comes directly after it in the same
# column (backward fill), then replace any remaining NAs with 0.
# DataFrame.bfill replaces fillna(method='bfill'), which is deprecated
# since pandas 2.1.
# NOTE(review): in this five-row subset no non-missing value follows the NaNs,
# so nothing is back-filled and every NaN ends up as 0. The disease name
# appears only on the first row of each group, so a forward fill (ffill) is
# presumably what this dataset needs -- confirm before using this for cleaning.
subset_data.bfill(axis=0).fillna(0)

Unnamed: 0,Disease,Count of Disease Occurrence,Symptom
0,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0008031_pain chest
1,0,0.0,UMLS:C0392680_shortness of breath
2,0,0.0,UMLS:C0012833_dizziness
3,0,0.0,UMLS:C0004093_asthenia
4,0,0.0,UMLS:C0085639_fall
