# Cancer Test Results

Here, I analyze a curated dataset that records each patient's cancer test result alongside whether the patient actually has cancer. I do this to solidify my understanding of conditional probability and Bayes' rule.

In [1]:
# Load the patient records and preview the first five rows.
# NOTE(review): the preview shows an unnamed, index-like leading column, which
# looks like a previously saved index -- consider index_col=0 if it is not needed.
import pandas as pd

dataset_path = "cancer_dataset.csv"
df = pd.read_csv(dataset_path)
df.head(5)

Unnamed: 0.1,Unnamed: 0,patient_id,test_result,has_cancer
0,0,79452,Negative,False
1,1,81667,Positive,True
2,2,76297,Negative,False
3,3,36593,Negative,False
4,4,53717,Negative,False


In [2]:
# Summary statistics for the numeric columns; the "count" row reports the
# non-null entries per column, which here gives the number of patients.
numeric_summary = df.describe()
numeric_summary

Unnamed: 0.1,Unnamed: 0,patient_id
count,2914.0,2914.0
mean,1456.5,56196.837337
std,841.343667,15341.382912
min,0.0,29304.0
25%,728.25,42873.0
50%,1456.5,55957.5
75%,2184.75,69730.5
max,2913.0,82300.0


In [3]:
# number of patients with cancer
# assumes has_cancer was parsed as a boolean column (the preview shows True/False),
# so summing it counts the True values
int(df["has_cancer"].sum())

306

In [4]:
# number of patients without cancer
# assumes has_cancer is a boolean column: ~ flips the mask, and summing the
# flipped mask counts the patients flagged False
int((~df["has_cancer"]).sum())

2608

### Calculating Priors

In [5]:
# prior P(cancer): proportion of patients with cancer
# assumes has_cancer is a boolean column, so its mean is the fraction of True values
float(df["has_cancer"].mean())

0.10501029512697323

In [6]:
# prior P(no cancer): proportion of patients without cancer
# assumes has_cancer is a boolean column; the mean of the negated mask is the
# fraction of False values
float((~df["has_cancer"]).mean())

0.8949897048730268

### Calculating Likelihoods (Conditional Probabilities of the Test Result Given Cancer Status)

In [7]:
# P(Positive | cancer): proportion of patients with cancer who test positive
# assumes has_cancer is a boolean column, so it can be used directly as the row mask
patients_with_cancer = df[df["has_cancer"]]
# the mean of a boolean mask is the fraction of rows that match
float((patients_with_cancer["test_result"] == "Positive").mean())

0.9052287581699346

In [8]:
# Share of the patients with cancer whose test came back negative
negative_with_cancer = int(patients_with_cancer["test_result"].eq("Negative").sum())
negative_with_cancer / len(patients_with_cancer)

0.09477124183006536

In [9]:
# P(Positive | no cancer): proportion of patients without cancer who test positive
# assumes has_cancer is a boolean column, so ~ selects the rows flagged False
patients_without_cancer = df[~df["has_cancer"]]
# the mean of a boolean mask is the fraction of rows that match
float((patients_without_cancer["test_result"] == "Positive").mean())

0.2036042944785276

In [10]:
# Share of the patients without cancer whose test came back negative
negative_without_cancer = int(patients_without_cancer["test_result"].eq("Negative").sum())
negative_without_cancer / len(patients_without_cancer)

0.7963957055214724

### Calculating Posterior Probabilities

In [11]:
# P(cancer | Positive): what proportion of patients who tested positive have cancer?
positive_patients = df[df["test_result"] == "Positive"]
# assumes has_cancer is a boolean column, so its mean is the fraction of True values
float(positive_patients["has_cancer"].mean())

0.34282178217821785

In [12]:
# P(no cancer | Positive): what proportion of patients who tested positive don't have cancer?
# assumes has_cancer is a boolean column; the mean of the negated mask is the
# fraction of False values
float((~positive_patients["has_cancer"]).mean())

0.6571782178217822

In [13]:
# P(cancer | Negative): what proportion of patients who tested negative have cancer?
negative_patients = df[df["test_result"] == "Negative"]
# assumes has_cancer is a boolean column, so its mean is the fraction of True values
float(negative_patients["has_cancer"].mean())

0.013770180436847104

In [14]:
# P(no cancer | Negative): what proportion of patients who tested negative don't have cancer?
# assumes has_cancer is a boolean column; the mean of the negated mask is the
# fraction of False values
float((~negative_patients["has_cancer"]).mean())

0.9862298195631529