## Observations and Insights 

In [1]:
%matplotlib notebook

In [2]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset
mouse_data_complete = pd.merge(study_results, mouse_metadata, how="left", on=["Mouse ID", "Mouse ID"])

In [3]:
mouse_data_complete

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,b128,0,45.000000,0,Capomulin,Female,9,22
1,f932,0,45.000000,0,Ketapril,Male,15,29
2,g107,0,45.000000,0,Ketapril,Female,2,29
3,a457,0,45.000000,0,Ketapril,Female,11,30
4,c819,0,45.000000,0,Ketapril,Male,21,25
...,...,...,...,...,...,...,...,...
1888,r944,45,41.581521,2,Capomulin,Male,12,25
1889,u364,45,31.023923,3,Capomulin,Male,18,17
1890,p438,45,61.433892,1,Ceftamin,Female,11,26
1891,x773,45,58.634971,4,Placebo,Female,21,30


In [4]:
# Checking the number of mice in the DataFrame.
mouse_data_complete["Mouse ID"].nunique()

249

In [5]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
mouse_data_complete["Mouse ID"].value_counts().head()

g989    13
m601    10
y163    10
y793    10
o926    10
Name: Mouse ID, dtype: int64

In [6]:
# Optional: Get all the data for the duplicate mouse ID. 
mouse_study = mouse_data_complete.set_index("Mouse ID")
bad_mouse = mouse_study.loc["g989", :]
bad_mouse

Unnamed: 0_level_0,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
Mouse ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
g989,0,45.0,0,Propriva,Female,21,26
g989,0,45.0,0,Propriva,Female,21,26
g989,5,48.786801,0,Propriva,Female,21,26
g989,5,47.570392,0,Propriva,Female,21,26
g989,10,51.745156,0,Propriva,Female,21,26
g989,10,49.880528,0,Propriva,Female,21,26
g989,15,51.325852,1,Propriva,Female,21,26
g989,15,53.44202,0,Propriva,Female,21,26
g989,20,55.326122,1,Propriva,Female,21,26
g989,20,54.65765,1,Propriva,Female,21,26


In [7]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
mouse_data_complete = mouse_data_complete[mouse_data_complete["Mouse ID"] != "g989"]

In [8]:
# Checking the number of mice in the clean DataFrame.
mouse_data_complete["Mouse ID"].value_counts().head()

m601    10
a251    10
e227    10
t198    10
w422    10
Name: Mouse ID, dtype: int64

In [9]:
mouse_data_complete.head()

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,b128,0,45.0,0,Capomulin,Female,9,22
1,f932,0,45.0,0,Ketapril,Male,15,29
2,g107,0,45.0,0,Ketapril,Female,2,29
3,a457,0,45.0,0,Ketapril,Female,11,30
4,c819,0,45.0,0,Ketapril,Male,21,25


## Summary Statistics

In [10]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
tumor_mean = mouse_data_complete.groupby("Drug Regimen")["Tumor Volume (mm3)"].mean()

tumor_median = mouse_data_complete.groupby("Drug Regimen")["Tumor Volume (mm3)"].median()

tumor_var = mouse_data_complete.groupby("Drug Regimen")["Tumor Volume (mm3)"].var()

tumor_std = mouse_data_complete.groupby("Drug Regimen")["Tumor Volume (mm3)"].std()

tumor_sem = mouse_data_complete.groupby("Drug Regimen")["Tumor Volume (mm3)"].sem()
# This method is the most straighforward, creating multiple series and putting them all together at the end.

In [11]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
summary_statistics = pd.DataFrame({"Mean": tumor_mean, "Median": tumor_median, "Variance": tumor_var, "Standard Deviation": tumor_std, "SEM": tumor_sem})

summary_statistics

Unnamed: 0_level_0,Mean,Median,Variance,Standard Deviation,SEM
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,40.675741,41.557809,24.947764,4.994774,0.329346
Ceftamin,52.591172,51.776157,39.290177,6.268188,0.469821
Infubinol,52.884795,51.820584,43.128684,6.567243,0.492236
Ketapril,55.235638,53.698743,68.553577,8.279709,0.60386
Naftisol,54.331565,52.509285,66.173479,8.134708,0.596466
Placebo,54.033581,52.288934,61.168083,7.821003,0.581331
Propriva,52.32093,50.446266,43.852013,6.622085,0.544332
Ramicane,40.216745,40.673236,23.486704,4.846308,0.320955
Stelasyn,54.233149,52.431737,59.450562,7.710419,0.573111
Zoniferol,53.236507,51.818479,48.533355,6.966589,0.516398


## Bar Plots

In [12]:
mouse_data_complete["Drug Regimen"].value_counts()

Capomulin    230
Ramicane     228
Ketapril     188
Naftisol     186
Zoniferol    182
Placebo      181
Stelasyn     181
Infubinol    178
Ceftamin     178
Propriva     148
Name: Drug Regimen, dtype: int64

In [13]:
# Generate a bar plot showing the number of mice per time point for each treatment throughout the course of the study using pandas.
# Filter the DataFrame down only to those columns to chart
mouse_and_timepoint = mouse_data_complete["Drug Regimen"].value_counts()

# Use DataFrame.plot() in order to create a bar chart of the data
mouse_and_timepoint.plot(kind="bar")

# Set a title for the chart
plt.title("Number of Data Points per Drug Regimen")
plt.xlabel("Drug Regimen")
plt.ylabel("Number of Data Points")

plt.show()
plt.tight_layout()

<IPython.core.display.Javascript object>

In [14]:
# Generate a bar plot showing the number of mice per time point for each treatment throughout the course of the study using pyplot.
# Set x axis
x_axis = list(range(10))

plt.figure()
plt.bar(x_axis, mouse_and_timepoint)
plt.xticks(x_axis, mouse_and_timepoint.index, rotation="vertical")
plt.title("Number of Data Points per Drug Regimen")
plt.xlabel("Drug Regimen")
plt.ylabel("Number of Data Points")
plt.tight_layout()

<IPython.core.display.Javascript object>

## Pie Plots

In [15]:
mouse_data_complete.groupby("Sex")["Mouse ID"].nunique()

Sex
Female    123
Male      125
Name: Mouse ID, dtype: int64

In [16]:
pie = mouse_data_complete.groupby("Sex")["Mouse ID"].nunique()
colors = ["tab:orange", "tab:blue"]

In [17]:
# Generate a pie plot showing the distribution of female versus male mice using pandas
plt.figure()
pie.plot(kind="pie", autopct="%1.1f%%", startangle=180, colors=colors)
plt.title("Mice by Sex")
plt.ylabel("Sex")
plt.tight_layout()

<IPython.core.display.Javascript object>

In [18]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
plt.figure()
plt.pie(pie, autopct="%1.1f%%", startangle=180, colors=colors)
plt.title("Mice by Sex")
plt.ylabel("Sex")
plt.tight_layout()

<IPython.core.display.Javascript object>

## Quartiles, Outliers and Boxplots

In [21]:
final_timepoint = mouse_data_complete.groupby("Mouse ID")["Timepoint"].max()
final_timepoint = pd.DataFrame(final_timepoint)
final_timepoint

Unnamed: 0_level_0,Timepoint
Mouse ID,Unnamed: 1_level_1
a203,45
a251,45
a262,45
a275,45
a366,30
...,...
z435,10
z578,45
z581,45
z795,45


In [22]:
final_timepoint = pd.merge(final_timepoint, mouse_data_complete, how="left", on=["Mouse ID", "Timepoint"])
final_timepoint.head()

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,a203,45,67.973419,2,Infubinol,Female,20,23
1,a251,45,65.525743,1,Infubinol,Female,21,25
2,a262,45,70.717621,4,Placebo,Female,17,29
3,a275,45,62.999356,3,Ceftamin,Female,20,28
4,a366,30,63.440686,1,Stelasyn,Female,16,29


In [23]:
capomulin = final_timepoint[final_timepoint["Drug Regimen"] == "Capomulin"]
capomulin

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
19,b128,45,38.982878,2,Capomulin,Female,9,22
24,b742,45,38.939633,0,Capomulin,Male,7,21
61,f966,20,30.485985,0,Capomulin,Male,16,17
64,g288,45,37.074024,1,Capomulin,Male,3,19
66,g316,45,40.15922,2,Capomulin,Female,22,22
80,i557,45,47.685963,1,Capomulin,Female,1,24
83,i738,45,37.311846,2,Capomulin,Female,23,20
85,j119,45,38.125164,1,Capomulin,Female,7,23
87,j246,35,38.753265,1,Capomulin,Female,21,21
108,l509,45,41.483008,3,Capomulin,Male,17,21


In [24]:
quartiles_cap = capomulin["Tumor Volume (mm3)"].quantile([.25,.5,.75])
lowerq_cap = quartiles_cap[0.25]
upperq_cap = quartiles_cap[0.75]
iqr_cap = upperq_cap-lowerq_cap
lower_bound_cap = lowerq_cap - (1.5*iqr_cap)
upper_bound_cap = upperq_cap + (1.5*iqr_cap)

print(f"The lower quartile of Capomulin is: {lowerq_cap}")
print(f"The upper quartile of Capomulin is: {upperq_cap}")
print(f"The interquartile range of Capomulin is: {iqr_cap}")
print(f"Values below {lower_bound_cap} could be outliers.")
print(f"Values above {upper_bound_cap} could be outliers.")

The lower quartile of Capomulin is: 32.37735684
The upper quartile of Capomulin is: 40.1592203
The interquartile range of Capomulin is: 7.781863460000004
Values below 20.70456164999999 could be outliers.
Values above 51.83201549 could be outliers.


In [25]:
ramicane = final_timepoint[final_timepoint["Drug Regimen"] == "Ramicane"]
ramicane

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
6,a411,45,38.407618,1,Ramicane,Male,3,22
7,a444,45,43.047543,0,Ramicane,Female,10,25
10,a520,45,38.810366,1,Ramicane,Male,13,21
12,a644,45,32.978522,1,Ramicane,Female,7,17
33,c458,30,38.342008,2,Ramicane,Female,23,20
37,c758,45,33.397653,1,Ramicane,Male,9,17
45,d251,45,37.311236,2,Ramicane,Female,8,19
52,e662,45,40.659006,2,Ramicane,Male,8,24
70,g791,45,29.128472,1,Ramicane,Male,11,16
76,i177,45,33.562402,3,Ramicane,Male,10,18


In [26]:
quartiles_ram = ramicane["Tumor Volume (mm3)"].quantile([.25,.5,.75])
lowerq_ram = quartiles_ram[0.25]
upperq_ram = quartiles_ram[0.75]
iqr_ram = upperq_ram-lowerq_ram
lower_bound_ram = lowerq_ram - (1.5*iqr_ram)
upper_bound_ram = upperq_ram + (1.5*iqr_ram)

print(f"The lower quartile of Ramicane is: {lowerq_ram}")
print(f"The upper quartile of Ramicane is: {upperq_ram}")
print(f"The interquartile range of Ramicane is: {iqr_ram}")
print(f"Values below {lower_bound_ram} could be outliers.")
print(f"Values above {upper_bound_ram} could be outliers.")

The lower quartile of Ramicane is: 31.56046955
The upper quartile of Ramicane is: 40.65900627
The interquartile range of Ramicane is: 9.098536719999998
Values below 17.912664470000003 could be outliers.
Values above 54.30681135 could be outliers.


In [27]:
infubinol = final_timepoint[final_timepoint["Drug Regimen"] == "Infubinol"]
infubinol

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,a203,45,67.973419,2,Infubinol,Female,20,23
1,a251,45,65.525743,1,Infubinol,Female,21,25
11,a577,30,57.031862,2,Infubinol,Female,6,25
13,a685,45,66.083066,3,Infubinol,Male,8,30
27,c139,45,72.226731,2,Infubinol,Male,11,28
31,c326,5,36.321346,0,Infubinol,Female,18,25
41,c895,30,60.969711,2,Infubinol,Female,7,29
50,e476,45,62.435404,1,Infubinol,Male,23,26
56,f345,45,60.918767,1,Infubinol,Male,23,26
78,i386,40,67.289621,4,Infubinol,Female,23,29


In [28]:
quartiles_inf = infubinol["Tumor Volume (mm3)"].quantile([.25,.5,.75])
lowerq_inf = quartiles_inf[0.25]
upperq_inf = quartiles_inf[0.75]
iqr_inf = upperq_inf-lowerq_inf
lower_bound_inf = lowerq_inf - (1.5*iqr_inf)
upper_bound_inf = upperq_inf + (1.5*iqr_inf)

print(f"The lower quartile of Infubinol is: {lowerq_inf}")
print(f"The upper quartile of Infubinol is: {upperq_inf}")
print(f"The interquartile range of Infubinol is: {iqr_inf}")
print(f"Values below {lower_bound_inf} could be outliers.")
print(f"Values above {upper_bound_inf} could be outliers.")

The lower quartile of Infubinol is: 54.04860769
The upper quartile of Infubinol is: 65.52574285
The interquartile range of Infubinol is: 11.477135160000003
Values below 36.83290494999999 could be outliers.
Values above 82.74144559000001 could be outliers.


In [29]:
ceftamin = final_timepoint[final_timepoint["Drug Regimen"] == "Ceftamin"]
ceftamin

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
3,a275,45,62.999356,3,Ceftamin,Female,20,28
21,b447,0,45.0,0,Ceftamin,Male,2,30
22,b487,25,56.057749,1,Ceftamin,Female,6,28
25,b759,30,55.742829,1,Ceftamin,Female,12,25
58,f436,15,48.722078,2,Ceftamin,Female,3,25
75,h531,5,47.784682,0,Ceftamin,Male,5,27
88,j296,45,61.849023,3,Ceftamin,Female,24,30
94,k210,45,68.923185,3,Ceftamin,Male,15,28
106,l471,45,67.748662,1,Ceftamin,Female,7,28
107,l490,30,57.918381,3,Ceftamin,Male,24,26


In [30]:
quartiles_cef = ceftamin["Tumor Volume (mm3)"].quantile([.25,.5,.75])
lowerq_cef = quartiles_cef[0.25]
upperq_cef = quartiles_cef[0.75]
iqr_cef = upperq_ram-lowerq_ram
lower_bound_cef = lowerq_cef - (1.5*iqr_cef)
upper_bound_cef = upperq_cef + (1.5*iqr_cef)

print(f"The lower quartile of Ceftamin is: {lowerq_cef}")
print(f"The upper quartile of Ceftamin is: {upperq_cef}")
print(f"The interquartile range of Ceftamin is: {iqr_cef}")
print(f"Values below {lower_bound_cef} could be outliers.")
print(f"Values above {upper_bound_cef} could be outliers.")

The lower quartile of Ceftamin is: 48.72207785
The upper quartile of Ceftamin is: 64.29983003
The interquartile range of Ceftamin is: 9.098536719999998
Values below 35.07427277 could be outliers.
Values above 77.94763511 could be outliers.


In [31]:
box_dict = {"Capomulin": capomulin["Tumor Volume (mm3)"], "Ramicane": ramicane["Tumor Volume (mm3)"], "Infubinol": infubinol["Tumor Volume (mm3)"], "Ceftamin": ceftamin["Tumor Volume (mm3)"]}
outlier = dict(marker='o', markerfacecolor="red", markeredgecolor="black", markersize=12)
fig, ax = plt.subplots()
ax.boxplot(box_dict.values(), flierprops=outlier)
ax.set_xticklabels(box_dict.keys())
ax.set_ylabel("Final Tumor Volume (mm3)")

<IPython.core.display.Javascript object>

Text(0, 0.5, 'Final Tumor Volume (mm3)')

In [32]:
# Calculate the final tumor volume of each mouse across four of the most promising treatment regimens. Calculate the IQR and quantitatively determine if there are any potential outliers. 


In [33]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest

## Line and Scatter Plots

In [34]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin
mouse = mouse_data_complete.loc[mouse_data_complete["Mouse ID"] == "b128"]
mouse_timepoint = mouse["Timepoint"].values
mouse_tumor = mouse["Tumor Volume (mm3)"].values
plt.figure()
plt.plot(mouse_timepoint, mouse_tumor)
plt.xticks(mouse_timepoint)
plt.title("Capomulin Treatment of Mouse b128")
plt.xlabel("Timepoint (Days)")
plt.ylabel("Tumor Volume (mm3)")
plt.tight_layout()

<IPython.core.display.Javascript object>

In [35]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen
capomulin_mice = mouse_data_complete[mouse_data_complete["Drug Regimen"] == "Capomulin"]
capomulin_average = capomulin_mice.groupby("Mouse ID")["Tumor Volume (mm3)"].mean()
capomulin_weight = capomulin_mice.groupby("Mouse ID")["Weight (g)"].mean()
plt.figure()
plt.scatter(capomulin_weight,capomulin_average)
plt.title("Mouse Weight vs. Average Tumor Volume")
plt.xlabel("Weight (g)")
plt.ylabel("Average Tumor Volume")
plt.show()

<IPython.core.display.Javascript object>

## Correlation and Regression

In [36]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
print(f"The correlation coefficient between mouse weight and the average tumor volume is {round(st.pearsonr(capomulin_weight,capomulin_average)[0],2)}")
plt.figure()
(slope, intercept, rvalue, pvalue, stderr) = st.linregress(capomulin_weight,capomulin_average)
regress_values = capomulin_weight * slope + intercept
plt.scatter(capomulin_weight,capomulin_average)
plt.plot(capomulin_weight,regress_values,"r-")
plt.title("Mouse Weight vs. Average Tumor Volume")
plt.xlabel("Weight (g)")
plt.ylabel("Average Tumor Volume")
plt.show()

The correlation coefficient between mouse weight and the average tumor volume is 0.84


<IPython.core.display.Javascript object>