## Observations and Insights 

In [1]:
%matplotlib notebook

In [None]:
# 1. there was a strong correlation between mouse weight and average tumour volume
# 2. Capomulin and Ramicane had lower tumour volume on average when compared to Infubinol and Ceftamin. 
# This indicates that the former too regimens are likely to be more effective than the latter.
# 3. There was a significant decrease in tumour volume for mouse "g316" within the last 5 days of treatment on Capumolin

In [2]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset
combine = study_results.merge(mouse_metadata, on = "Mouse ID")
# Display the data table for preview
combine

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,b128,0,45.000000,0,Capomulin,Female,9,22
1,b128,5,45.651331,0,Capomulin,Female,9,22
2,b128,10,43.270852,0,Capomulin,Female,9,22
3,b128,15,43.784893,0,Capomulin,Female,9,22
4,b128,20,42.731552,0,Capomulin,Female,9,22
...,...,...,...,...,...,...,...,...
1888,m601,25,33.118756,1,Capomulin,Male,22,17
1889,m601,30,31.758275,1,Capomulin,Male,22,17
1890,m601,35,30.834357,1,Capomulin,Male,22,17
1891,m601,40,31.378045,1,Capomulin,Male,22,17


In [3]:
# Checking the number of mice.
combine_gb = combine.groupby("Mouse ID")
mouse_count = len(combine_gb.count())
mouse_count

249

In [4]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
dups = combine[combine.duplicated(['Mouse ID', 'Timepoint'], keep=False)]
dups

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
860,g989,0,45.0,0,Propriva,Female,21,26
861,g989,0,45.0,0,Propriva,Female,21,26
862,g989,5,48.786801,0,Propriva,Female,21,26
863,g989,5,47.570392,0,Propriva,Female,21,26
864,g989,10,51.745156,0,Propriva,Female,21,26
865,g989,10,49.880528,0,Propriva,Female,21,26
866,g989,15,51.325852,1,Propriva,Female,21,26
867,g989,15,53.44202,0,Propriva,Female,21,26
868,g989,20,55.326122,1,Propriva,Female,21,26
869,g989,20,54.65765,1,Propriva,Female,21,26


In [5]:
# Optional: Get all the data for the duplicate mouse ID. 

dupID = combine.loc[combine["Mouse ID"] == "g989"]
dupID

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
860,g989,0,45.0,0,Propriva,Female,21,26
861,g989,0,45.0,0,Propriva,Female,21,26
862,g989,5,48.786801,0,Propriva,Female,21,26
863,g989,5,47.570392,0,Propriva,Female,21,26
864,g989,10,51.745156,0,Propriva,Female,21,26
865,g989,10,49.880528,0,Propriva,Female,21,26
866,g989,15,51.325852,1,Propriva,Female,21,26
867,g989,15,53.44202,0,Propriva,Female,21,26
868,g989,20,55.326122,1,Propriva,Female,21,26
869,g989,20,54.65765,1,Propriva,Female,21,26


In [6]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
clean = combine.drop(dupID.index, axis=0)
clean

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,b128,0,45.000000,0,Capomulin,Female,9,22
1,b128,5,45.651331,0,Capomulin,Female,9,22
2,b128,10,43.270852,0,Capomulin,Female,9,22
3,b128,15,43.784893,0,Capomulin,Female,9,22
4,b128,20,42.731552,0,Capomulin,Female,9,22
...,...,...,...,...,...,...,...,...
1888,m601,25,33.118756,1,Capomulin,Male,22,17
1889,m601,30,31.758275,1,Capomulin,Male,22,17
1890,m601,35,30.834357,1,Capomulin,Male,22,17
1891,m601,40,31.378045,1,Capomulin,Male,22,17


In [7]:
# Checking the number of mice in the clean DataFrame.
newmousecount = clean.groupby("Mouse ID")

newmousecountlength = len(newmousecount.count())
newmousecountlength

248

## Summary Statistics

In [8]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# This method is the most straighforward, creating multiple series and putting them all together at the end.

regimen = clean.groupby("Drug Regimen")
mean_reg = regimen["Tumor Volume (mm3)"].mean()
median_reg = regimen["Tumor Volume (mm3)"].median()
std_reg = regimen["Tumor Volume (mm3)"].std()
sem_reg = regimen["Tumor Volume (mm3)"].sem()
var_reg = regimen["Tumor Volume (mm3)"].var()

tum_regimen_sst = pd.DataFrame({"Mean(mm3)":mean_reg, "Median(mm3)":median_reg, "Std Deviation":std_reg, "SEM":sem_reg, "Variance":var_reg})
tum_regimen_sst.round(2)

Unnamed: 0_level_0,Mean(mm3),Median(mm3),Std Deviation,SEM,Variance
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,40.68,41.56,4.99,0.33,24.95
Ceftamin,52.59,51.78,6.27,0.47,39.29
Infubinol,52.88,51.82,6.57,0.49,43.13
Ketapril,55.24,53.7,8.28,0.6,68.55
Naftisol,54.33,52.51,8.13,0.6,66.17
Placebo,54.03,52.29,7.82,0.58,61.17
Propriva,52.32,50.45,6.62,0.54,43.85
Ramicane,40.22,40.67,4.85,0.32,23.49
Stelasyn,54.23,52.43,7.71,0.57,59.45
Zoniferol,53.24,51.82,6.97,0.52,48.53


In [9]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# This method produces everything in a single groupby function


## Bar and Pie Charts

In [18]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pandas. 
tot_mice = clean.groupby("Drug Regimen")["Mouse ID"].count()
tot_mice = pd.DataFrame(tot_mice)
tot_mice.plot(kind="bar")
plt.title("Total Mice per Treatment")
plt.tight_layout()

<IPython.core.display.Javascript object>

In [19]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pyplot.
drug = list(tot_mice.index.values)

x_vals = np.arange(1,11,1)
y_vals = tot_mice["Mouse ID"]
plt.bar(x_vals, y_vals, width=0.5)
plt.xticks(x_vals, drug, rotation="vertical")
plt.xlabel("Regimen")
plt.title("Total Mice per Treatment")
plt.tight_layout()

<IPython.core.display.Javascript object>

In [141]:
# Generate a pie plot showing the distribution of female versus male mice using pandas
gender = clean["Sex"].value_counts()
gender.plot(kind="pie", y="Sex", autopct='%1.1f%%', title = "Female vs Male Mice", startangle = 90)
plt.ylabel(" ")
plt.axis("equal")
plt.tight_layout()

<IPython.core.display.Javascript object>

In [21]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
labels = ["Male", "Female"]

plt.pie(gender, labels=labels, autopct='%1.1f%%', startangle = 90)
plt.title("Female vs Male Mice")
plt.tight_layout()

<IPython.core.display.Javascript object>

## Quartiles, Outliers and Boxplots

In [15]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse

T_P = combine.groupby("Mouse ID")["Timepoint"].max()

T_Pdf = pd.DataFrame(T_P)
T_Pdf = T_Pdf.rename(columns={"Timepoint":"Max Timepoint"})

# Merge this group df with the original dataframe to get the tumor volume at the last timepoint

FTV_grpd = clean.merge(T_Pdf, on="Mouse ID")
final_tumour_vols = FTV_grpd[FTV_grpd["Timepoint"] == FTV_grpd["Max Timepoint"]]
final_tumour_vols


Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g),Max Timepoint
9,b128,45,38.982878,2,Capomulin,Female,9,22,45
10,f932,0,45.000000,0,Ketapril,Male,15,29,0
18,g107,35,71.905117,0,Ketapril,Female,2,29,35
21,a457,10,49.783419,0,Ketapril,Female,11,30,10
30,c819,40,62.175705,1,Ketapril,Male,21,25,40
...,...,...,...,...,...,...,...,...,...
1844,t565,45,34.455298,0,Capomulin,Female,20,17,45
1854,i557,45,47.685963,1,Capomulin,Female,1,24,45
1864,m957,45,33.329098,1,Capomulin,Female,3,19,45
1869,f966,20,30.485985,0,Capomulin,Male,16,17,20


In [186]:
# Put treatments into a list for for loop (and later for plot labels)

treatment = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

# Create empty list to fill with tumor vol data (for plotting)

tum_col = []

# Calculate the IQR and quantitatively determine if there are any potential outliers. 
  # Locate the rows which contain mice on each drug and get the tumor volumes
       
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    
for drug in treatment:
    max_tum = final_tumour_vols[final_tumour_vols["Drug Regimen"]== drug]["Tumor Volume (mm3)"]
    max_tum = max_tum.rename(drug)
    quartiles = max_tum.quantile([0.25, 0.5, 0.75])
    lower_q = quartiles[0.25]
    upper_q = quartiles[0.75]
    IQR = upper_q-lower_q 
    lower_b = lower_q - (1.5*IQR)
    upper_b = upper_q + (1.5*IQR)
    
    outliers = max_tum.loc[(max_tum > upper_b) | (max_tum < lower_b)]
    outliers["Outlier Value = "] = outliers.tolist()
    print(outliers)
    
    tum_col.append(max_tum)

Outlier Value =     []
Name: Capomulin, dtype: object
Outlier Value =     []
Name: Ramicane, dtype: object
214                              36.3213
Outlier Value =     [36.321345799999996]
Name: Infubinol, dtype: object
Outlier Value =     []
Name: Ceftamin, dtype: object


In [171]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest
tum_df = pd.DataFrame(tum_col)
tum_df = tum_df.T
fig1, ax1 = plt.subplots()
ax1.set_title("Final Tumour Volume Across Drug Regimens")
ax1.set_ylabel("Tumour Volume")
tum_df.boxplot()
plt.show()

<IPython.core.display.Javascript object>

## Line and Scatter Plots

In [138]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin
capo = clean[(clean == 'g316').any(axis=1)]

capo.plot.line(x="Timepoint", y="Tumor Volume (mm3)", title="Tumour Size Change for Mouse g316")
xtic = np.arange(0, 50, 5)
plt.grid()
plt.ylabel("Tumor Volume (mm3)")
plt.xticks(xtic)
plt.show()

<IPython.core.display.Javascript object>

In [139]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen
capo_weight = clean.groupby(["Drug Regimen", "Mouse ID"]).mean()
capo_weight = capo_weight.loc["Capomulin"]
capo_weight = capo_weight.rename(columns={"Tumor Volume (mm3)":"Average Tumor Vol"})

capo_weight.plot(kind="scatter", x="Weight (g)", y="Average Tumor Vol", grid=True, figsize=(8,8), title="Mouse Weight vs Average Tumor Volume")
plt.show()

<IPython.core.display.Javascript object>

## Correlation and Regression

In [142]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen

mouse_weight = capo_weight.iloc[:, 4]
mouse_weight = mouse_weight.sort_values(ascending=False)
avg_vol = capo_weight.iloc[:, 1]
avg_vol = avg_vol.sort_values(ascending=False)


slope, intercept, rvalue, pvalue, std_err = st.linregress(mouse_weight, avg_vol)

fit = slope * mouse_weight + intercept
line_eq = "y =" + str(round(slope, 2)) + "x +" + str(round(intercept, 2))
r_value = "R value = " + str(round(rvalue, 2))
plt.scatter(mouse_weight, avg_vol )
plt.plot(mouse_weight, fit, "r-")
plt.xlabel("Weight (G)")
plt.ylabel("Average Tumour Volume")
plt.title("Mouse Weight vs Average Tumor Volume")
plt.annotate(line_eq, (20 ,36), fontsize=10, color='red')
plt.annotate(r_value, (20, 38), fontsize=10, color='blue')
plt.show()


<IPython.core.display.Javascript object>