## Observations and Insights 

In [31]:
# Dependencies and Setup
%matplotlib notebook
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset

complete_data=pd.merge(mouse_metadata, study_results, how= "left", on= ["Mouse ID", "Mouse ID"])



# Display the data table for preview
complete_data.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [32]:
# Checking the number of mice.
complete_data["Mouse ID"].count()

1893

In [33]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 

complete_data.groupby(["Mouse ID", "Timepoint"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Drug Regimen,Sex,Age_months,Weight (g),Tumor Volume (mm3),Metastatic Sites
Mouse ID,Timepoint,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
a203,0,1,1,1,1,1,1
a203,5,1,1,1,1,1,1
a203,10,1,1,1,1,1,1
a203,15,1,1,1,1,1,1
a203,20,1,1,1,1,1,1
...,...,...,...,...,...,...,...
z969,25,1,1,1,1,1,1
z969,30,1,1,1,1,1,1
z969,35,1,1,1,1,1,1
z969,40,1,1,1,1,1,1


In [34]:
# Optional: Get all the data for the duplicate mouse ID. 

len(complete_data["Mouse ID"].unique())

249

In [35]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
clean_data= complete_data.drop_duplicates(["Mouse ID", "Timepoint"])
clean_data.tail(20)

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
1873,z795,Naftisol,Female,13,29,0,45.0,0
1874,z795,Naftisol,Female,13,29,5,46.833475,0
1875,z795,Naftisol,Female,13,29,10,48.030804,0
1876,z795,Naftisol,Female,13,29,15,49.368132,1
1877,z795,Naftisol,Female,13,29,20,53.280657,1
1878,z795,Naftisol,Female,13,29,25,57.9471,2
1879,z795,Naftisol,Female,13,29,30,59.789636,2
1880,z795,Naftisol,Female,13,29,35,60.435654,2
1881,z795,Naftisol,Female,13,29,40,62.376639,3
1882,z795,Naftisol,Female,13,29,45,65.74107,3


In [36]:
# Checking the number of mice in the clean DataFrame.
mice= len(clean_data["Mouse ID"]. unique())
mice


249

## Summary Statistics

In [37]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
mean= clean_data.groupby(["Drug Regimen"]).mean()["Tumor Volume (mm3)"]
mean

median= clean_data.groupby(["Drug Regimen"]).median()["Tumor Volume (mm3)"]
median

variance= clean_data.groupby(["Drug Regimen"]).var()["Tumor Volume (mm3)"]
variance

std= clean_data.groupby(["Drug Regimen"]).std()["Tumor Volume (mm3)"]
std

sem= clean_data.groupby(["Drug Regimen"]).sem()["Tumor Volume (mm3)"]
sem

drug_df= pd.DataFrame({"Mean": mean,
                      "Median": median,
                       "Variance": variance,
                       "Standard Deviation": std,
                       "SEM": sem
                      })
drug_df
# This method is the most straighforward, creating multiple series and putting them all together at the end.



Unnamed: 0_level_0,Mean,Median,Variance,Standard Deviation,SEM
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,40.675741,41.557809,24.947764,4.994774,0.329346
Ceftamin,52.591172,51.776157,39.290177,6.268188,0.469821
Infubinol,52.884795,51.820584,43.128684,6.567243,0.492236
Ketapril,55.235638,53.698743,68.553577,8.279709,0.60386
Naftisol,54.331565,52.509285,66.173479,8.134708,0.596466
Placebo,54.033581,52.288934,61.168083,7.821003,0.581331
Propriva,52.393463,50.909965,43.138803,6.568014,0.525862
Ramicane,40.216745,40.673236,23.486704,4.846308,0.320955
Stelasyn,54.233149,52.431737,59.450562,7.710419,0.573111
Zoniferol,53.236507,51.818479,48.533355,6.966589,0.516398


In [38]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# This method produces everything in a single groupby function


## Bar and Pie Charts

In [39]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pandas. 
plot_data= clean_data.groupby(["Drug Regimen"]).count()["Mouse ID"]

plot_data

plot_data.plot(kind="bar", facecolor="blue")

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x1dd9ddc2430>

In [42]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pyplot.
x_axis = np.arange(len(drug_df))
tick_locations = [value+0.4 for value in x_axis]

bar=plt.bar(x_axis, tick_locations,color="blue", align= "center")

bar

<BarContainer object of 10 artists>

In [14]:
# Generate a pie plot showing the distribution of female versus male mice using pandas

m_f= clean_data.groupby(["Sex"])

m_f.plot(kind="pie", y= m_f, figsize=(5,5))

ValueError: cannot copy sequence with size 930 to array axis with dimension 8

In [52]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
count_mf=clean_data.groupby(["Sex"]).count()
count_mf

labels=["Male", "Female"]
count=[958, 930]
colors=["pink", 'blue']
explode= (0.1,0)

plt.pie(count, explode=explode, labels=labels, colors=colors,
        autopct="%1.1f%%", shadow=True, startangle=140)
plt.axis("equal")
plt.show()

## Quartiles, Outliers and Boxplots

In [53]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin



# Start by getting the last (greatest) timepoint for each mouse
each_mouse= np.max(clean_data)["Timepoint"]

each_mouse

# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


45

In [14]:
# Put treatments into a list for for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [15]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [16]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin


In [17]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen


## Correlation and Regression

In [18]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
