## Observations and Insights 

In [42]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as sts
import numpy as np

In [43]:
# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
mouse_metadata.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g)
0,k403,Ramicane,Male,21,16
1,s185,Capomulin,Female,3,17
2,x401,Capomulin,Female,16,15
3,m601,Capomulin,Male,22,17
4,g791,Ramicane,Male,11,16


In [44]:
study_results = pd.read_csv(study_results_path)
study_results.head()

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites
0,b128,0,45.0,0
1,f932,0,45.0,0
2,g107,0,45.0,0
3,a457,0,45.0,0
4,c819,0,45.0,0


In [45]:
# Combine the data into a single dataset
combined_df = study_results.merge(mouse_metadata, on = "Mouse ID")
combined_df.head()
# Display the data table for preview

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,b128,0,45.0,0,Capomulin,Female,9,22
1,b128,5,45.651331,0,Capomulin,Female,9,22
2,b128,10,43.270852,0,Capomulin,Female,9,22
3,b128,15,43.784893,0,Capomulin,Female,9,22
4,b128,20,42.731552,0,Capomulin,Female,9,22


In [46]:
# Checking the number of mice.
number_mice = combined_df["Mouse ID"].nunique()
number_mice

249

In [47]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
duplicated_mouse_df=combined_df[combined_df.duplicated(["Mouse ID", "Timepoint"])]
duplicated_mouse_df


Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
861,g989,0,45.0,0,Propriva,Female,21,26
863,g989,5,47.570392,0,Propriva,Female,21,26
865,g989,10,49.880528,0,Propriva,Female,21,26
867,g989,15,53.44202,0,Propriva,Female,21,26
869,g989,20,54.65765,1,Propriva,Female,21,26


In [48]:
# Optional: Get all the data for the duplicate mouse ID. 
#This step returns an array of ALL duplicated mouse data. Although our data only has 1 duplicate, if there were more this array would be longer
duplicated_mice = duplicated_mouse_df["Mouse ID"].unique()
duplicated_mice


array(['g989'], dtype=object)

In [49]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
#The for loop only goes 1 time in our case, but in other datasets this could run multiple times
clean_df=combined_df
for mouseid in duplicated_mice:
    clean_df=clean_df[clean_df["Mouse ID"]!= mouseid]

In [50]:
# Checking the number of mice in the clean DataFrame.
clean_df["Mouse ID"].nunique()

248

## Summary Statistics

In [72]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
regimen_group=clean_df.groupby("Drug Regimen")
regimens=regimen_df["Tumor Volume (mm3)"]
means=regimens.mean()
meds=regimens.median()
variance = regimens.var()
stdevs = regimens.std()
sems = regimens.sem()
# Use groupby and summary statistical methods to calculate the following properties of each drug regimen: 
# mean, median, variance, standard deviation, and SEM of the tumor volume. 
# Assemble the resulting series into a single summary dataframe.
summary_df = pd.DataFrame({"Mean": means,
              "Median": meds,
              "Variance": variance,
              "Standard Deviation": stdevs,
              "SEM": sems})
summary_df


Unnamed: 0_level_0,Mean,Median,Variance,Standard Deviation,SEM
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,40.675741,41.557809,24.947764,4.994774,0.329346
Ceftamin,52.591172,51.776157,39.290177,6.268188,0.469821
Infubinol,52.884795,51.820584,43.128684,6.567243,0.492236
Ketapril,55.235638,53.698743,68.553577,8.279709,0.60386
Naftisol,54.331565,52.509285,66.173479,8.134708,0.596466
Placebo,54.033581,52.288934,61.168083,7.821003,0.581331
Propriva,52.32093,50.446266,43.852013,6.622085,0.544332
Ramicane,40.216745,40.673236,23.486704,4.846308,0.320955
Stelasyn,54.233149,52.431737,59.450562,7.710419,0.573111
Zoniferol,53.236507,51.818479,48.533355,6.966589,0.516398


In [73]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Using the aggregation method, produce the same summary statistics in a single line
other_sum_df = clean_df["Tumor Volume "].groupby("Drug Regimen").agg(['min','median', 'var','std','sem'])
other_sum_df

Unnamed: 0_level_0,Timepoint,Timepoint,Timepoint,Timepoint,Timepoint,Tumor Volume (mm3),Tumor Volume (mm3),Tumor Volume (mm3),Tumor Volume (mm3),Tumor Volume (mm3),...,Age_months,Age_months,Age_months,Age_months,Age_months,Weight (g),Weight (g),Weight (g),Weight (g),Weight (g)
Unnamed: 0_level_1,min,median,var,std,sem,min,median,var,std,sem,...,min,median,var,std,sem,min,median,var,std,sem
Drug Regimen,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Capomulin,0,20,206.928043,14.384994,0.948518,23.343598,41.557809,24.947764,4.994774,0.329346,...,1,16.5,59.620372,7.721423,0.509136,15,20.5,7.466034,2.732404,0.180169
Ceftamin,0,20,204.031772,14.283969,1.070629,45.0,51.776157,39.290177,6.268188,0.469821,...,2,12.0,65.147591,8.071406,0.604977,25,28.0,2.501016,1.58146,0.118535
Infubinol,0,15,181.53447,13.473473,1.009879,36.321346,51.820584,43.128684,6.567243,0.492236,...,1,20.0,56.404272,7.510278,0.562919,23,27.0,4.769028,2.18381,0.163684
Ketapril,0,20,196.839089,14.029935,1.023238,45.0,53.698743,68.553577,8.279709,0.60386,...,1,18.0,36.236432,6.01967,0.43903,25,28.0,3.392536,1.841884,0.134333
Naftisol,0,20,201.208951,14.184814,1.040081,45.0,52.509285,66.173479,8.134708,0.596466,...,2,9.0,45.102703,6.715855,0.49243,25,27.0,2.247748,1.499249,0.10993
Placebo,0,15,192.954266,13.890798,1.032495,45.0,52.288934,61.168083,7.821003,0.581331,...,1,10.0,40.384837,6.354907,0.472356,25,28.0,3.378146,1.837973,0.136615
Propriva,0,15,187.050699,13.676648,1.124214,45.0,50.446266,43.852013,6.622085,0.544332,...,1,7.5,48.251655,6.946341,0.570986,25,26.0,2.933995,1.712891,0.140799
Ramicane,0,20,203.796178,14.27572,0.945433,22.050126,40.673236,23.486704,4.846308,0.320955,...,1,9.0,35.362393,5.946629,0.393825,16,19.0,10.465318,3.235014,0.214244
Stelasyn,0,20,191.620626,13.84271,1.028921,45.0,52.431737,59.450562,7.710419,0.573111,...,1,14.0,63.036648,7.939562,0.590143,25,28.0,2.701473,1.643616,0.122169
Zoniferol,0,15,206.918979,14.384679,1.066263,45.0,51.818479,48.533355,6.966589,0.516398,...,2,12.5,33.479115,5.786114,0.428895,25,28.0,2.0153,1.419612,0.105229


## Bar and Pie Charts

In [9]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pandas.



In [10]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pyplot.



In [11]:
# Generate a pie plot showing the distribution of female versus male mice using pandas



In [12]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot



## Quartiles, Outliers and Boxplots

In [13]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [14]:
# Put treatments into a list for for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [15]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [16]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin


In [17]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen


## Correlation and Regression

In [18]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
