## Observations and Insights 

In [12]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
from scipy.stats import sem

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset
combined_df = pd.merge(mouse_metadata,study_results,on="Mouse ID")

# Display the data table for preview
combined_df.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [13]:
# Checking the number of mice.
len(combined_df['Mouse ID'])

1893

In [14]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
duplicateRows_DF = combined_df[combined_df.duplicated()]
duplicateRows_DF['Mouse ID']

909    g989
Name: Mouse ID, dtype: object

In [15]:
# Optional: Get all the data for the duplicate mouse ID. 
duplicateRows_DF = combined_df[combined_df.duplicated()]
duplicateRows_DF

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
909,g989,Propriva,Female,21,26,0,45.0,0


In [16]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
clean_combined_df = combined_df.drop_duplicates(keep='first',inplace=False)


In [17]:
# Checking the number of mice in the clean DataFrame.
len(clean_combined_df['Mouse ID'])

1892

## Summary Statistics

In [20]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
drug_regimen_group = clean_combined_df.groupby('Drug Regimen')
mean_drugs = drug_regimen_group['Tumor Volume (mm3)'].mean()

drug_list = ['Capomulin', 'Ceftamin', 'Infubinol', 'Ketapril', 'Naftisol', 'Placebo', 'Propriva', 'Ramicane', 'Stelasyn', 'Zoniferol']

# All data for each drug regimen
d1 = clean_combined_df[clean_combined_df['Drug Regimen'].str.contains("Capomulin")]
d2 = clean_combined_df[clean_combined_df['Drug Regimen'].str.contains("Ceftamin")]
d3 = clean_combined_df[clean_combined_df['Drug Regimen'].str.contains("Infubinol")]
d4 = clean_combined_df[clean_combined_df['Drug Regimen'].str.contains("Ketapril")]
d5 = clean_combined_df[clean_combined_df['Drug Regimen'].str.contains("Naftisol")]
d6 = clean_combined_df[clean_combined_df['Drug Regimen'].str.contains("Placebo")]
d7 = clean_combined_df[clean_combined_df['Drug Regimen'].str.contains("Propriva")]
d8 = clean_combined_df[clean_combined_df['Drug Regimen'].str.contains("Ramicane")]
d9 = clean_combined_df[clean_combined_df['Drug Regimen'].str.contains("Stelasyn")]
d10 = clean_combined_df[clean_combined_df['Drug Regimen'].str.contains("Zoniferol")]

# All tumor volumes for each drug regimen
d1_vol = d1['Tumor Volume (mm3)']
d2_vol = d2['Tumor Volume (mm3)']
d3_vol = d3['Tumor Volume (mm3)']
d4_vol = d4['Tumor Volume (mm3)']
d5_vol = d5['Tumor Volume (mm3)']
d6_vol = d6['Tumor Volume (mm3)']
d7_vol = d7['Tumor Volume (mm3)']
d8_vol = d8['Tumor Volume (mm3)']
d9_vol = d9['Tumor Volume (mm3)']
d10_vol = d10['Tumor Volume (mm3)']

# All mean tumor volumes for each drug regimen
mean_d1_vol = np.mean(d1_vol)
mean_d2_vol = np.mean(d2_vol)
mean_d3_vol = np.mean(d3_vol)
mean_d4_vol = np.mean(d4_vol)
mean_d5_vol = np.mean(d5_vol)
mean_d6_vol = np.mean(d6_vol)
mean_d7_vol = np.mean(d7_vol)
mean_d8_vol = np.mean(d8_vol)
mean_d9_vol = np.mean(d9_vol)
mean_d10_vol = np.mean(d10_vol)

# All median tumor volumes for each drug regimen
median_d1_vol = np.median(d1_vol)
median_d2_vol = np.median(d2_vol)
median_d3_vol = np.median(d3_vol)
median_d4_vol = np.median(d4_vol)
median_d5_vol = np.median(d5_vol)
median_d6_vol = np.median(d6_vol)
median_d7_vol = np.median(d7_vol)
median_d8_vol = np.median(d8_vol)
median_d9_vol = np.median(d9_vol)
median_d10_vol = np.median(d10_vol)

# All mode tumor volumes for each drug regimen
mode_d1_vol = st.mode(d1_vol)
mode_d2_vol = st.mode(d2_vol)
mode_d3_vol = st.mode(d3_vol)
mode_d4_vol = st.mode(d4_vol)
mode_d5_vol = st.mode(d5_vol)
mode_d6_vol = st.mode(d6_vol)
mode_d7_vol = st.mode(d7_vol)
mode_d8_vol = st.mode(d8_vol)
mode_d9_vol = st.mode(d9_vol)
mode_d10_vol = st.mode(d10_vol)

# All variance tumor volumes for each drug regimen
var_d1_vol = np.var(d1_vol,ddof = 0)
var_d2_vol = np.var(d2_vol,ddof = 0)
var_d3_vol = np.var(d3_vol,ddof = 0)
var_d4_vol = np.var(d4_vol,ddof = 0)
var_d5_vol = np.var(d5_vol,ddof = 0)
var_d6_vol = np.var(d6_vol,ddof = 0)
var_d7_vol = np.var(d7_vol,ddof = 0)
var_d8_vol = np.var(d8_vol,ddof = 0)
var_d9_vol = np.var(d9_vol,ddof = 0)
var_d10_vol = np.var(d10_vol,ddof = 0)

# All standard deviations tumor volumes for each drug regimen
std_d1_vol = np.std(d1_vol,ddof = 0)
std_d2_vol = np.std(d2_vol,ddof = 0)
std_d3_vol = np.std(d3_vol,ddof = 0)
std_d4_vol = np.std(d4_vol,ddof = 0)
std_d5_vol = np.std(d5_vol,ddof = 0)
std_d6_vol = np.std(d6_vol,ddof = 0)
std_d7_vol = np.std(d7_vol,ddof = 0)
std_d8_vol = np.std(d8_vol,ddof = 0)
std_d9_vol = np.std(d9_vol,ddof = 0)
std_d10_vol = np.std(d10_vol,ddof = 0)

# All SEM tumor volumes for each drug regimen
sem_d1_vol = sem(d1_vol)
sem_d2_vol = sem(d2_vol)
sem_d3_vol = sem(d3_vol)
sem_d4_vol = sem(d4_vol)
sem_d5_vol = sem(d5_vol)
sem_d6_vol = sem(d6_vol)
sem_d7_vol = sem(d7_vol)
sem_d8_vol = sem(d8_vol)
sem_d9_vol = sem(d9_vol)
sem_d10_vol = sem(d10_vol)


drug_regimen_group['Tumor Volume (mm3)'].mean()
mean_drugs
# This method is the most straighforward, creating multiple series and putting them all together at the end.
summary_statistics = pd.DataFrame({'Drug Regimen': drug_list,
                                   'Mean': [mean_d1_vol, mean_d2_vol, mean_d3_vol, mean_d4_vol, mean_d5_vol, mean_d6_vol, mean_d7_vol, mean_d8_vol, mean_d9_vol, mean_d10_vol],
                                   'Median': [median_d1_vol, median_d2_vol, median_d3_vol, median_d4_vol, median_d5_vol, median_d6_vol, median_d7_vol, median_d8_vol, median_d9_vol, median_d10_vol],
                                   'Mode': [mode_d1_vol, mode_d2_vol, mode_d3_vol, mode_d4_vol, mode_d5_vol, mode_d6_vol, mode_d7_vol, mode_d8_vol, mode_d9_vol, mode_d10_vol],
                                   'Variance': [var_d1_vol, var_d2_vol, var_d3_vol, var_d4_vol, var_d5_vol, var_d6_vol, var_d7_vol, var_d8_vol, var_d9_vol, var_d10_vol],
                                   'Standard Deviation': [std_d1_vol, std_d2_vol, std_d3_vol, std_d4_vol, std_d5_vol, std_d6_vol, std_d7_vol, std_d8_vol, std_d9_vol, std_d10_vol],
                                   'SEM': [sem_d1_vol, sem_d2_vol, sem_d3_vol, sem_d4_vol, sem_d5_vol, sem_d6_vol, sem_d7_vol, sem_d8_vol, sem_d9_vol, sem_d10_vol]})
summary_statistics.style.format({'Mean':'{0:,.2f}','Median':'{0:,.2f}','Variance':'{0:,.2f}','Standard Deviation':'{0:,.2f}'})

Unnamed: 0,Drug Regimen,Mean,Median,Mode,Variance,Standard Deviation,SEM
0,Capomulin,40.68,41.56,"ModeResult(mode=array([45.]), count=array([25]))",24.84,4.98,0.329346
1,Ceftamin,52.59,51.78,"ModeResult(mode=array([45.]), count=array([25]))",39.07,6.25,0.469821
2,Infubinol,52.88,51.82,"ModeResult(mode=array([45.]), count=array([25]))",42.89,6.55,0.492236
3,Ketapril,55.24,53.7,"ModeResult(mode=array([45.]), count=array([25]))",68.19,8.26,0.60386
4,Naftisol,54.33,52.51,"ModeResult(mode=array([45.]), count=array([25]))",65.82,8.11,0.596466
5,Placebo,54.03,52.29,"ModeResult(mode=array([45.]), count=array([25]))",60.83,7.8,0.581331
6,Propriva,52.37,50.91,"ModeResult(mode=array([45.]), count=array([25]))",42.01,6.48,0.514041
7,Ramicane,40.22,40.67,"ModeResult(mode=array([45.]), count=array([25]))",23.38,4.84,0.320955
8,Stelasyn,54.23,52.43,"ModeResult(mode=array([45.]), count=array([24]))",59.12,7.69,0.573111
9,Zoniferol,53.24,51.82,"ModeResult(mode=array([45.]), count=array([25]))",48.27,6.95,0.516398


In [25]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
drug_regimen_group = clean_combined_df.groupby('Drug Regimen')
drug_regimen_group['Tumor Volume (mm3)'].describe()


# This method produces everything in a single groupby function


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Capomulin,230.0,40.675741,4.994774,23.343598,37.685933,41.557809,45.0,48.158209
Ceftamin,178.0,52.591172,6.268188,45.0,47.208427,51.776157,56.801438,68.923185
Infubinol,178.0,52.884795,6.567243,36.321346,47.312353,51.820584,57.314444,72.226731
Ketapril,188.0,55.235638,8.279709,45.0,48.232987,53.698743,60.870951,78.567014
Naftisol,186.0,54.331565,8.134708,45.0,47.285874,52.509285,59.963034,76.668817
Placebo,181.0,54.033581,7.821003,45.0,47.459053,52.288934,59.916934,73.212939
Propriva,160.0,52.368318,6.50216,45.0,47.107256,50.909965,56.259803,72.455421
Ramicane,228.0,40.216745,4.846308,22.050126,36.674635,40.673236,45.0,47.622816
Stelasyn,181.0,54.233149,7.710419,45.0,48.047139,52.431737,58.719297,75.12369
Zoniferol,182.0,53.236507,6.966589,45.0,47.337876,51.818479,57.954259,73.324432


## Bar and Pie Charts

In [None]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pandas. 



In [None]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pyplot.



In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas



In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot



## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [None]:
# Put treatments into a list for for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [None]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin


In [None]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen


## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
