## Observations and Insights 

In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Study data files (paths are relative to the notebook's working directory)
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset.
# The join key is named explicitly instead of letting pandas join on every
# column the two frames happen to share; how='outer' keeps mice that appear
# in only one of the two files.
# NOTE(review): assumes "Mouse ID" is the only column common to both files
# (the merged frame shows no _x/_y suffixed columns) -- confirm.
masterDf = pd.merge(left=mouse_metadata, right=study_results, how='outer', on="Mouse ID")
print(masterDf.head())
print(f"{masterDf.shape}")

  Mouse ID Drug Regimen   Sex  Age_months  Weight (g)  Timepoint  \
0     k403     Ramicane  Male          21          16          0   
1     k403     Ramicane  Male          21          16          5   
2     k403     Ramicane  Male          21          16         10   
3     k403     Ramicane  Male          21          16         15   
4     k403     Ramicane  Male          21          16         20   

   Tumor Volume (mm3)  Metastatic Sites  
0           45.000000                 0  
1           38.825898                 0  
2           35.014271                 1  
3           34.223992                 1  
4           32.997729                 1  
(1893, 8)


In [3]:
# Rows whose (Mouse ID, Timepoint) pair already appeared earlier in the frame;
# keep='first' leaves the first occurrence of each pair unflagged.
duplicateRowsDf = masterDf.loc[
    masterDf.duplicated(subset=["Mouse ID", "Timepoint"], keep='first')
]
print(duplicateRowsDf.shape)

(5, 8)


In [33]:
# check if there is any missing/null value -- asserted so the result of the
# check cannot be silently ignored (a bare expression in the middle of a cell
# is neither displayed nor stored)
assert not masterDf.isnull().values.any(), "masterDf contains missing/null values"

# number of unique mice (counted before the duplicate rows are dropped)
nMice = masterDf["Mouse ID"].nunique()

# drop duplicates: keep only the first row of every (Mouse ID, Timepoint) pair
masterDf = masterDf.drop_duplicates(subset=["Mouse ID", "Timepoint"], keep='first')

#1893 rows - 5 duplicate rows = 1888 rows (matches!)

# last expression of the cell: displayed as the cell output
nMice

249

## Summary Statistics

In [22]:
# unique entries for gender
gender = masterDf["Sex"].unique()

# add two 0/1 indicator columns so that a groupby "sum" later gives the number
# of male/female rows per group; the comparison is vectorized instead of
# calling a Python lambda once per row (same values, int64 dtype)
masterDf["Male"] = masterDf["Sex"].eq("Male").astype("int64")
masterDf["Female"] = masterDf["Sex"].eq("Female").astype("int64")
masterDf.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites,Male,Female
0,k403,Ramicane,Male,21,16,0,45.0,0,1,0
1,k403,Ramicane,Male,21,16,5,38.825898,0,1,0
2,k403,Ramicane,Male,21,16,10,35.014271,1,1,0
3,k403,Ramicane,Male,21,16,15,34.223992,1,1,0
4,k403,Ramicane,Male,21,16,20,32.997729,1,1,0


In [24]:
# Summary statistics of the tumor volume (mean, median, variance, standard
# deviation, SEM) for every (Drug Regimen, Timepoint) group.
# The "Mouse ID" count and the Male/Female indicator sums are included to track
# how many mice (and of which sex) are recorded at each timepoint.
summaryStatsByTimePoint = masterDf.groupby(["Drug Regimen", "Timepoint"]).agg(
    {
        "Tumor Volume (mm3)": ['mean', 'median', 'var', 'std', 'sem'],
        "Mouse ID": 'count',
        "Male": 'sum',
        "Female": 'sum',
    }
)
summaryStatsByTimePoint.head()


Unnamed: 0_level_0,Unnamed: 1_level_0,Tumor Volume (mm3),Tumor Volume (mm3),Tumor Volume (mm3),Tumor Volume (mm3),Tumor Volume (mm3),Mouse ID,Male,Female
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,median,var,std,sem,count,sum,sum
Drug Regimen,Timepoint,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
Capomulin,0,45.000000,45.000000,0.000000,0.000000,0.000000,25,12,13
Capomulin,5,44.266086,45.597064,5.030889,2.242964,0.448593,25,12,13
Capomulin,10,43.084291,43.421014,12.344133,3.513422,0.702684,25,12,13
Capomulin,15,42.064317,42.798160,16.878693,4.108369,0.838617,24,11,13
Capomulin,20,40.716325,40.716428,19.035028,4.362915,0.909731,23,10,13
...,...,...,...,...,...,...,...,...,...
Zoniferol,25,55.432935,55.676604,5.808348,2.410052,0.602513,16,6,10
Zoniferol,30,57.713531,57.419744,9.601024,3.098552,0.800043,15,5,10
Zoniferol,35,60.089372,60.365832,10.876760,3.297993,0.881426,14,4,10
Zoniferol,40,62.916692,62.274543,13.958456,3.736102,0.998515,14,4,10


In [27]:
# Summary statistics of the tumor volume (mean, median, variance, standard
# deviation, SEM) for each drug regimen, pooled over all timepoints.
summaryStats = masterDf.groupby(["Drug Regimen"]).agg(
    {"Tumor Volume (mm3)": ['mean', 'median', 'var', 'std', 'sem']}
)
summaryStats


Unnamed: 0_level_0,Tumor Volume (mm3),Tumor Volume (mm3),Tumor Volume (mm3),Tumor Volume (mm3),Tumor Volume (mm3)
Unnamed: 0_level_1,mean,median,var,std,sem
Drug Regimen,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Capomulin,40.675741,41.557809,24.947764,4.994774,0.329346
Ceftamin,52.591172,51.776157,39.290177,6.268188,0.469821
Infubinol,52.884795,51.820584,43.128684,6.567243,0.492236
Ketapril,55.235638,53.698743,68.553577,8.279709,0.60386
Naftisol,54.331565,52.509285,66.173479,8.134708,0.596466
Placebo,54.033581,52.288934,61.168083,7.821003,0.581331
Propriva,52.393463,50.909965,43.138803,6.568014,0.525862
Ramicane,40.216745,40.673236,23.486704,4.846308,0.320955
Stelasyn,54.233149,52.431737,59.450562,7.710419,0.573111
Zoniferol,53.236507,51.818479,48.533355,6.966589,0.516398


## Bar and Pie Charts

In [None]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pandas. 



In [None]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pyplot.



In [34]:
# Count mice per sex (the data behind the female-versus-male pie plot, pandas version).
# Each distinct (Mouse ID, Sex) pair is kept once, so repeated rows for the
# same mouse are not counted again.
genderDf = masterDf.loc[:, ["Mouse ID", "Sex"]].drop_duplicates(keep='first')
genderStats = genderDf.groupby(["Sex"]).count()
genderStats

Unnamed: 0_level_0,Mouse ID
Sex,Unnamed: 1_level_1
Female,124
Male,125


In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot



## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [None]:
# Put treatments into a list for the for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [None]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin


In [None]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen


## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
