## Observations and Insights 

In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset
merged_df = pd.merge(study_results, mouse_metadata, how='left', on='Mouse ID')
merged_df.head()

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,b128,0,45.0,0,Capomulin,Female,9,22
1,f932,0,45.0,0,Ketapril,Male,15,29
2,g107,0,45.0,0,Ketapril,Female,2,29
3,a457,0,45.0,0,Ketapril,Female,11,30
4,c819,0,45.0,0,Ketapril,Male,21,25


In [2]:
# Checking the number of mice in the DataFrame.
len(merged_df['Mouse ID'].unique())

249

In [3]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
duplicated_mouse_df = merged_df.loc[merged_df.duplicated(subset=["Mouse ID","Timepoint"]), "Mouse ID"].unique()
duplicated_mouse_df

array(['g989'], dtype=object)

In [4]:
# Optional: Get all the data for the duplicate mouse ID. 
duplicated_mouse = merged_df.loc[merged_df["Mouse ID"] == "g989"]
duplicated_mouse

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
107,g989,0,45.0,0,Propriva,Female,21,26
137,g989,0,45.0,0,Propriva,Female,21,26
329,g989,5,48.786801,0,Propriva,Female,21,26
360,g989,5,47.570392,0,Propriva,Female,21,26
620,g989,10,51.745156,0,Propriva,Female,21,26
681,g989,10,49.880528,0,Propriva,Female,21,26
815,g989,15,51.325852,1,Propriva,Female,21,26
869,g989,15,53.44202,0,Propriva,Female,21,26
950,g989,20,55.326122,1,Propriva,Female,21,26
1111,g989,20,54.65765,1,Propriva,Female,21,26


In [5]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
clean_df = merged_df[merged_df["Mouse ID"].isin(duplicated_mouse_df) == False]
clean_df.head()

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,b128,0,45.0,0,Capomulin,Female,9,22
1,f932,0,45.0,0,Ketapril,Male,15,29
2,g107,0,45.0,0,Ketapril,Female,2,29
3,a457,0,45.0,0,Ketapril,Female,11,30
4,c819,0,45.0,0,Ketapril,Male,21,25


In [6]:
# Checking the number of mice in the clean DataFrame.
len(clean_df["Mouse ID"].unique())

248

## Summary Statistics

In [7]:
# Generate a summary statistics table of mean, median, 
    #variance, standard deviation, and SEM 
    #of the tumor volume for each regimen

# This method is the most straightforward, 
    #creating multiple series and putting them all together at the end.

In [8]:
regimen_df = merged_df.set_index("Drug Regimen")
regimen_df

Unnamed: 0_level_0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Sex,Age_months,Weight (g)
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Capomulin,b128,0,45.000000,0,Female,9,22
Ketapril,f932,0,45.000000,0,Male,15,29
Ketapril,g107,0,45.000000,0,Female,2,29
Ketapril,a457,0,45.000000,0,Female,11,30
Ketapril,c819,0,45.000000,0,Male,21,25
...,...,...,...,...,...,...,...
Capomulin,r944,45,41.581521,2,Male,12,25
Capomulin,u364,45,31.023923,3,Male,18,17
Ceftamin,p438,45,61.433892,1,Female,11,26
Placebo,x773,45,58.634971,4,Female,21,30


In [9]:
regimen_df = merged_df.groupby(['Drug Regimen'])
regimen_df.mean()

Unnamed: 0_level_0,Timepoint,Tumor Volume (mm3),Metastatic Sites,Age_months,Weight (g)
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,21.565217,40.675741,0.713043,13.456522,19.965217
Ceftamin,19.747191,52.591172,1.179775,13.247191,27.398876
Infubinol,18.174157,52.884795,0.960674,16.230337,27.196629
Ketapril,19.707447,55.235638,1.297872,15.659574,27.861702
Naftisol,19.623656,54.331565,1.182796,12.0,27.166667
Placebo,18.674033,54.033581,1.441989,10.734807,27.928177
Propriva,16.863354,52.322552,0.975155,10.89441,27.043478
Ramicane,21.425439,40.216745,0.548246,10.684211,19.679825
Stelasyn,19.226519,54.233149,0.872928,12.78453,27.856354
Zoniferol,19.368132,53.236507,1.230769,12.598901,27.692308


In [10]:
tumor_df = regimen_df['Tumor Volume (mm3)']
volume_average = regimen_df['Tumor Volume (mm3)'].mean()
volume_average.head()

Drug Regimen
Capomulin    40.675741
Ceftamin     52.591172
Infubinol    52.884795
Ketapril     55.235638
Naftisol     54.331565
Name: Tumor Volume (mm3), dtype: float64

In [11]:
regimen_df = merged_df[["Drug Regimen", "Tumor Volume (mm3)"]]
average_volume = merged_df["Tumor Volume (mm3)"].mean()
median_volume = merged_df["Tumor Volume (mm3)"].median()
var_volume = merged_df["Tumor Volume (mm3)"].var()
std_volume = merged_df["Tumor Volume (mm3)"].std()
sem = merged_df["Tumor Volume (mm3)"].sem()
summary_df = pd.DataFrame({'Average Volume (mm3)': [average_volume],
                           'Median Volume (mm3)': [median_volume],
                           'Variance (mm3)': [var_volume],
                           'Standard Deviation (mm3)': [std_volume],
                           'Standard Error': [sem]})
summary_df

Unnamed: 0,Average Volume (mm3),Median Volume (mm3),Variance (mm3),Standard Deviation (mm3),Standard Error
0,50.448381,48.951474,79.116074,8.894722,0.204436


In [12]:
regimen_df["Tumor Volume (mm3)"].describe()

count    1893.000000
mean       50.448381
std         8.894722
min        22.050126
25%        45.000000
50%        48.951474
75%        56.292200
max        78.567014
Name: Tumor Volume (mm3), dtype: float64

In [13]:
grouped_regimen_df = merged_df.groupby(['Drug Regimen'])
describe_df = grouped_regimen_df["Tumor Volume (mm3)"].describe()

In [14]:

describe_df['Median'] = regimen_df["Tumor Volume (mm3)"].median()
describe_df['Standard Error'] = regimen_df["Tumor Volume (mm3)"].sem()
describe_df['Variance'] = regimen_df["Tumor Volume (mm3)"].var()

describe_df

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max,Median,Standard Error,Variance
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Capomulin,230.0,40.675741,4.994774,23.343598,37.685933,41.557809,45.0,48.158209,48.951474,0.204436,79.116074
Ceftamin,178.0,52.591172,6.268188,45.0,47.208427,51.776157,56.801438,68.923185,48.951474,0.204436,79.116074
Infubinol,178.0,52.884795,6.567243,36.321346,47.312353,51.820584,57.314444,72.226731,48.951474,0.204436,79.116074
Ketapril,188.0,55.235638,8.279709,45.0,48.232987,53.698743,60.870951,78.567014,48.951474,0.204436,79.116074
Naftisol,186.0,54.331565,8.134708,45.0,47.285874,52.509285,59.963034,76.668817,48.951474,0.204436,79.116074
Placebo,181.0,54.033581,7.821003,45.0,47.459053,52.288934,59.916934,73.212939,48.951474,0.204436,79.116074
Propriva,161.0,52.322552,6.50777,45.0,47.081086,50.854632,56.184664,72.455421,48.951474,0.204436,79.116074
Ramicane,228.0,40.216745,4.846308,22.050126,36.674635,40.673236,45.0,47.622816,48.951474,0.204436,79.116074
Stelasyn,181.0,54.233149,7.710419,45.0,48.047139,52.431737,58.719297,75.12369,48.951474,0.204436,79.116074
Zoniferol,182.0,53.236507,6.966589,45.0,47.337876,51.818479,57.954259,73.324432,48.951474,0.204436,79.116074


In [16]:
# summary1_df = pd.DataFrame({'Variance': [var],
#                             'Std. Error of Mean': [sem]})
# summary1_df
#                            'Median Volume (mm3)': [median_volume],
#                            'Variance (mm3)': [var_volume],
#                            'Standard Deviation (mm3)': [std_volume],
#                            'Standard Error': [sem]})

In [17]:
# grouped_test_df['Median'] = test_df["Tumor Volume (mm3)"].median()
# grouped_test_df['Standard Error'] = test_df["Tumor Volume (mm3)"].sem()
# grouped_test_df['Variance'] = test_df["Tumor Volume (mm3)"].var()

# grouped_test_df

In [None]:
#clean_df.sorted("Tumor Volume (mm3)", inplace = True)

In [None]:
# clean_df = clean_df[["Drug Regimen", "Tumor Volume (mm3)"]]
# clean_df = clean_df["Tumor Volume (mm3)"].mean()
# median_volume = clean_df["Tumor Volume (mm3)"].median()
# var_volume = clean_df["Tumor Volume (mm3)"].var()
# std_volume = clean_df["Tumor Volume (mm3)"].std()
# sem = clean_df["Tumor Volume (mm3)"].sem()
# summary1_df = pd.DataFrame({'Average Volume (mm3)': [average_volume],
#                            'Median Volume (mm3)': [median_volume],
#                            'Variance (mm3)': [var_volume],
#                            'Standard Deviation (mm3)': [std_volume],
#                            'Standard Error': [sem]})
# summary1_df

In [None]:
# Generate a summary statistics table of mean, 
    #median, variance, standard deviation, and SEM 
    #of the tumor volume for each regimen

# This method produces everything in a single groupby function.

## Bar Plots

In [None]:
# Generate a bar plot showing the number of mice per time point for each treatment throughout the course of the study using pandas.

In [None]:
# Generate a bar plot showing the number of mice per time point for each treatment throughout the course of the study using pyplot.

## Pie Plots

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot

## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the most promising treatment regimens. Calculate the IQR and quantitatively determine if there are any potential outliers. 


In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest

## Line and Scatter Plots

In [None]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin


In [None]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen



## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
