## Observations and Insights 

In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Input CSV locations, relative to the notebook's working directory
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load the mouse metadata and the study results into separate DataFrames
mouse_metadata = pd.read_csv(filepath_or_buffer=mouse_metadata_path)
study_results = pd.read_csv(filepath_or_buffer=study_results_path)

In [2]:
# Join the mouse metadata with the study results, matching rows on "Mouse ID".
# how="inner" is the pandas default and is spelled out here only for readability.
mouse_study_df = mouse_metadata.merge(study_results, how="inner", on="Mouse ID")

In [3]:
# Preview the first five rows of the merged table
mouse_study_df.head(n=5)

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [4]:
# Record the row count of the merged table before row-level deduplication
mouse_ttl_rows = mouse_study_df.shape[0]
mouse_ttl_rows

1893

In [None]:
# Optional: Get all the data for the duplicate mouse ID. 



In [5]:
# Build a cleaned DataFrame by removing repeated (Mouse ID, Timepoint) rows.
# keep="first" (the pandas default) retains the first occurrence of each pair and drops later repeats;
# no mouse is removed outright, only its repeated timepoint rows.
# NOTE(review): if the repeated rows carry different measurements, which one survives depends on
# row order — confirm that keeping the first occurrence is intended.
mouse_study_dedupe_df = mouse_study_df.drop_duplicates(
    subset=["Mouse ID", "Timepoint"],
    keep="first",
)
mouse_study_dedupe_df.head(n=5)

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [6]:
# Record the row count after row-level deduplication and report how many rows were dropped
mouse_dedupe_ttl_rows = mouse_study_dedupe_df.shape[0]
print(
    f"After deduplication, there are {mouse_dedupe_ttl_rows} rows in the dataset.",
    f"The deduplication removed {mouse_ttl_rows - mouse_dedupe_ttl_rows} rows from the original data set.",
    sep="\n",
)

After deduplication, there are 1888 rows in the dataset.
The deduplication removed 5 rows from the original data set.


In [7]:
# Count the distinct mice remaining in the cleaned DataFrame
mouse_study_dedupe_df.loc[:, "Mouse ID"].nunique()

249

In [8]:
# Identify the Drug Regimens present in the cleaned data
drugs_list = mouse_study_dedupe_df["Drug Regimen"].unique()   # distinct regimen names
drugs_num = mouse_study_dedupe_df["Drug Regimen"].nunique()   # number of distinct regimens
print(f"There are {drugs_num} Drug Regimens as part of this study, including:")
# Join the names explicitly; str() mirrors how print() would render each element
print(", ".join(map(str, drugs_list)))


There are 10 Drug Regimens as part of this study, including:
Ramicane, Capomulin, Infubinol, Placebo, Ceftamin, Stelasyn, Zoniferol, Ketapril, Propriva, Naftisol


In [9]:
# Examine data types for the cleaned DataFrame
# (the bare expression on the last line is what the notebook renders as the cell output)
mouse_study_dedupe_df.dtypes

Mouse ID               object
Drug Regimen           object
Sex                    object
Age_months              int64
Weight (g)              int64
Timepoint               int64
Tumor Volume (mm3)    float64
Metastatic Sites        int64
dtype: object

## Summary Statistics

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Use groupby and summary statistical methods to calculate the following properties of each drug regimen: 
# mean, median, variance, standard deviation, and SEM of the tumor volume.


In [10]:
# Calculate the average Tumor Volume by Drug Regimen.
# Method: average of averages.
#   1. Group by (Drug Regimen, Mouse ID) and average each mouse's tumor volume over its rows.
#   2. Group those per-mouse averages by Drug Regimen and average them again.
# Each mouse therefore carries equal weight within its regimen, however many rows it contributes.
# The same two-level grouping is used for the subsequent calculations: median, variance,
# standard deviation, and SEM.
#
# The tumor-volume column is selected BEFORE the first aggregation. Aggregating the whole frame
# would also try to average non-numeric columns such as "Sex", which raises TypeError on
# pandas >= 2.0 (where numeric_only defaults to False).
avg_tumor_volume_by_drug = (
    mouse_study_dedupe_df
    .groupby(["Drug Regimen", "Mouse ID"])["Tumor Volume (mm3)"]
    .mean()
    .groupby(level="Drug Regimen")
    .mean()
)
avg_tumor_volume_by_drug

Drug Regimen
Capomulin    40.755487
Ceftamin     50.827485
Infubinol    51.383443
Ketapril     53.432527
Naftisol     52.499395
Placebo      52.540611
Propriva     50.613641
Ramicane     40.555988
Stelasyn     52.662319
Zoniferol    51.562955
Name: Tumor Volume (mm3), dtype: float64

In [11]:
# Calculate the median Tumor Volume by Drug Regimen.
# Method: median of medians — take each mouse's median tumor volume, then the median of those
# per-mouse values within each Drug Regimen.
#
# The tumor-volume column is selected BEFORE the first aggregation. Aggregating the whole frame
# would also try to take the median of non-numeric columns such as "Sex", which raises TypeError
# on pandas >= 2.0 (where numeric_only defaults to False).
median_tumor_volume_by_drug = (
    mouse_study_dedupe_df
    .groupby(["Drug Regimen", "Mouse ID"])["Tumor Volume (mm3)"]
    .median()
    .groupby(level="Drug Regimen")
    .median()
)
median_tumor_volume_by_drug

Drug Regimen
Capomulin    40.705170
Ceftamin     51.462314
Infubinol    51.346139
Ketapril     52.599206
Naftisol     52.419806
Placebo      51.471561
Propriva     49.481949
Ramicane     40.014505
Stelasyn     52.546192
Zoniferol    51.653739
Name: Tumor Volume (mm3), dtype: float64

In [12]:
# Calculate the variance of Tumor Volume by Drug Regimen.
# Method: variance of averages — average each mouse's tumor volume, then take the variance of
# those per-mouse averages within each Drug Regimen (i.e. the spread between mice, not between
# individual measurements).
#
# The tumor-volume column is selected BEFORE the first aggregation. Aggregating the whole frame
# would also try to average non-numeric columns such as "Sex", which raises TypeError on
# pandas >= 2.0 (where numeric_only defaults to False).
var_tumor_volume_by_drug = (
    mouse_study_dedupe_df
    .groupby(["Drug Regimen", "Mouse ID"])["Tumor Volume (mm3)"]
    .mean()
    .groupby(level="Drug Regimen")
    .var()
)
var_tumor_volume_by_drug

Drug Regimen
Capomulin    10.529290
Ceftamin     14.040506
Infubinol    17.380408
Ketapril     24.086484
Naftisol     22.886774
Placebo      19.610351
Propriva     15.402512
Ramicane     10.256711
Stelasyn     22.147071
Zoniferol    16.782584
Name: Tumor Volume (mm3), dtype: float64

In [13]:
# Calculate the standard deviation of Tumor Volume by Drug Regimen.
# Method: standard deviation of averages — average each mouse's tumor volume, then take the
# standard deviation of those per-mouse averages within each Drug Regimen.
#
# The tumor-volume column is selected BEFORE the first aggregation. Aggregating the whole frame
# would also try to average non-numeric columns such as "Sex", which raises TypeError on
# pandas >= 2.0 (where numeric_only defaults to False).
std_tumor_volume_by_drug = (
    mouse_study_dedupe_df
    .groupby(["Drug Regimen", "Mouse ID"])["Tumor Volume (mm3)"]
    .mean()
    .groupby(level="Drug Regimen")
    .std()
)
std_tumor_volume_by_drug

Drug Regimen
Capomulin    3.244887
Ceftamin     3.747066
Infubinol    4.168982
Ketapril     4.907798
Naftisol     4.784012
Placebo      4.428358
Propriva     3.924603
Ramicane     3.202610
Stelasyn     4.706067
Zoniferol    4.096655
Name: Tumor Volume (mm3), dtype: float64

In [14]:
# Calculate the Standard Error of the Mean (SEM) of Tumor Volume by Drug Regimen.
# Method: SEM of averages — average each mouse's tumor volume, then take the SEM of those
# per-mouse averages within each Drug Regimen.
#
# The tumor-volume column is selected BEFORE the first aggregation. Aggregating the whole frame
# would also try to average non-numeric columns such as "Sex", which raises TypeError on
# pandas >= 2.0 (where numeric_only defaults to False).
sem_tumor_volume_by_drug = (
    mouse_study_dedupe_df
    .groupby(["Drug Regimen", "Mouse ID"])["Tumor Volume (mm3)"]
    .mean()
    .groupby(level="Drug Regimen")
    .sem()
)
sem_tumor_volume_by_drug

Drug Regimen
Capomulin    0.648977
Ceftamin     0.749413
Infubinol    0.833796
Ketapril     0.981560
Naftisol     0.956802
Placebo      0.885672
Propriva     0.784921
Ramicane     0.640522
Stelasyn     0.960622
Zoniferol    0.819331
Name: Tumor Volume (mm3), dtype: float64

In [15]:
# Combine the five per-regimen Series into one summary table.
# Each Series becomes a column; rows are aligned on the shared "Drug Regimen" index.
tumor_vol_summary_df = pd.DataFrame(
    {
        "Tumor Vol Avg": avg_tumor_volume_by_drug,
        "Tumor Vol Median": median_tumor_volume_by_drug,
        "Tumor Vol Var": var_tumor_volume_by_drug,
        "Tumor Vol StD": std_tumor_volume_by_drug,
        "Tumor Vol SEM": sem_tumor_volume_by_drug,
    }
)

# Print a short legend ahead of the table (one call, one line per message)
print(
    "Analysis of Tumor Volume (mm3) by Drug Regimen\n",
    "Variance measures the spread between numbers in a data set.",
    "Standard Deviation (StD) provides a 'standard' way of knowing what is normal and what is not in a numbers spread.",
    "Standard Error from the Mean (SEM) is similar to StD, but tells how far a sample  mean might deviate from a population mean.",
    sep="\n",
)

tumor_vol_summary_df

Analysis of Tumor Volume (mm3) by Drug Regimen

Variance measures the spread between numbers in a data set.
Standard Deviation (StD) provides a 'standard' way of knowing what is normal and what is not in a numbers spread.
Standard Error from the Mean (SEM) is similar to StD, but tells how far a sample  mean might deviate from a population mean.


Unnamed: 0_level_0,Tumor Vol Avg,Tumor Vol Median,Tumor Vol Var,Tumor Vol StD,Tumor Vol SEM
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,40.755487,40.70517,10.52929,3.244887,0.648977
Ceftamin,50.827485,51.462314,14.040506,3.747066,0.749413
Infubinol,51.383443,51.346139,17.380408,4.168982,0.833796
Ketapril,53.432527,52.599206,24.086484,4.907798,0.98156
Naftisol,52.499395,52.419806,22.886774,4.784012,0.956802
Placebo,52.540611,51.471561,19.610351,4.428358,0.885672
Propriva,50.613641,49.481949,15.402512,3.924603,0.784921
Ramicane,40.555988,40.014505,10.256711,3.20261,0.640522
Stelasyn,52.662319,52.546192,22.147071,4.706067,0.960622
Zoniferol,51.562955,51.653739,16.782584,4.096655,0.819331


In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Using the aggregation method, produce the same summary statistics in a single line


In [19]:
# Show only the Capomulin row of the summary table (the list selector keeps the result a DataFrame)
tumor_vol_summary_df.loc[["Capomulin"], :]

Unnamed: 0_level_0,Tumor Vol Avg,Tumor Vol Median,Tumor Vol Var,Tumor Vol StD,Tumor Vol SEM
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,40.755487,40.70517,10.52929,3.244887,0.648977


In [None]:
# NOTE(review): repeats the Capomulin lookup from the preceding cell and has not been executed —
# candidate for removal.
tumor_vol_summary_df.loc[["Capomulin"]]

## Bar and Pie Charts

In [None]:
# Generate a bar plot showing the total number of unique mice tested on each drug regimen using pandas.



In [None]:
# Generate a bar plot showing the total number of unique mice tested on each drug regimen using pyplot.



In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas



In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot



## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [None]:
# Put treatments into a list for the for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin


In [None]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen


## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
