## Observations and Insights 

In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

In [2]:
mouse_metadata

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g)
0,k403,Ramicane,Male,21,16
1,s185,Capomulin,Female,3,17
2,x401,Capomulin,Female,16,15
3,m601,Capomulin,Male,22,17
4,g791,Ramicane,Male,11,16
...,...,...,...,...,...
244,z314,Stelasyn,Female,21,28
245,z435,Propriva,Female,12,26
246,z581,Infubinol,Female,24,25
247,z795,Naftisol,Female,13,29


In [3]:
study_results

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites
0,b128,0,45.000000,0
1,f932,0,45.000000,0
2,g107,0,45.000000,0
3,a457,0,45.000000,0
4,c819,0,45.000000,0
...,...,...,...,...
1888,r944,45,41.581521,2
1889,u364,45,31.023923,3
1890,p438,45,61.433892,1
1891,x773,45,58.634971,4


In [4]:
# Combine the data into a single dataset
combined_results = pd.merge(mouse_metadata, study_results, how="outer", on="Mouse ID").sort_values("Mouse ID", ascending=True, ignore_index=True)

# Display the data table for preview
combined_results.head(5)

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,a203,Infubinol,Female,20,23,30,59.523197,1
1,a203,Infubinol,Female,20,23,35,61.93165,2
2,a203,Infubinol,Female,20,23,25,56.793208,1
3,a203,Infubinol,Female,20,23,20,55.173336,1
4,a203,Infubinol,Female,20,23,15,52.77787,1


In [5]:
# Checking the number of mice.
total_unique_mice = len(combined_results["Mouse ID"].unique())
total_unique_mice

249

In [6]:
# Getting the Mouse ID numbers that show up with duplicate Timepoints. 
combined_results["Duplicate"] = combined_results.duplicated(["Mouse ID", "Timepoint"])
duplicates = combined_results.loc[(combined_results["Duplicate"] == True)]
duplicates

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites,Duplicate
589,g989,Propriva,Female,21,26,5,47.570392,0,True
591,g989,Propriva,Female,21,26,10,49.880528,0,True
596,g989,Propriva,Female,21,26,0,45.0,0,True
597,g989,Propriva,Female,21,26,20,55.326122,1,True
598,g989,Propriva,Female,21,26,15,53.44202,0,True


In [7]:
# Optional: Display all the data for the duplicate mouse ID. 
duplicate_mouse_data = combined_results.loc[(combined_results["Mouse ID"] == "g989")]
duplicate_mouse_data


Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites,Duplicate
586,g989,Propriva,Female,21,26,0,45.0,0,False
587,g989,Propriva,Female,21,26,15,51.325852,1,False
588,g989,Propriva,Female,21,26,5,48.786801,0,False
589,g989,Propriva,Female,21,26,5,47.570392,0,True
590,g989,Propriva,Female,21,26,10,51.745156,0,False
591,g989,Propriva,Female,21,26,10,49.880528,0,True
592,g989,Propriva,Female,21,26,35,62.57088,2,False
593,g989,Propriva,Female,21,26,30,59.082294,1,False
594,g989,Propriva,Female,21,26,25,56.045564,1,False
595,g989,Propriva,Female,21,26,20,54.65765,1,False


In [8]:
# Create a clean DataFrame by dropping all data associated with the duplicate mouse by its ID.
clean_mouse_data = combined_results.loc[(combined_results["Mouse ID"] != "g989")]
clean_mouse_data.drop("Duplicate", axis=1)

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,a203,Infubinol,Female,20,23,30,59.523197,1
1,a203,Infubinol,Female,20,23,35,61.931650,2
2,a203,Infubinol,Female,20,23,25,56.793208,1
3,a203,Infubinol,Female,20,23,20,55.173336,1
4,a203,Infubinol,Female,20,23,15,52.777870,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,20,57.898778,2
1889,z969,Naftisol,Male,9,30,25,63.145652,2
1890,z969,Naftisol,Male,9,30,30,65.841013,3
1891,z969,Naftisol,Male,9,30,35,69.176246,4


In [9]:
# Checking the number of mice in the clean DataFrame.
unique_cleaned_mice = len(clean_mouse_data["Mouse ID"].unique())
unique_cleaned_mice

248

## Summary Statistics

In [66]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume 
# for each regimen
mean = clean_mouse_data[["Drug Regimen","Tumor Volume (mm3)"]].groupby(["Drug Regimen"]).mean()
median = clean_mouse_data[["Drug Regimen","Tumor Volume (mm3)"]].groupby(["Drug Regimen"]).median()
variance = clean_mouse_data[["Drug Regimen","Tumor Volume (mm3)"]].groupby(["Drug Regimen"]).var()
std = clean_mouse_data[["Drug Regimen","Tumor Volume (mm3)"]].groupby(["Drug Regimen"]).std()
sem = clean_mouse_data[["Drug Regimen","Tumor Volume (mm3)"]].groupby(["Drug Regimen"]).sem()

# mean_tumor_vol = regimen_groups.mean()
# #mean_tumor_vol
# median_tumor_vol = regimen_groups.median()
# #median_tumor_vol
# var_tumor_vol = regimen_groups.var()
# #var_tumor_vol
# std_tumor_vol = regimen_groups.std()
# #std_tumor_vol
# sem_tumor_vol = regimen_groups.sem()
# #sem_tumor_vol
#regimen_groups


# Use groupby and summary statistical methods to calculate the following properties of each drug regimen: 
# mean, median, variance, standard deviation, and SEM of the tumor volume. 
# Assemble the resulting series into a single summary dataframe.
summ_stats = pd.DataFrame({"Mean Tumor Volume": [mean],
                          "Median Tumor Volume": median,
                          "Variance Tumor Volume": variance,
                          "Standard Deviation Tumor Volume": std,
                          "SEM Tumor Volume": sem})
summ_stats

Unnamed: 0,Mean Tumor Volume,Median Tumor Volume,Variance Tumor Volume,Standard Deviation Tumor Volume,SEM Tumor Volume
0,Tumor Volume (mm3) Drug Regimen ...,"(T, u, m, o, r, , V, o, l, u, m, e, , (, m, ...","(T, u, m, o, r, , V, o, l, u, m, e, , (, m, ...","(T, u, m, o, r, , V, o, l, u, m, e, , (, m, ...","(T, u, m, o, r, , V, o, l, u, m, e, , (, m, ..."


In [61]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume
# for each regimen

# Using the aggregation method, produce the same summary statistics in a single line
agg_regimen_groups = clean_mouse_data[["Drug Regimen","Tumor Volume (mm3)"]].groupby(["Drug Regimen"]).agg({"Tumor Volume (mm3)":["mean", "median", "var", "std", "sem"]})
agg_regimen_groups

Unnamed: 0_level_0,Tumor Volume (mm3),Tumor Volume (mm3),Tumor Volume (mm3),Tumor Volume (mm3),Tumor Volume (mm3)
Unnamed: 0_level_1,mean,median,var,std,sem
Drug Regimen,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Capomulin,40.675741,41.557809,24.947764,4.994774,0.329346
Ceftamin,52.591172,51.776157,39.290177,6.268188,0.469821
Infubinol,52.884795,51.820584,43.128684,6.567243,0.492236
Ketapril,55.235638,53.698743,68.553577,8.279709,0.60386
Naftisol,54.331565,52.509285,66.173479,8.134708,0.596466
Placebo,54.033581,52.288934,61.168083,7.821003,0.581331
Propriva,52.32093,50.446266,43.852013,6.622085,0.544332
Ramicane,40.216745,40.673236,23.486704,4.846308,0.320955
Stelasyn,54.233149,52.431737,59.450562,7.710419,0.573111
Zoniferol,53.236507,51.818479,48.533355,6.966589,0.516398


## Bar and Pie Charts

In [12]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pandas.



In [13]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pyplot.



In [14]:
# Generate a pie plot showing the distribution of female versus male mice using pandas



In [15]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot



## Quartiles, Outliers and Boxplots

In [16]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [17]:
# Put treatments into a list for a for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [18]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [19]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin


In [20]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen


## Correlation and Regression

In [21]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
