## Observations and Insights 

In [1]:
%matplotlib notebook

# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

In [2]:
# Combine the data into a single dataset
mouse_study_df = pd.merge(mouse_metadata, study_results, on="Mouse ID")

In [3]:
# Display the data table for preview
print()
print(f"Screening Data for Squamouse Cell Carcinoma (SCC)")
mouse_study_df.head()


Screening Data for Squamouse Cell Carcinoma (SCC)


Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [4]:
# Calculate number of rows prior to row-level deduplication
mouse_ttl_rows = len(mouse_study_df.index)
mouse_ttl_rows

1893

In [5]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
# Use drop_duplicates with parameter subset, for keeping only last duplicated rows add
mouse_study_dedupe_df = mouse_study_df.drop_duplicates(subset=["Mouse ID","Timepoint"])

print()
print(f"Screening Data for Squamouse Cell Carcinoma (SCC) - Cleansed")
mouse_study_dedupe_df.head()


Screening Data for Squamouse Cell Carcinoma (SCC) - Cleansed


Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [6]:
# Calculate number of rows after row-level deduplication
mouse_dedupe_ttl_rows = len(mouse_study_dedupe_df.index)
print(f"After deduplication, there are {mouse_dedupe_ttl_rows} rows in the dataset.")
print(f"The deduplication removed {mouse_ttl_rows - mouse_dedupe_ttl_rows} rows from the original data set.")

After deduplication, there are 1888 rows in the dataset.
The deduplication removed 5 rows from the original data set.


In [7]:
# Optional: Get all the data for the duplicate mouse ID. 
# The first line of code declares the following logic:
#     If the a row has values for columns Mouse ID and Timepoint that are identical to those of another row, 
#     assign row value = True.
# The second line of code instructs the dataframe to serve up unique instances of duplicate rows, with a default
#     of selecting all but the first row of each grouping of duplicate rows.

has_duplicates = mouse_study_df.duplicated(subset=["Mouse ID","Timepoint"])
duplicates = mouse_study_df[has_duplicates]

print()
print(f"Duplicate Rows Cleansed from Screening Data for Squamouse Cell Carcinoma (SCC)")
duplicates


Duplicate Rows Cleansed from Screening Data for Squamouse Cell Carcinoma (SCC)


Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
909,g989,Propriva,Female,21,26,0,45.0,0
911,g989,Propriva,Female,21,26,5,47.570392,0
913,g989,Propriva,Female,21,26,10,49.880528,0
915,g989,Propriva,Female,21,26,15,53.44202,0
917,g989,Propriva,Female,21,26,20,54.65765,1


In [8]:
# Checking the number of mice in the clean DataFrame.
# Should be equal to 249 mice identified with SCC tumor growth that were treated over 45 days
#     by different drug regimens

mice_in_study = mouse_study_dedupe_df["Mouse ID"].nunique()
mice_in_study 

249

In [9]:
# Identify Drug Regigmens
drugs_list = mouse_study_dedupe_df["Drug Regimen"].unique()
drugs_num = mouse_study_dedupe_df["Drug Regimen"].nunique()

print(f"In this study {mice_in_study} mice were identified with SCC tumor growth, treated for over 45 days by")
print(f"{drugs_num} different Drug Regimens for this study, including:\n")
print(*drugs_list, sep = ", ")
print()
print(f"Pymaceutical's drug of interest is Capomulin, which will be compared with other treatments.")

In this study 249 mice were identified with SCC tumor growth, treated for over 45 days by
10 different Drug Regimens for this study, including:

Ramicane, Capomulin, Infubinol, Placebo, Ceftamin, Stelasyn, Zoniferol, Ketapril, Propriva, Naftisol

Pymaceutical's drug of interest is Capomulin, which will be compared with other treatments.


In [10]:
# Examine data types for the cleaned DataFrame
mouse_study_dedupe_df.dtypes

Mouse ID               object
Drug Regimen           object
Sex                    object
Age_months              int64
Weight (g)              int64
Timepoint               int64
Tumor Volume (mm3)    float64
Metastatic Sites        int64
dtype: object

## Summary Statistics

In [11]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Use groupby and summary statistical methods to calculate the following properties of each drug regimen: 
# mean, median, variance, standard deviation, and SEM of the tumor volume.


In [12]:
# *****IMPORTANT*****
# Apparently, I may have answered this question differently than was instructed.  BUT my response is that what was 
#     instructed, an average of tumor size by drug could have led to badly skewed analysis.  Instead, the only way
#     to meaningfully analyze the data is first to take an average of tumor size measurements for each mouse, and
#     then to roll those averages into a set from which a second average value is calculated.  

#     Here's why in simple terms.  Say for a specific drug study there were 10 tumor size measurements for one mouse 
#     and 2 measurements for another mouse.  The instructed approach says to take an average for all tumor measurements.
#     This means that the instructed approach would allow the first mouse's tumor measurements to have 10x the 
#     weighting of the second mouse's tumor measurements.  This just doesn't make logical sense.  The question is trying 
#     to get at the average tumor size for each mouse.  My methodology - taking an average of averages - weights each
#     mouse equally in calculating an average tumor for the mouse population within each specific drug study.

#     I replicated this Groupby methodology for subseqent calculations: median, variance, standard deviation, and SEM.

In [13]:
# Calculate Average for Tumor Volume by Drug Regimen 
# Using a 2-variable Groupby operation with Drug Regimen as primary cluster and Mouse ID as secondary cluster,
#    and a second 2-variable Groupby operation by Drug Regimen as primary cluster and Tumor Volume as secondary field,
#    we were able to calculate average tumor size per mouse, and then roll up an average for each group of mice
#    subjected to a specific drug regimen.
# Method: groups using average of averages

avg_tumor_volume_by_drug = mouse_study_dedupe_df \
     .groupby(["Drug Regimen", "Mouse ID"], as_index=False) \
     .mean() \
     .groupby("Drug Regimen")["Tumor Volume (mm3)"] \
     .mean()
avg_tumor_volume_by_drug

Drug Regimen
Capomulin    40.755487
Ceftamin     50.827485
Infubinol    51.383443
Ketapril     53.432527
Naftisol     52.499395
Placebo      52.540611
Propriva     50.613641
Ramicane     40.555988
Stelasyn     52.662319
Zoniferol    51.562955
Name: Tumor Volume (mm3), dtype: float64

In [14]:
# Show the base groupby object - with Drug Regimen master cluster and Mouse ID slave cluster
tumor_volume_by_drug_avg_for_mouse = mouse_study_dedupe_df \
     .groupby(["Drug Regimen", "Mouse ID"]) \
     .mean()
tumor_volume_by_drug_avg_for_mouse

Unnamed: 0_level_0,Unnamed: 1_level_0,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
Drug Regimen,Mouse ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Capomulin,b128,9.0,22.0,22.5,41.963636,0.7
Capomulin,b742,7.0,21.0,22.5,40.083699,0.0
Capomulin,f966,16.0,17.0,10.0,36.505973,0.0
Capomulin,g288,3.0,19.0,22.5,41.990097,0.8
Capomulin,g316,22.0,22.0,22.5,44.613344,1.0
...,...,...,...,...,...,...
Zoniferol,q633,12.0,25.0,22.5,56.124896,0.9
Zoniferol,s337,14.0,27.0,22.5,51.515155,1.7
Zoniferol,w140,19.0,30.0,2.5,46.358976,0.0
Zoniferol,w575,16.0,28.0,22.5,56.627561,1.4


In [15]:
# Calculate Median for Tumor Volume by Drug Regimen
# Method: groups using median of averages
median_tumor_volume_by_drug = mouse_study_dedupe_df \
     .groupby(["Drug Regimen", "Mouse ID"], as_index=False) \
     .mean() \
     .groupby("Drug Regimen")["Tumor Volume (mm3)"] \
     .median()
median_tumor_volume_by_drug

Drug Regimen
Capomulin    41.384825
Ceftamin     52.128041
Infubinol    51.897959
Ketapril     53.725479
Naftisol     53.353053
Placebo      53.392180
Propriva     49.923649
Ramicane     40.108578
Stelasyn     52.651204
Zoniferol    52.437635
Name: Tumor Volume (mm3), dtype: float64

In [16]:
# Calculate Variance for Tumor Volume by Drug Regimen
# Method: groups using variance of averages
var_tumor_volume_by_drug = mouse_study_dedupe_df \
     .groupby(["Drug Regimen", "Mouse ID"], as_index=False) \
     .mean() \
     .groupby("Drug Regimen")["Tumor Volume (mm3)"] \
     .var()
var_tumor_volume_by_drug

Drug Regimen
Capomulin    10.529290
Ceftamin     14.040506
Infubinol    17.380408
Ketapril     24.086484
Naftisol     22.886774
Placebo      19.610351
Propriva     15.402512
Ramicane     10.256711
Stelasyn     22.147071
Zoniferol    16.782584
Name: Tumor Volume (mm3), dtype: float64

In [17]:
# Calculate Standard Deviation for Tumor Volume by Drug Regimen
# Method: groups using standard deviation of averages
std_tumor_volume_by_drug = mouse_study_dedupe_df \
     .groupby(["Drug Regimen", "Mouse ID"], as_index=False) \
     .mean() \
     .groupby("Drug Regimen")["Tumor Volume (mm3)"] \
     .std()
std_tumor_volume_by_drug

Drug Regimen
Capomulin    3.244887
Ceftamin     3.747066
Infubinol    4.168982
Ketapril     4.907798
Naftisol     4.784012
Placebo      4.428358
Propriva     3.924603
Ramicane     3.202610
Stelasyn     4.706067
Zoniferol    4.096655
Name: Tumor Volume (mm3), dtype: float64

In [18]:
# Calculate Standard Error from the Mean (SEM) for Tumor Volume by Drug Regimen
# Method: groups using SEM of averages
sem_tumor_volume_by_drug = mouse_study_dedupe_df \
     .groupby(["Drug Regimen", "Mouse ID"], as_index=False) \
     .mean() \
     .groupby("Drug Regimen")["Tumor Volume (mm3)"] \
     .sem()
sem_tumor_volume_by_drug

Drug Regimen
Capomulin    0.648977
Ceftamin     0.749413
Infubinol    0.833796
Ketapril     0.981560
Naftisol     0.956802
Placebo      0.885672
Propriva     0.784921
Ramicane     0.640522
Stelasyn     0.960622
Zoniferol    0.819331
Name: Tumor Volume (mm3), dtype: float64

In [19]:
# Calculate the number of mice (unique) involved in each Drug Regimen study
mice_count_by_drug = mouse_study_dedupe_df \
     .groupby(["Drug Regimen", "Mouse ID"], as_index=False) \
     .mean() \
     .groupby("Drug Regimen")["Tumor Volume (mm3)"] \
     .count()
mice_count_by_drug

Drug Regimen
Capomulin    25
Ceftamin     25
Infubinol    25
Ketapril     25
Naftisol     25
Placebo      25
Propriva     25
Ramicane     25
Stelasyn     24
Zoniferol    25
Name: Tumor Volume (mm3), dtype: int64

In [20]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
# I added mice count to the summary table.
# Assemble the resulting series into a single summary dataframe.
# Create data frame with obtained values
tumor_vol_summary_df = pd.DataFrame({"Tumor Vol Avg": avg_tumor_volume_by_drug, 
                                     "Tumor Vol Median": median_tumor_volume_by_drug,
                                     "Tumor Vol Var": var_tumor_volume_by_drug,
                                     "Tumor Vol StD": std_tumor_volume_by_drug,
                                     "Tumor Vol SEM": sem_tumor_volume_by_drug,
                                     "Mice Count": mice_count_by_drug})

print(f"Analysis of SCC Tumor Volume (mm3) by Drug Regimen\n")
print(f"Variance measures the spread between numbers in a data set.")
print(f"Standard Deviation (StD) provides a 'standard' way of knowing what is normal and what is not in a numbers spread.")
print(f"Standard Error from the Mean (SEM) is similar to StD, but tells how far a sample  mean might deviate from a population mean.")

tumor_vol_summary_df

Analysis of SCC Tumor Volume (mm3) by Drug Regimen

Variance measures the spread between numbers in a data set.
Standard Deviation (StD) provides a 'standard' way of knowing what is normal and what is not in a numbers spread.
Standard Error from the Mean (SEM) is similar to StD, but tells how far a sample  mean might deviate from a population mean.


Unnamed: 0_level_0,Tumor Vol Avg,Tumor Vol Median,Tumor Vol Var,Tumor Vol StD,Tumor Vol SEM,Mice Count
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Capomulin,40.755487,41.384825,10.52929,3.244887,0.648977,25
Ceftamin,50.827485,52.128041,14.040506,3.747066,0.749413,25
Infubinol,51.383443,51.897959,17.380408,4.168982,0.833796,25
Ketapril,53.432527,53.725479,24.086484,4.907798,0.98156,25
Naftisol,52.499395,53.353053,22.886774,4.784012,0.956802,25
Placebo,52.540611,53.39218,19.610351,4.428358,0.885672,25
Propriva,50.613641,49.923649,15.402512,3.924603,0.784921,25
Ramicane,40.555988,40.108578,10.256711,3.20261,0.640522,25
Stelasyn,52.662319,52.651204,22.147071,4.706067,0.960622,24
Zoniferol,51.562955,52.437635,16.782584,4.096655,0.819331,25


In [21]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume 
#      for each regimen

# Using the aggregation method, produce the same summary statistics in a single line
tumor_vol_summary_df2 = mouse_study_dedupe_df \
     .groupby(["Drug Regimen", "Mouse ID"], as_index=False) \
     .mean() \
     .groupby("Drug Regimen")["Tumor Volume (mm3)"] \
     .aggregate(["mean", "median", "var", "std", "sem", "count"])

print()
print(f"Analysis of SCC Tumor Volume (mm3) by Drug Regimen - Using .aggregate() Method")
tumor_vol_summary_df2


Analysis of SCC Tumor Volume (mm3) by Drug Regimen - Using .aggregate() Method


Unnamed: 0_level_0,mean,median,var,std,sem,count
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Capomulin,40.755487,41.384825,10.52929,3.244887,0.648977,25
Ceftamin,50.827485,52.128041,14.040506,3.747066,0.749413,25
Infubinol,51.383443,51.897959,17.380408,4.168982,0.833796,25
Ketapril,53.432527,53.725479,24.086484,4.907798,0.98156,25
Naftisol,52.499395,53.353053,22.886774,4.784012,0.956802,25
Placebo,52.540611,53.39218,19.610351,4.428358,0.885672,25
Propriva,50.613641,49.923649,15.402512,3.924603,0.784921,25
Ramicane,40.555988,40.108578,10.256711,3.20261,0.640522,25
Stelasyn,52.662319,52.651204,22.147071,4.706067,0.960622,24
Zoniferol,51.562955,52.437635,16.782584,4.096655,0.819331,25


In [22]:
tumor_vol_summary_df2.dtypes

mean      float64
median    float64
var       float64
std       float64
sem       float64
count       int64
dtype: object

## Bar and Pie Charts

In [23]:
# Matplotlib's pyplot is the library that Pandas uses in its plot function. 
# Pandas' DataFrame.plot() is only a convenient shortcut. 

# "Matplotlib is a Python package that is widely used throughout the scientific Python community 
# to create high-quality and publication-ready graphics. It supports a wide range of raster and 
# vector graphics formats including PNG, PostScript, EPS, PDF and SVG.  Moreover, matplotlib is the 
# actual engine behind the plotting capabilities of both Pandas and plotnine packages. For example, 
# when we call the .plot method on Pandas data objects, we actually use the matplotlib package."

# https://datacarpentry.org/python-ecology-lesson/08-putting-it-all-together/index.html

In [24]:
# Generate a bar plot showing the total number of unique mice tested on each drug regimen using pandas.
ax = tumor_vol_summary_df2.plot.bar(y=["count"], color="r", align="center", title="Number of Mice for Each Drug Regimen Study")
ax.set_ylabel("Mice")
plt.xticks(rotation=45)
plt.gca().get_legend().remove()
plt.tight_layout()
plt.show()

<IPython.core.display.Javascript object>

In [25]:
# Generate a bar plot showing the total number of unique mice tested on each drug regimen using pyplot.

# Assign dataframe column values to graph labels and values
drug_xlabel = tumor_vol_summary_df2.index.values.tolist() 
mice_count_by_drug = tumor_vol_summary_df2["count"]

fig = plt.figure(figsize = (7, 5)) 

# Create the bar plot
plt.bar(drug_xlabel, mice_count_by_drug, color ='red') 

# Label and further format
plt.xlabel("Drug Regimen") 
plt.ylabel("Mice") 
plt.title("Number of Mice for Each Drug Regimen Study") 
plt.xticks(rotation=45)
plt.tight_layout()
plt.show() 

<IPython.core.display.Javascript object>

In [None]:
ax2 = plt.subplots()

In [26]:
# Calculate sex distibution of mouse population
mouse_sex_df = mouse_study_dedupe_df[["Mouse ID", "Sex"]]
mouse_sex_dedupe_df = mouse_sex_df.drop_duplicates(subset=["Mouse ID"])
mice_count_by_sex = mouse_sex_dedupe_df["Sex"].value_counts()
mice_count_by_sex_df = pd.DataFrame(mice_count_by_sex).reset_index()
mice_count_by_sex_df = mice_count_by_sex_df.rename(columns={"index": "Sex", "Sex": "Count"})
mice_count_by_sex_df = mice_count_by_sex_df.sort_values(by=["Sex"]).reset_index(drop=True)
mice_count_by_sex_df

Unnamed: 0,Sex,Count
0,Female,124
1,Male,125


In [27]:
# Double-check on sex distibution of mouse population
mouse_sex_group = mouse_sex_dedupe_df.groupby("Sex").count()
mouse_sex_group_df = mouse_sex_group.rename(columns={"Mouse ID": "Count"})
mouse_sex_group_df = mouse_sex_group_df.reset_index()
mouse_sex_group_df

Unnamed: 0,Sex,Count
0,Female,124
1,Male,125


In [28]:
males = mice_count_by_sex_df.iloc[1,1]
females = mice_count_by_sex_df.iloc[0,1]
print(f"Male Mice = {males}")
print(f"Female Mice = {females}")

Male Mice = 125
Female Mice = 124


In [31]:
# Generate a pie plot showing the distribution of female versus male mice using pandas
ax2 = mice_count_by_sex_df.plot.pie(y="Count", title="Sex of Mice Across All Regimen Studies")
ax2 = plt.axes()
# ax2.yaxis.label.set_visible(False)
ax2.get_legend().remove()

TypeError: RangeIndex.name must be a hashable type

In [None]:
mice_sex_all_studies_df["Sex"]

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot

# Pie chart, where the slices will be ordered and plotted counter-clockwise:
labels = 'Frogs', 'Hogs', 'Dogs', 'Logs'
sizes = [15, 30, 45, 10]
explode = (0, 0.1, 0, 0)  # only "explode" the 2nd slice (i.e. 'Hogs')

fig1, ax1 = plt.subplots()
ax1.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=90)
ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.

plt.show()

## Quartiles, Outliers and Boxplots

In [None]:
# Extract a dataframe that includes only rows (records) of data for 4 promising drug regimens
# Use .isin() filter method with a "white" list of the 4 promising drugs (promising_drugs)

# Put treatments into a list for for loop (and later for plot labels)
promising_drugs = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

mouse_study_promising_drugs_df = mouse_study_dedupe_df.loc[mouse_study_dedupe_df["Drug Regimen"].isin(promising_drugs)]

print()
print(f"Source Data for Only the Four Most Promising Drug Therapies")
mouse_study_promising_drugs_df.head()

In [None]:
#Check data types
mouse_study_promising_drugs_df.dtypes

In [None]:
# INSTRUCTION: Start by getting the last (greatest) timepoint for each mouse

# Extract latest (max) values for the Timepoint column by Drug Regimen (master grouping) and Mouse ID (grouping next level down)
# Then reset the index (Drug Regimen and Mouse ID)
max_time_by_mouse_for_prom_drug = mouse_study_dedupe_df \
                                            .groupby(["Drug Regimen", "Mouse ID"])["Timepoint"].max() \
                                            .reset_index()

print()
print(f"Maximum Timepoints for Each Mouse in Each of the Drug Regimen Studies")
max_time_by_mouse_for_prom_drug.head()

In [None]:
# INSTRUCTION: Merge this group df with the original dataframe to get the tumor volume at the last timepoint

# Used inner merge method to extract only rows of master dataframe named "Source Data for Only
#     the Four Most Promising Drug Therapies" with identically matching rows (all 3 columns) of
#     the dataframe named "Maximum Timepoints for Each Mouse in Each of the Drug Regimen Studies"

df1 = mouse_study_promising_drugs_df
df2 = max_time_by_mouse_for_prom_drug
df = df1.merge(df2, on=["Drug Regimen","Mouse ID", "Timepoint"])

# Rename column header Timepoint to Max Timepoint
df = df.rename(columns={"Timepoint": "Max Timepoint", "Age_months": "Age (mos)", "Tumor Volume (mm3)": "Final TumorVol (mm3)"})

# Move Drug Regimen column to first position in table
df = df[["Drug Regimen", "Mouse ID", "Final TumorVol (mm3)", "Metastatic Sites", "Sex", "Age (mos)", \
         "Weight (g)", "Max Timepoint"]]

# Sort by Drug Regimen and Mouse ID, in that order of importance
df = df.sort_values(by=["Drug Regimen", "Mouse ID"]) #.reset_index()

# Rename dataframe
final_tum_vol_by_mouse_by_prom_drug_df = df

print()
print(f"Final Tumor Volume for Each Mouse in Each of the Drug Regimen Studies")
final_tum_vol_by_mouse_by_prom_drug_df.head()

In [None]:
# A somewhat more elegant way of presenting the same information as above
# Groupby insists on calculating some value (min, max, mean, var, std, count, etc.) in order to cluster
# But I didn't need to calcuate any of these values for the purposes of presentation.
# However, all rows show records only for a single point in time, the maximum Timepoint.
# All string value columns are clustered as an index of sorts.
# The remaining columns are numerical values.
# And each row (as clustered by drug and mouse) is unique.
# As such, running a mean, median, mode, max, min calculation simply returns the existing value.
# So I chose mean out of convenience.
# And this groupby output format provides an elegant view of the data organized by the nested variables /
# column values for Drug Regimen and Mouse ID

final_tum_vol_by_mouse_by_prom_drug_group = final_tum_vol_by_mouse_by_prom_drug_df.groupby(["Drug Regimen", "Mouse ID"]).mean()
final_tum_vol_by_mouse_by_prom_drug_group

In [None]:
# For convenience, I shorten / rename DataFrame that is product of filtering, sorting, refining, etc. 
# These hold effectively the same information, one as a dataframe, the other as a groupby object

study1_df = final_tum_vol_by_mouse_by_prom_drug_df
study2_df = final_tum_vol_by_mouse_by_prom_drug_group

In [None]:
# INSTRUCTION: Put treatments into a list for for loop (and later for plot labels)
# COMMENT: I had already done this earlier, so I am simply printing the named variable and a list of its values

print(promising_drugs)

In [None]:
# Create empty list to fill with tumor vol data (for plotting)
tumor_vol_data = []

In [None]:
study_quartiles_group = study1_df.groupby("Drug Regimen")["Final TumorVol (mm3)"].quantile([.25,.5,.75])
study_quartiles_df = pd.DataFrame(study_quartiles_group)
study_quartiles_df = study_quartiles_df.rename(columns={"Final TumorVol (mm3)": "Final TumorVol Quartile Cuts"})
study_quartiles_df

In [None]:
# Define quartiles variable using .quantile() method
quartiles = volume.quantile([.25,.5,.75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq-lowerq

In [None]:
study_quartiles_df[index==.25]

In [None]:
study1_df[study1_df[["Drug Regimen"] == "Capomulin"]]

In [None]:
volume = study2_df["Tumor Volume (mm3"]

# Define quartiles variable using .quantile() method
quartiles = volume.quantile([.25,.5,.75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq-lowerq

In [None]:
volume = study2_df.groupby("Drug Regimen")["Tumor Volume (mm3)"].

In [None]:
volume = study["Tumor Volume (mm3"]

# Define quartiles variable using .quantile() method
quartiles = volume.quantile([.25,.5,.75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq-lowerq

print(f"The lower quartile of temperatures is: {lowerq}")
print(f"The upper quartile of temperatures is: {upperq}")
print(f"The interquartile range of temperatures is: {iqr}")
print(f"The the median of temperatures is: {quartiles[0.5]} ")


study_df = study_df.loc[study_df['Timepoint'] == max_timepoint]


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin


In [None]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen


## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
