## Observations and Insights 

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
from scipy.stats import sem

# Locations of the two CSV inputs
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load each CSV into its own DataFrame (metadata first, then results)
mouse_metadata, study_results = (
    pd.read_csv(csv_path)
    for csv_path in (mouse_metadata_path, study_results_path)
)

# Left-join the study results onto the mouse metadata, matching on "Mouse ID",
# so every metadata row is kept even when it has no matching result rows
combined_df = mouse_metadata.merge(study_results, on="Mouse ID", how="left")

# Preview the first rows of the combined table
combined_df.head()

In [None]:
# Checking the number of mice.
# Count distinct Mouse IDs: len() of the column would count rows, and a
# Mouse ID can appear on more than one row of the merged table.
combined_df['Mouse ID'].nunique()

In [None]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint.
# Restrict the duplicate check to the (Mouse ID, Timepoint) key; without
# `subset`, duplicated() only flags rows that are identical in every column.
duplicateRows_DF = combined_df[combined_df.duplicated(subset=['Mouse ID', 'Timepoint'])]
duplicateRows_DF['Mouse ID'].unique()

In [None]:
# Optional: Get all the data for the duplicate mouse ID.
# First find the ID(s) that repeat a (Mouse ID, Timepoint) pair, then pull
# every row belonging to those ID(s), not just the repeated rows.
duplicate_ids = combined_df.loc[
    combined_df.duplicated(subset=['Mouse ID', 'Timepoint']), 'Mouse ID'
].unique()
duplicateRows_DF = combined_df[combined_df['Mouse ID'].isin(duplicate_ids)]
duplicateRows_DF

In [None]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
# A mouse counts as a duplicate when one of its (Mouse ID, Timepoint) pairs
# occurs more than once. All rows of such a mouse are removed, because a plain
# drop_duplicates() would only discard rows identical in every column.
duplicate_mouse_ids = combined_df.loc[
    combined_df.duplicated(subset=['Mouse ID', 'Timepoint']), 'Mouse ID'
].unique()
# .copy() makes the result an independent frame so later column assignments
# do not act on a slice of combined_df.
clean_combined_df = combined_df[~combined_df['Mouse ID'].isin(duplicate_mouse_ids)].copy()


In [None]:
# Checking the number of mice in the clean DataFrame.
# Count distinct Mouse IDs rather than rows, so the result is a number of mice.
clean_combined_df['Mouse ID'].nunique()

## Summary Statistics

In [None]:
# Generate a summary statistics table of mean, median, mode, variance, standard deviation, and SEM of the tumor volume for each regimen
drug_regimen_group = clean_combined_df.groupby('Drug Regimen')
mean_drugs = drug_regimen_group['Tumor Volume (mm3)'].mean()

drug_list = ['Capomulin', 'Ceftamin', 'Infubinol', 'Ketapril', 'Naftisol', 'Placebo', 'Propriva', 'Ramicane', 'Stelasyn', 'Zoniferol']

# This method is the most straightforward: compute each statistic for one
# regimen at a time and put the rows together at the end.
summary_rows = []
for drug in drug_list:
    # Exact match on the regimen name (str.contains would also accept any
    # regimen whose name merely contains this one as a substring).
    drug_vol = clean_combined_df.loc[clean_combined_df['Drug Regimen'] == drug, 'Tumor Volume (mm3)']

    summary_rows.append({
        'Drug Regimen': drug,
        'Mean': np.mean(drug_vol),
        'Median': np.median(drug_vol),
        # Series.mode() returns the modes in sorted order; keep the smallest so
        # the cell holds a single number instead of a result object.
        'Mode': drug_vol.mode().iloc[0],
        # ddof=1 (sample statistics) matches the groupby/agg summary of the
        # same data and the default used by sem().
        'Variance': np.var(drug_vol, ddof=1),
        'Standard Deviation': np.std(drug_vol, ddof=1),
        'SEM': sem(drug_vol),
    })

summary_statistics = pd.DataFrame(summary_rows)
summary_statistics.style.format({'Mean':'{0:,.2f}','Median':'{0:,.2f}','Variance':'{0:,.2f}','Standard Deviation':'{0:,.2f}'})

In [None]:
# Summary statistics (mean, median, variance, standard deviation, SEM) of the
# tumor volume per regimen, produced by a single groupby aggregation.
drug_regimen_group = clean_combined_df.groupby('Drug Regimen')

# One aggregation call computes every statistic for every regimen
stat_table = drug_regimen_group['Tumor Volume (mm3)'].agg(['mean', 'median', 'var', 'std', sem])

# Show the pandas-computed columns with two decimals; the sem column keeps
# its default formatting
two_decimals = '{0:,.2f}'
stat_table.style.format({column: two_decimals for column in ('mean', 'median', 'var', 'std')})


## Bar and Pie Charts

In [None]:
# Bar chart (pandas plotting) of the per-treatment totals over the whole study.
drug_regimen_group = clean_combined_df.groupby('Drug Regimen')

# count() tallies the non-null "Mouse ID" entries in each regimen, i.e. one
# per row of the cleaned table
count = drug_regimen_group['Mouse ID'].count()

count.plot.bar(
    rot=45,
    color='purple',
    grid='on',
    title='Total Number of Mice per Treatment',
)

In [None]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pyplot.
# Take the bar labels from the index of `count` so each label is guaranteed to
# sit under its own value, instead of relying on a hand-written list being in
# the same order as the groupby result.
plt.bar(count.index, count, color = 'purple')
plt.title('Total Number of Mice per Treatment')
plt.xticks(rotation = 45)
# Pass a real boolean: any non-empty string (even 'off') is truthy.
plt.grid(True)

In [None]:
# Pie chart (pandas plotting) of how the rows split between female and male mice.
male_female = clean_combined_df.groupby('Sex')

# Non-null "Mouse ID" entries per sex
count_mf = male_female['Mouse ID'].count()

# Collect the chart settings in one place, then draw
pie_options = {
    'startangle': 90,
    'title': 'Female vs. Male Distribution',
    'autopct': "%1.1f%%",
    'colors': ["pink","lightblue"],
    'shadow': True,
    'label': " ",  # blank label suppresses the default axis label
    'figsize': (5,5),
}
count_mf.plot.pie(**pie_options)


In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
colors = ["pink","lightblue"]

# Label each wedge from the index of `count_mf` so the labels always follow the
# order of the values being plotted rather than a hand-written list.
plt.pie(count_mf, labels=count_mf.index, colors=colors, autopct="%1.1f%%", shadow=True, startangle=90)
plt.title('Female vs. Male Distribution')
# Equal aspect ratio so the pie is drawn as a circle
plt.axis("equal")


## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:
# Capomulin, Ramicane, Infubinol, and Ceftamin
# Start by getting the last (greatest) timepoint for each mouse.
# Select on the Timepoint value itself rather than on row position, so the
# result does not depend on the rows being sorted chronologically.
# NOTE(review): a mouse whose Timepoint is missing on every row is left out
# here (NaN never equals NaN) - confirm that is acceptable.
last_timepoint = clean_combined_df.groupby('Mouse ID')['Timepoint'].transform('max')
mouse_final_tumor_vol = (
    clean_combined_df[clean_combined_df['Timepoint'] == last_timepoint]
    # Guard: keep exactly one row per mouse even if its last timepoint repeats
    .drop_duplicates(subset='Mouse ID', keep='last')
)

# Attach the final tumor volume to the full table. The values align on the row
# index, so only the row of each mouse's last timepoint receives a value.
# assign() builds a new frame instead of writing into a possible slice.
clean_combined_df = clean_combined_df.assign(
    **{'Final Tumor Volume(mm3)': mouse_final_tumor_vol['Tumor Volume (mm3)']}
)

# Display copy: show '--' where there is no final volume, touching only the
# new column so the other columns keep their values and dtypes.
clean_combined_df_with_final_vol = clean_combined_df.fillna({'Final Tumor Volume(mm3)': '--'})
clean_combined_df_with_final_vol.head(50)


In [None]:
# Full list of treatments (kept for plot labels elsewhere in the notebook)
drug_list = ['Capomulin', 'Ceftamin', 'Infubinol', 'Ketapril', 'Naftisol', 'Placebo', 'Propriva', 'Ramicane', 'Stelasyn', 'Zoniferol']

# The four regimens of interest for the quartile / outlier analysis
treatments = ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']

# Create empty list to fill with tumor vol data (for plotting);
# it receives one Series of final tumor volumes per entry in `treatments`.
tumor_vol_data = []

# Calculate the IQR and quantitatively determine if there are any potential outliers.
for treatment in treatments:
    # Final-timepoint rows for this regimen (one row per mouse)
    regimen_rows = mouse_final_tumor_vol[mouse_final_tumor_vol['Drug Regimen'] == treatment]
    final_vols = regimen_rows['Tumor Volume (mm3)']
    tumor_vol_data.append(final_vols)

    quartiles = final_vols.quantile([.25,.5,.75])
    lowerq = quartiles[0.25]
    upperq = quartiles[0.75]
    # Each regimen uses its own IQR for its own bounds
    iqr = upperq-lowerq
    lower_bound = lowerq - (1.5*iqr)
    upper_bound = upperq + (1.5*iqr)

    print(f"The lower quartile of the tumor volume for {treatment} is: {round(lowerq,2)}")
    print(f"The upper quartile of the tumor volume for {treatment} is: {round(upperq,2)}")
    print(f"The interquartile range of the tumor volume for {treatment} is: {round(iqr,2)}")
    print(f"Values below {round(lower_bound,2)} could be outliers.")
    print(f"Values above {round(upper_bound,2)} could be outliers.")

    # Determine outliers using upper and lower bounds
    outliers = regimen_rows[(final_vols < lower_bound) | (final_vols > upper_bound)]
    if outliers.empty:
        print(f"No potential outliers found for {treatment}.\n")
    else:
        print(f"Potential outliers for {treatment}:\n{outliers}\n")

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [None]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin


In [None]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen


## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
