# Pymaceuticals Inc.
---

### Analysis
* Your analysis here ...

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy.stats as st
import random

from scipy.stats import linregress

# Seed Python's built-in random module
random.seed(69)

# Locations of the two input CSV files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load both CSV files into DataFrames
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Left-join the mouse metadata onto the study results by "Mouse ID", so every
# study-results row is kept and gains the metadata columns of its mouse
combined_df = study_results.merge(mouse_metadata, how="left", on="Mouse ID")

# Preview the first rows of the combined table
combined_df.head()

In [None]:
# Number of distinct Mouse ID values in the combined data.
# dropna=False counts a missing ID as its own value, matching len(unique()).
combined_df["Mouse ID"].nunique(dropna=False)

In [None]:
# Find the rows whose (Mouse ID, Timepoint) pair repeats an earlier row.
# The subset is required: duplicated() with no arguments only flags rows that
# are identical in *every* column, so a mouse measured twice at the same
# timepoint with different values would go undetected.
duped = combined_df[combined_df.duplicated(subset=["Mouse ID", "Timepoint"])]
duped[["Mouse ID", "Timepoint"]]

In [None]:
# Optional: Get all the data for the duplicate mouse ID(s).
# Every flagged ID is collected (not only the one in the first flagged row),
# and an empty `duped` simply produces an empty selection instead of raising
# IndexError on .iloc[0].
duped_ids = duped["Mouse ID"].unique()

all_duped_id = combined_df.loc[combined_df["Mouse ID"].isin(duped_ids)]
all_duped_id

In [None]:
# Build the clean DataFrame: keep only the rows whose index label is not one
# of the labels belonging to the duplicated mouse's rows.
is_duplicate_row = combined_df.index.isin(all_duped_id.index)
clean_df = combined_df.loc[~is_duplicate_row]
clean_df.head()

In [None]:
# Number of distinct Mouse ID values left after removing the duplicate mouse.
# dropna=False counts a missing ID as its own value, matching len(unique()).
clean_df["Mouse ID"].nunique(dropna=False)

## Summary Statistics

In [None]:
# Summary statistics of the tumor volume for each drug regimen: mean, median,
# variance, standard deviation and standard error of the mean (SEM), each
# computed with its own groupby method and then assembled into one table.

# One group per drug regimen (this grouped object is reused by later cells)
drug_grouped_df = clean_df.groupby(clean_df["Drug Regimen"])

# Select the tumor-volume column once so every statistic reads from the same
# per-regimen values
tumor_by_drug = drug_grouped_df["Tumor Volume (mm3)"]

# Each statistic is a Series indexed by regimen; the dict keys become the
# column headers of the summary table
drug_tumor_df = pd.DataFrame({
    "Mean Tumor Volume": tumor_by_drug.mean(),
    "Median Tumor Volume": tumor_by_drug.median(),
    "Tumor Volume Variance": tumor_by_drug.var(),
    "Tumor Volume Std. Dev.": tumor_by_drug.std(),
    "Tumor Volume Std. Err.": tumor_by_drug.sem(),
})

drug_tumor_df

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Using the aggregation method, produce the same summary statistics in a single line.
# The tumor-volume column is selected *before* aggregating: applying
# mean/var/std/sem to every column would also hit the non-numeric columns
# (e.g. "Mouse ID", "Sex"), which raises a TypeError on pandas 2.x and is
# wasted work on older versions.
drug_agg_df = drug_grouped_df[["Tumor Volume (mm3)"]].agg(['mean', 'median', 'var', 'std', 'sem'])

# Drop the outer column level so the columns are just the statistic names
drug_tumor_agg_df = drug_agg_df["Tumor Volume (mm3)"]
drug_tumor_agg_df

## Bar and Pie Charts

In [None]:
# Bar chart, drawn through pandas plotting, of how often each drug regimen
# appears in clean_df.
# NOTE(review): value_counts() tallies rows of clean_df per regimen, while the
# axis label below says "Unique Mice" - confirm which quantity is intended.
drug_trial_count = clean_df["Drug Regimen"].value_counts()

bar_ax = drug_trial_count.plot.bar()

bar_ax.set_ylabel("Number of Unique Mice Tested")
bar_ax.set_xlabel("Drug Regimen")

plt.show()

In [None]:
# Generate a bar plot showing the total number of unique mice tested on each drug regimen using pyplot.
# NOTE(review): value_counts() tallies rows of clean_df per regimen, while the
# axis label below says "Unique Mice" - confirm which quantity is intended.
drug_trial_count = clean_df["Drug Regimen"].value_counts()

# One x position per bar actually being drawn
x_axis = np.arange(len(drug_trial_count))

plt.bar(x_axis, drug_trial_count.values)

# Tick labels must come from the same Series as the bar heights:
# value_counts() orders regimens by descending count, so labelling the bars
# with an alphabetically ordered index would attach the wrong drug names.
plt.xticks(x_axis, drug_trial_count.index, rotation='vertical')

plt.ylabel("Number of Unique Mice Tested")
plt.xlabel("Drug Regimen")

plt.show()

In [None]:
# Pie chart, drawn through pandas plotting, of how the rows of clean_df split
# between the values of the "Sex" column
sex_count = clean_df["Sex"].value_counts()

# autopct prints each wedge's share as a percentage with one decimal place
pie_ax = sex_count.plot.pie(autopct='%1.1f%%')

pie_ax.set_ylabel("Sex")

plt.show()

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
sex_count = clean_df["Sex"].value_counts()

# Labels are taken from sex_count itself so each wedge is named after the
# value it actually counts. Series.unique() returns values in order of first
# appearance, which need not match value_counts()' descending-count order and
# could swap the labels.
plt.pie(sex_count, labels=sex_count.index, autopct="%1.1f%%")

plt.ylabel("Sex")

plt.show()

## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse.
# reset_index() turns the result into a two-column frame ("Mouse ID",
# "Timepoint") so the merge below joins on ordinary columns instead of
# relying on an index name being matched as a key.
max_time_df = clean_df.groupby("Mouse ID")["Timepoint"].max().reset_index()

# Merge this frame with the original dataframe to get the tumor volume at the
# last timepoint. The right join keeps exactly one (mouse, last timepoint) key
# per mouse and pulls in the matching measurement row from clean_df.
merge_df = pd.merge(clean_df, max_time_df, on=["Mouse ID", "Timepoint"], how="right")
merge_df.head()

In [None]:
# Put treatments into a list for the for loop (and later for plot labels)
treatment_list = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

# Filled with one Series of final tumor volumes per treatment (for plotting)
tumor_vol_list = []

# Calculate the IQR and quantitatively determine if there are any potential outliers.
for drug in treatment_list:

    # Final tumor volumes of the mice on this drug. merge_df already holds
    # only each mouse's last-timepoint row, so no further timepoint filter is
    # needed.
    final_volumes = merge_df.loc[merge_df["Drug Regimen"] == drug, "Tumor Volume (mm3)"]

    # add subset
    tumor_vol_list.append(final_volumes)

    # Determine outliers using upper and lower bounds (1.5 * IQR rule)
    quartiles = final_volumes.quantile([.25, .75])
    lowerq = quartiles[0.25]
    upperq = quartiles[0.75]
    iqr = upperq - lowerq
    upperbound = upperq + (1.5*iqr)
    lowerbound = lowerq - (1.5*iqr)

    # The mask is built from the same Series it selects from; values exactly
    # on a bound are reported as potential outliers (inclusive comparison).
    is_outlier = (final_volumes >= upperbound) | (final_volumes <= lowerbound)
    outliers = final_volumes.loc[is_outlier]

    # Report this drug's outliers, followed by a blank separator line
    print(f"{drug}'s potential outliers: {outliers}")
    print()

In [None]:
# Box plot of the final tumor volumes, one box per regimen of interest

# Marker style for the outlier points: large circles filled red
outlier_style = {"marker": 'o', "markerfacecolor": 'r', "markersize": 12}

# One box per entry of tumor_vol_list, labelled with the matching entry of
# treatment_list
fig1, ax1 = plt.subplots()
ax1.boxplot(tumor_vol_list, flierprops=outlier_style)

ax1.set_xticklabels(treatment_list)
ax1.set_ylabel('Final Tumor Volume (mm3)')

plt.show()

## Line and Scatter Plots

In [None]:
# Line plot of tumor volume against timepoint for a single Capomulin mouse

# Keep only the Capomulin rows (reused by later cells)
is_capomulin = clean_df["Drug Regimen"] == "Capomulin"
capomulin_only = clean_df.loc[is_capomulin]

# Draw one row with a fixed random_state (so the same row is chosen on every
# run) and read its mouse ID
random_mouse_id = capomulin_only.sample(n=1, random_state=1)["Mouse ID"].iloc[0]

# All rows of the chosen mouse, and the two relevant columns
selected_mouse = capomulin_only.loc[capomulin_only["Mouse ID"] == random_mouse_id]
mouse_time = selected_mouse["Timepoint"]
mouse_tum_vol = selected_mouse["Tumor Volume (mm3)"]

# Plot
treatment_line = plt.plot(mouse_time, mouse_tum_vol)
plt.title(f"Capomulin treatment of mouse {random_mouse_id}")
plt.xlabel("Timepoint (days)")
plt.ylabel("Tumor Volume (mm3)")

plt.show()

In [None]:
# Scatter plot of average tumor volume against mouse weight, Capomulin only

# Per-mouse means of the two plotted columns (both reused by the regression
# cell); each result is indexed by Mouse ID
capomulin_by_mouse = capomulin_only.groupby("Mouse ID")
cap_mean_mice_weight = capomulin_by_mouse["Weight (g)"].mean()
cap_mean_tum_vol = capomulin_by_mouse["Tumor Volume (mm3)"].mean()

# One point per mouse: x = mean weight, y = mean tumor volume
plt.scatter(cap_mean_mice_weight, cap_mean_tum_vol)
plt.ylabel("Average Tumor Volume (mm3)")
plt.xlabel("Weight (g)")

plt.show()

## Correlation and Regression

In [None]:
# Correlation coefficient and least-squares line for mouse weight versus
# average tumor volume on the Capomulin regimen

# Regression inputs: per-mouse mean weight (x) and mean tumor volume (y)
x = cap_mean_mice_weight
y = cap_mean_tum_vol

# Fit the line; the result unpacks into these five values
regression = linregress(x, y)
slope, intercept, rvalue, pvalue, stderr = regression

# Fitted tumor volume at every observed weight
regress_values = intercept + slope * x

# Report the r value rounded to two decimals
print(f"The correlation between mouse weight(g) and the average tumor volume(mm3) is {round(rvalue, 2)}.")

# Scatter of the observations with the fitted line drawn on top in red
plt.scatter(cap_mean_mice_weight, cap_mean_tum_vol)
plt.plot(x, regress_values, "r-")

plt.ylabel("Average Tumor Volume (mm3)")
plt.xlabel("Weight (g)")

plt.show()