## Observations and Insights 

In [1]:
 %matplotlib notebook

In [67]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset
combine_results_df = pd.merge(mouse_metadata, study_results, how='outer', on='Mouse ID')

# Display the data table for preview
combine_results_df.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [3]:
# Checking the number of mice.
combine_results_df["Mouse ID"].nunique()

249

In [4]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
duplicate_mice_df = combine_results_df[combine_results_df[["Mouse ID", "Timepoint"]].duplicated() == True]

duplicate_mice_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
909,g989,Propriva,Female,21,26,0,45.0,0
911,g989,Propriva,Female,21,26,5,47.570392,0
913,g989,Propriva,Female,21,26,10,49.880528,0
915,g989,Propriva,Female,21,26,15,53.44202,0
917,g989,Propriva,Female,21,26,20,54.65765,1


In [5]:
# Optional: Get all the data for the duplicate mouse ID. 
duplicate_mice_df = combine_results_df[combine_results_df[["Mouse ID", "Timepoint"]].duplicated(keep="last") == True]

duplicate_mice_df


Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
908,g989,Propriva,Female,21,26,0,45.0,0
910,g989,Propriva,Female,21,26,5,48.786801,0
912,g989,Propriva,Female,21,26,10,51.745156,0
914,g989,Propriva,Female,21,26,15,51.325852,1
916,g989,Propriva,Female,21,26,20,55.326122,1


In [6]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
cleaned_mouse_df = combine_results_df[combine_results_df["Mouse ID"] != 'g989']

In [7]:
# Checking the number of mice in the clean DataFrame.
cleaned_mouse_df["Mouse ID"].nunique()

248

## Summary Statistics

In [8]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# This method produces everything in a single groupby function
summary_stats_df = cleaned_mouse_df.groupby("Drug Regimen")

mean_tv = summary_stats_df["Tumor Volume (mm3)"].mean()

median_tv = summary_stats_df["Tumor Volume (mm3)"].median()

var_tv = summary_stats_df["Tumor Volume (mm3)"].var()

std_tv = summary_stats_df["Tumor Volume (mm3)"].std()

sem_tv = summary_stats_df["Tumor Volume (mm3)"].sem()

summary_stats_df = pd.DataFrame({"Mean Tumor Volume": mean_tv,
                                "Median Tumor Volume": median_tv,
                                "Variance in Tumor Volume": var_tv,
                                "Standard Deviation of Tumor Volume": std_tv,
                                "SEM of Tumor Volume":sem_tv})

summary_stats_df["Mean Tumor Volume"] = summary_stats_df["Mean Tumor Volume"].map("{0:,.2f}mm3".format)
summary_stats_df["Median Tumor Volume"] = summary_stats_df["Median Tumor Volume"].map("{0:,.2f}mm3".format)
summary_stats_df["Variance in Tumor Volume"] = summary_stats_df["Variance in Tumor Volume"].map("{0:,.2f}mm3".format)
summary_stats_df["Standard Deviation of Tumor Volume"] = summary_stats_df["Standard Deviation of Tumor Volume"].map("{0:,.2f}mm3".format)
summary_stats_df["SEM of Tumor Volume"] = summary_stats_df["SEM of Tumor Volume"].map("{0:,.2f}mm3".format)

summary_stats_df

Unnamed: 0_level_0,Mean Tumor Volume,Median Tumor Volume,Variance in Tumor Volume,Standard Deviation of Tumor Volume,SEM of Tumor Volume
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,40.68mm3,41.56mm3,24.95mm3,4.99mm3,0.33mm3
Ceftamin,52.59mm3,51.78mm3,39.29mm3,6.27mm3,0.47mm3
Infubinol,52.88mm3,51.82mm3,43.13mm3,6.57mm3,0.49mm3
Ketapril,55.24mm3,53.70mm3,68.55mm3,8.28mm3,0.60mm3
Naftisol,54.33mm3,52.51mm3,66.17mm3,8.13mm3,0.60mm3
Placebo,54.03mm3,52.29mm3,61.17mm3,7.82mm3,0.58mm3
Propriva,52.32mm3,50.45mm3,43.85mm3,6.62mm3,0.54mm3
Ramicane,40.22mm3,40.67mm3,23.49mm3,4.85mm3,0.32mm3
Stelasyn,54.23mm3,52.43mm3,59.45mm3,7.71mm3,0.57mm3
Zoniferol,53.24mm3,51.82mm3,48.53mm3,6.97mm3,0.52mm3


## Bar and Pie Charts

In [34]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pandas. 

drug_data = cleaned_mouse_df["Drug Regimen"].value_counts()

mouse_plot = drug_data.plot(kind="bar")

for index,value in enumerate(drug_data):
    mouse_plot.annotate(value,(index, value), xytext=(0,2),textcoords='offset points')


plt.title("Data Points per Drug Regimen")
plt.ylabel("Data Points")
plt.xlabel("Drug Regimen")

plt.tight_layout()

<IPython.core.display.Javascript object>

In [65]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pyplot.

drug_data = cleaned_mouse_df.sort_values("Drug Regimen")["Drug Regimen"].value_counts()

drug_names = cleaned_mouse_df.sort_values("Drug Regimen")["Drug Regimen"].unique()

#mouse_plot = plt.bar(list(drug_data.index), drug_data, align="center")


#plt.xticks(rotation=90)
#plt.title("Data Points per Drug Regimen")
#plt.ylabel("Data Points")
#plt.xlabel("Drug Regimen")

#plt.tight_layout()

#plt.show()
index= np.arange(len(drug_data.index))

width= 0.1
fig, ax = plt.subplots()
drugs1 = ax.bar(index - width/2, drug_data)

ax.set_ylabel("Data Points")
ax.set_xlabel("Drug Regimen")
ax.set_title("Data Points per Drug Regimen")
ax.set_xticks(index)
plt.xticks(rotation=90)
ax.set_xticklabels(drug_data.index)

def autolabel(drugs, xpos='center'):
    """
    Attach a text label above each bar in *rects*, displaying its height.

    *xpos* indicates which side to place the text w.r.t. the center of
    the bar. It can be one of the following {'center', 'right', 'left'}.
    """

    ha = {'center': 'center', 'right': 'left', 'left': 'right'}
    offset = {'center': 0, 'right': 1, 'left': -1}

    for drug in drugs:
        height = drug.get_height()
        ax.annotate('{}'.format(height),
                    xy=(drug.get_x() + drug.get_width() / 2, height),
                    xytext=(offset[xpos]*3, 3),  # use 3 points offset
                    textcoords="offset points",  # in both directions
                    ha=ha[xpos], va='bottom')


autolabel(drugs1)


plt.tight_layout()

<IPython.core.display.Javascript object>

In [32]:
# Generate a pie plot showing the distribution of female versus male mice using pandas
male_mouse = cleaned_mouse_df.loc[cleaned_mouse_df["Sex"]=="Male", "Mouse ID"].nunique()
female_mouse = cleaned_mouse_df.loc[cleaned_mouse_df["Sex"]=="Female", "Mouse ID"].nunique()

sex_dist_df = pd.DataFrame ({"Female vs Male": [female_mouse, male_mouse]},
                           index=["Female","Male"])

colors = ["red", "blue"]
plot = sex_dist_df.plot.pie(y="Female vs Male", figsize=(5,5), colors=colors, autopct="%1.1f%%")

plt.title("Sex Distribution Chart")


<IPython.core.display.Javascript object>

Text(0.5, 1.0, 'Sex Distribution Chart')

In [33]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
sex_dist = [cleaned_mouse_df.loc[cleaned_mouse_df["Sex"]=="Male", "Mouse ID"].nunique(), cleaned_mouse_df.loc[cleaned_mouse_df["Sex"]=="Female", "Mouse ID"].nunique()]
sex = ["Male", "Female"]
colors = ["blue", "red"]

plt.pie(sex_dist, labels=sex, colors=colors, autopct="%1.1f%%")

plt.title("Sex Distribution Chart")


<IPython.core.display.Javascript object>

Text(0.5, 1.0, 'Sex Distribution Chart')

## Quartiles, Outliers and Boxplots

In [78]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse
time_df = cleaned_mouse_df.loc[(cleaned_mouse_df["Timepoint"].max())]

time_df = time_df.set_index("Timepoint")
                               
time_df.head()                               
# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


AttributeError: 'Series' object has no attribute 'set_index'

In [None]:
# Put treatments into a list for for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [None]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin


In [None]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen


## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
