## Dependencies and starter code

In [1]:
%matplotlib inline

In [2]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

# Study data files
mouse_metadata = "data/Mouse_metadata.csv"
study_results = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata)
study_results = pd.read_csv(study_results)

In [3]:
# Combine the data into a single dataset
combined_results_data = pd.merge(mouse_metadata, study_results, on="Mouse ID")

##rename columns for ease of use
combined_results_data = combined_results_data.rename(columns={"Drug Regimen":"Drug",
                                                              "Age_months":"Age (months)",
                                                              "Tumor Volume (mm3)":"Tumor Volume",})
combined_results_data.head()

Unnamed: 0,Mouse ID,Drug,Sex,Age (months),Weight (g),Timepoint,Tumor Volume,Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


## Summary statistics

In [4]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and 
#SEM of the tumor volume for each regimen

# Store the Mean Tumor Volume Data Grouped by Drug and Timepoint 
meanTumorVol = combined_results_data.groupby(["Drug","Timepoint"], as_index=False)["Tumor Volume"].mean()

# Convert to DataFrame
meanTumorVol_df = pd.DataFrame(data = meanTumorVol)
meanTumorVol_df = meanTumorVol_df.rename(columns={"Tumor Volume": "Mean Tumor Volume"})

# Preview DataFrame
meanTumorVol_df[0:12]

Unnamed: 0,Drug,Timepoint,Mean Tumor Volume
0,Capomulin,0,45.0
1,Capomulin,5,44.266086
2,Capomulin,10,43.084291
3,Capomulin,15,42.064317
4,Capomulin,20,40.716325
5,Capomulin,25,39.939528
6,Capomulin,30,38.769339
7,Capomulin,35,37.816839
8,Capomulin,40,36.958001
9,Capomulin,45,36.236114


In [11]:
# Store the median Tumor Volume Data Grouped by Drug and Timepoint 
medianTumorVol = combined_results_data.groupby(["Drug","Timepoint"], as_index=False)["Tumor Volume"].median()

# Convert to DataFrame
medianTumorVol_df = pd.DataFrame(data = medianTumorVol)
medianTumorVol_df = medianTumorVol_df.rename(columns={"Tumor Volume": "Median Tumor Volume"})

# Preview DataFrame
medianTumorVol_df[0:12]

Unnamed: 0,Drug,Timepoint,Median Tumor Volume
0,Capomulin,0,45.0
1,Capomulin,5,45.597064
2,Capomulin,10,43.421014
3,Capomulin,15,42.79816
4,Capomulin,20,40.716428
5,Capomulin,25,40.224165
6,Capomulin,30,39.260371
7,Capomulin,35,38.360455
8,Capomulin,40,36.843898
9,Capomulin,45,37.311846


In [5]:
# Store the variance Tumor Volume Data Grouped by Drug and Timepoint 
variTumorVol = combined_results_data.groupby(["Drug","Timepoint"], as_index=False)["Tumor Volume"].mean()

# Convert to DataFrame
variTumorVol_df = pd.DataFrame(data = variTumorVol)
variTumorVol_df = variTumorVol_df.rename(columns={"Tumor Volume": "variance Tumor Volume"})

# Preview DataFrame
variTumorVol_df[0:12]

Unnamed: 0,Drug,Timepoint,variance Tumor Volume
0,Capomulin,0,45.0
1,Capomulin,5,44.266086
2,Capomulin,10,43.084291
3,Capomulin,15,42.064317
4,Capomulin,20,40.716325
5,Capomulin,25,39.939528
6,Capomulin,30,38.769339
7,Capomulin,35,37.816839
8,Capomulin,40,36.958001
9,Capomulin,45,36.236114


In [8]:
semTumorVol = combined_results_data.groupby(["Drug","Timepoint"])["Tumor Volume"].sem().reset_index()

# Convert to DataFrame
semTumorVol_df = pd.DataFrame(data = semTumorVol)

# Preview DataFrame
semTumorVol_df.head()

Unnamed: 0,Drug,Timepoint,Tumor Volume
0,Capomulin,0,0.0
1,Capomulin,5,0.448593
2,Capomulin,10,0.702684
3,Capomulin,15,0.838617
4,Capomulin,20,0.909731


## Bar plots

In [12]:
# Generate a bar plot showing number of data points for each treatment regimen using pandas
#see 02 from lesson 2
# Filter the DataFrame down only to those columns to chart
drug_and_volume = combined_results_data[["Timepoint","Tumor Volume"]]

# Set the index to be "Drug" so they will be used as labels
drug_and_volume = drug_and_volume.set_index("Drug")

drug_and_volume.head()

KeyError: "None of ['Drug'] are in the columns"

In [10]:
# Use DataFrame.plot() in order to create a bar chart of the data
drug_and_volume.plot(kind="bar", figsize=(20,3))

# Set a title for the chart
plt.title("Tumor Volume by Drug Regimen")

plt.show()
plt.tight_layout()

<IPython.core.display.Javascript object>

In [None]:
# Generate a bar plot showing number of data points for each treatment regimen using pyplot
#see 07 from lesson 1 of matplotlib



## Pie plots

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
#see 09 from lesson 1 of matplotlib

## Quartiles, outliers and boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the most promising treatment regimens. 
#Calculate the IQR and quantitatively determine if there are any potential outliers. 
#see lesson 3

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest

## Line and scatter plots

In [None]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin

In [None]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen

In [None]:
# Calculate the correlation coefficient and linear regression model for mouse weight and average tumor 
#volume for the Capomulin regimen