## Observations and Insights 

In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Locations of the two study CSV files
mouse_metadata_path, study_results_path = (
    "data/Mouse_metadata.csv",
    "data/Study_results.csv",
)

# Load each CSV into its own DataFrame (metadata first, then results)
mouse_metadata, study_results = (
    pd.read_csv(csv_path) for csv_path in (mouse_metadata_path, study_results_path)
)

# Combine the data into a single dataset

# Display the data table for preview


In [2]:

# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Aliases for the two raw DataFrames
mouse_metadata_df = mouse_metadata
study_results_df = study_results

# Combine the data into a single dataset (outer join on "Mouse ID")
merge_df = pd.merge(mouse_metadata, study_results, how="outer", on="Mouse ID")

# Number of distinct mice before cleaning.
# len() of the column counts measurement rows, not mice, so count unique IDs
# (the same way the cleaned count further down is computed).
number_of_mice_unclean = merge_df["Mouse ID"].nunique()

# One-row DataFrame holding the raw number of mice
number_of_mice_unclean_df = pd.DataFrame({
    "Unclean Number of Mice": [number_of_mice_unclean]
})

# Mouse IDs that appear more than once for the same "Mouse ID"/"Timepoint" pair
duplicate_ids = merge_df.loc[
    merge_df.duplicated(subset=["Mouse ID", "Timepoint"]), "Mouse ID"
].unique()

# All rows for the duplicated mouse ID(s), derived from duplicate_ids instead
# of a hard-coded ID so the inspection follows the data
is_duplicate_mouse = merge_df["Mouse ID"].isin(duplicate_ids)
duplicate_mouse_df = merge_df.loc[is_duplicate_mouse, :]

# Create a clean DataFrame by dropping every row of the duplicated mouse ID(s)
clean_mouse = merge_df[~is_duplicate_mouse]

# Number of distinct mice after cleaning
number_of_mice_clean = clean_mouse["Mouse ID"].nunique()

# Alias used by the later cells
clean_mouse_df = clean_mouse
clean_mouse_df.head()


## Summary Statistics

In [7]:
# Generate a summary statistics table of mean, median, variance, standard deviation,
# and SEM of the tumor volume for each drug regimen.
#
# The cleaned data (clean_mouse_df) is used so the duplicated mouse that was
# dropped earlier does not contribute to the statistics.
# The tumor-volume column is selected BEFORE aggregating: calling .mean()/.var()/...
# on the whole grouped frame also tries to aggregate the string columns, which
# raises a TypeError on pandas >= 2.0.
regimen_groups = clean_mouse_df.groupby("Drug Regimen")
tumor_by_regimen = regimen_groups["Tumor Volume (mm3)"]

# Build the table in one step (no column-slice followed by assignment, which
# triggers SettingWithCopyWarning). "Mouse ID" holds the number of rows recorded
# per regimen and is read by the bar-plot cells.
summary_statistics = pd.DataFrame({
    "Mouse ID": regimen_groups["Mouse ID"].count(),
    "Mean": tumor_by_regimen.mean(),
    "Median": tumor_by_regimen.median(),
    "Variance": tumor_by_regimen.var(),
    "Standard Deviation": tumor_by_regimen.std(),
    "SEM": tumor_by_regimen.sem(),
})

# Format the statistics for display (4 decimal places, thousands separator).
# Note: after this step these columns hold strings, not floats.
for stat_column in ["Mean", "Median", "Variance", "Standard Deviation", "SEM"]:
    summary_statistics[stat_column] = summary_statistics[stat_column].map("{:,.4f}".format)

summary_statistics




In [8]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Using the aggregation method, produce the same summary statistics in a single line


## Bar and Pie Charts

In [9]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pandas.

# Per-regimen row counts of every column, with "Drug Regimen" as a regular column
summarystats_df = pd.DataFrame(merge_df.groupby(["Drug Regimen"]).count()).reset_index()

# Isolate the regimen and mouse ID columns
regimen_data = merge_df[["Drug Regimen","Mouse ID"]]

# Data for the bar plot: the index ("Drug Regimen") becomes the x-axis labels.
# Relies on `summary_statistics` built in the Summary Statistics cell.
datapoints = summary_statistics[['Mouse ID']]
datapoints = datapoints.rename(columns = {"Mouse ID" : "Trial"})

# Draw the bar chart
datapoints.plot(kind="bar", figsize=(10,6), color = "b", legend=True)

plt.title("Trials per Drug Regimen")
plt.ylabel("Trial Count")

# tight_layout() must run before show(): once show() has rendered the figure,
# a later tight_layout() no longer affects what was displayed.
plt.tight_layout()
plt.show()




In [10]:
# Bar plot (pyplot): total number of measurements taken on each drug regimen.
# Title: "Trials per Drug Regimen"

# One bar per row of `summary_statistics`; the index supplies the x tick labels
datapoints = summary_statistics[["Mouse ID"]]
trial_counts = datapoints["Mouse ID"]
x_axis = np.arange(len(datapoints))
tick_locations = list(x_axis)

# Figure size, bars and vertical tick labels
plt.figure(figsize=(10,6))
plt.bar(x_axis, trial_counts, color="b", width=.5)
plt.xticks(tick_locations, datapoints.index.values, rotation="vertical")

# Half a bar of padding on each side; 10 units of headroom above the tallest bar
plt.xlim(-.5, len(x_axis) - .5)
plt.ylim(0, trial_counts.max() + 10)
datapoints = datapoints.rename(columns={"Mouse ID": "Trial"})

# Title and axis labels
plt.title("Trials per Drug Regimen")
plt.xlabel("Drug Regimen")
plt.ylabel("Trial Count")

# Display the chart
plt.show()


In [11]:
# Pie plot (pandas): distribution of female versus male mice.

# Number of rows recorded for each value of "Sex" in the cleaned data
gender = clean_mouse_df.groupby('Sex')
gender_qty = gender["Sex"].count()

# Draw the pie on the first slot of a 1x2 grid, with an equal aspect ratio
pie_figure = plt.figure(figsize=(10,6))
ax1 = pie_figure.add_subplot(121, aspect='equal')

# Keyword options for the pandas pie plot, passed through unchanged
pie_options = dict(
    y="Quantity",
    ax=ax1,
    autopct='%1.1f%%',
    startangle=186,
    shadow=False,
    legend=False,
    fontsize=12,
)
gender_qty.plot(kind='pie', **pie_options)



In [12]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
# Title: "Distribution of Male vs Female Mice"

# Number of rows per sex in the cleaned data, largest count first
gender = clean_mouse_df["Sex"].value_counts()

# Labels for the pie sections, taken from the counted data so each wedge is
# always labelled with the sex it actually represents. (A hard-coded
# ["Male", "Female"] list is only correct while males outnumber females.)
labels = gender.index.tolist()

# Colors of the sections, applied in wedge order
colors = ["darkorange", "blue"]

# Counts as a plain array
counts = np.array(gender)

# Pull the first wedge slightly out of the pie
explode = (0.1, 0)

# Create the pie chart from the data above
plt.pie(gender, explode = explode, labels=labels, colors=colors, autopct="%1.1f%%", 
        shadow = True, startangle = 364)

# Equal axes so the pie is drawn as a circle
plt.axis("equal")
plt.title("Distribution of Male vs Female Mice")

# Display pie plot
plt.show()


## Quartiles, Outliers and Boxplots

In [13]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Define DataFrames of Capomulin, Ramicane, Infubinol and Ceftamin
capomulin2_df = clean_mouse_df.loc[clean_mouse_df["Drug Regimen"] == "Capomulin",:]
ramicane_df = clean_mouse_df.loc[clean_mouse_df["Drug Regimen"] == "Ramicane",:]
infubinol_df = clean_mouse_df.loc[clean_mouse_df["Drug Regimen"] == "Infubinol",:]
ceftamin_df = clean_mouse_df.loc[clean_mouse_df["Drug Regimen"] == "Ceftamin", :]

# Ramicane - last (max) timepoint recorded for each mouse.
# This step was missing, so `ramicane_vol` was undefined when merged below.
ramicane_max = ramicane_df.groupby("Mouse ID")["Timepoint"].max()
ramicane_vol = pd.DataFrame(ramicane_max)

# Merge back onto the cleaned data to pick up the row at each mouse's last timepoint
ramicane_merge = pd.merge(ramicane_vol, clean_mouse_df, on=["Mouse ID", "Timepoint"], how="left")

# Ramicane - final tumor volume of each mouse
tumor_ramicane = ramicane_merge["Tumor Volume (mm3)"]

# Define quartiles, IQR, lowerq and upperq
quartiles = tumor_ramicane.quantile([.25,.5,.75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq-lowerq


print(f"For Ramicane regimen, please note the following:")
print(f"The upper quartile of tumor volume is: {upperq}.")
print(f"The lower quartile of tumor volume is: {lowerq}.")
print(f"The interquartile range of tumor volme is: {iqr}.")
print(f"The median of tumor volume is: {quartiles[0.5]}.")

# Outlier bounds: 1.5 * IQR beyond the lower and upper quartiles
lower_bound = lowerq - (1.5*iqr)
upper_bound = upperq + (1.5*iqr)
print(f"Values above {upper_bound} could be outliers.") 
print(f"Values below {lower_bound} could be outliers.")



In [14]:
def final_tumor_volumes(regimen_df, measurements_df):
    """Return the measurement row at each mouse's last recorded timepoint.

    Parameters
    ----------
    regimen_df : pandas.DataFrame
        Rows of a single drug regimen; must contain "Mouse ID" and "Timepoint".
    measurements_df : pandas.DataFrame
        Data to look the final rows up in; must contain "Mouse ID" and "Timepoint".

    Returns
    -------
    pandas.DataFrame
        Left merge of the per-mouse max timepoint with ``measurements_df``
        on ("Mouse ID", "Timepoint").
    """
    # Max timepoint per mouse, as a one-column frame indexed by "Mouse ID"
    last_timepoints = pd.DataFrame(regimen_df.groupby("Mouse ID")["Timepoint"].max())
    return pd.merge(last_timepoints, measurements_df, on=["Mouse ID", "Timepoint"], how="left")


def report_tumor_quartiles(regimen_name, tumor_volumes):
    """Print quartiles, IQR, median and outlier bounds of ``tumor_volumes``.

    Parameters
    ----------
    regimen_name : str
        Name inserted into the first printed line.
    tumor_volumes : pandas.Series
        Final tumor volumes to summarize.

    Returns
    -------
    tuple
        ``(lower_bound, upper_bound)``: the quartiles moved outwards by 1.5 * IQR.
    """
    quartiles = tumor_volumes.quantile([.25,.5,.75])
    lowerq = quartiles[0.25]
    upperq = quartiles[0.75]
    iqr = upperq-lowerq

    print(f"For {regimen_name} regimen, please note the following:")
    print(f"The upper quartile of tumor volume is: {upperq}.")
    print(f"The lower quartile of tumor volume is: {lowerq}.")
    print(f"The interquartile range of tumor volme is: {iqr}.")
    print(f"The median of tumor volume is: {quartiles[0.5]}.")

    lower_bound = lowerq - (1.5*iqr)
    upper_bound = upperq + (1.5*iqr)
    print(f"Values above {upper_bound} could be outliers.") 
    print(f"Values below {lower_bound} could be outliers.")
    return lower_bound, upper_bound


# Capomulin - final tumor volume of each mouse and its quartile report
capomulin_merge = final_tumor_volumes(capomulin2_df, clean_mouse_df)
tumor_capomulin = capomulin_merge["Tumor Volume (mm3)"]
capomulin_bounds = report_tumor_quartiles("Capomulin", tumor_capomulin)

# Infubinol - final tumor volume of each mouse and its quartile report
infubinol_merge = final_tumor_volumes(infubinol_df, clean_mouse_df)
tumor_infubinol = infubinol_merge["Tumor Volume (mm3)"]
infubinol_bounds = report_tumor_quartiles("Infubinol", tumor_infubinol)

# Ceftamin - final tumor volume of each mouse and its quartile report
ceftamin_merge = final_tumor_volumes(ceftamin_df, clean_mouse_df)
tumor_ceftamin = ceftamin_merge["Tumor Volume (mm3)"]
ceftamin_bounds = report_tumor_quartiles("Ceftamin", tumor_ceftamin)


    

In [15]:
# Box plot of the final tumor volume of each mouse across the four regimens of interest

# Series to plot and their tick labels, in matching order
regimen_labels = ["Ramicane", "Capomulin", "Infubinol", "Ceftamin"]
data_to_plot = [tumor_ramicane, tumor_capomulin, tumor_infubinol, tumor_ceftamin]

# Marker style for outlier points: large markers with a red face
outlier_dot = {"markerfacecolor": "red", "markersize": 12}

fig1, ax1 = plt.subplots()
ax1.boxplot(data_to_plot, labels=regimen_labels, flierprops=outlier_dot)
ax1.set(
    title='Tumor per Drug Regimen',
    xlabel='Drug Regimen',
    ylabel='Final Tumor Volume (mm3)',
)

# Write the figure to a file named 'boxplot', then display it
fig1.savefig('boxplot')
plt.show()



## Line and Scatter Plots

In [16]:
# Line plot of tumor volume vs. time point for one mouse (s185) treated with Capomulin

# Capomulin rows of the cleaned data; reset_index() keeps the old index as a column
is_capomulin = clean_mouse["Drug Regimen"] == "Capomulin"
capomulin_df = clean_mouse.loc[is_capomulin].reset_index()

# Rows of mouse s185, reduced to the two plotted columns and re-indexed from 0
is_s185 = capomulin_df["Mouse ID"] == "s185"
capomulin_s185_df = (
    capomulin_df.loc[is_s185]
    .loc[:, ["Timepoint", "Tumor Volume (mm3)"]]
    .reset_index(drop=True)
)

# Plot tumor volume against time point
s185_by_timepoint = capomulin_s185_df.set_index('Timepoint')
s185_by_timepoint.plot(figsize=(14,10), linewidth=2.5, color='blue')
plt.ylabel('Tumor Volume (mm3)')
plt.title("Time Point vs Tumor Volume of Mouse s185 with Capomulin Treatment")



In [17]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen

# Columns needed from the Capomulin data (`capomulin_df` comes from the line-plot cell)
capomulin_filtered_df = capomulin_df.loc[:,["Mouse ID", "Weight (g)", "Tumor Volume (mm3)"]]

# One row per mouse: mean of "Weight (g)" and "Tumor Volume (mm3)" over its rows.
# This averaging and the plot itself were missing from the cell.
capomulin_avg_df = capomulin_filtered_df.groupby("Mouse ID").mean()

# Scatter plot: mouse weight on x, average tumor volume on y
scatter_fig, scatter_ax = plt.subplots(figsize=(10,6))
scatter_ax.scatter(
    capomulin_avg_df["Weight (g)"],
    capomulin_avg_df["Tumor Volume (mm3)"],
    color="blue",
)
scatter_ax.set_title("Mouse Weight vs Average Tumor Volume (Capomulin)")
scatter_ax.set_xlabel("Weight (g)")
scatter_ax.set_ylabel("Average Tumor Volume (mm3)")
plt.show()



## Correlation and Regression

In [18]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
