In [1]:
%matplotlib notebook

In [2]:
# Dependencies
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

In [3]:
# Files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

In [4]:
# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

mouse_metadata.head()
study_results.head()

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites
0,b128,0,45.0,0
1,f932,0,45.0,0
2,g107,0,45.0,0
3,a457,0,45.0,0
4,c819,0,45.0,0


In [5]:
# Combine the data into a single dataset
pymaceuticals_data = pd.merge(mouse_metadata, study_results, on=["Mouse ID", "Mouse ID"])

# Display the data table for preview
pymaceuticals_data

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [6]:
 # Checking the number of mice.
number_of_mice = len(pymaceuticals_data["Mouse ID"])
number_of_mice

1893

In [7]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
duplicate_mice = pymaceuticals_data.duplicated(subset=['Mouse ID', 'Timepoint'])
duplicate_mice

0       False
1       False
2       False
3       False
4       False
        ...  
1888    False
1889    False
1890    False
1891    False
1892    False
Length: 1893, dtype: bool

In [8]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
pymaceuticals_clean = pymaceuticals_data.drop_duplicates(subset=['Mouse ID', 'Timepoint'],keep='first',inplace=False)
pymaceuticals_clean

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [9]:
 # Checking the number of mice in the clean DataFrame.
total_mice = len(pymaceuticals_clean["Mouse ID"])
total_mice

1888

# Summary Statistics

In [10]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# This method is the most straighforward, creating multiple series and putting them all together at the end.
grouped = pymaceuticals_clean.groupby('Drug Regimen')
summary_table = round(grouped.agg(['mean','median','var','std','sem'])["Tumor Volume (mm3)"], 2)
summary_table

Unnamed: 0_level_0,mean,median,var,std,sem
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,40.68,41.56,24.95,4.99,0.33
Ceftamin,52.59,51.78,39.29,6.27,0.47
Infubinol,52.88,51.82,43.13,6.57,0.49
Ketapril,55.24,53.7,68.55,8.28,0.6
Naftisol,54.33,52.51,66.17,8.13,0.6
Placebo,54.03,52.29,61.17,7.82,0.58
Propriva,52.39,50.91,43.14,6.57,0.53
Ramicane,40.22,40.67,23.49,4.85,0.32
Stelasyn,54.23,52.43,59.45,7.71,0.57
Zoniferol,53.24,51.82,48.53,6.97,0.52


# Bar & Pie Charts

In [11]:
regimen = pymaceuticals_clean.groupby(['Drug Regimen']).count()['Mouse ID']
regimen

Drug Regimen
Capomulin    230
Ceftamin     178
Infubinol    178
Ketapril     188
Naftisol     186
Placebo      181
Propriva     156
Ramicane     228
Stelasyn     181
Zoniferol    182
Name: Mouse ID, dtype: int64

In [12]:
x_axis = np.arange(len(regimen)) 

In [13]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pandas.
# tick locations horizontal?
plt.bar(x_axis, regimen, color='pink', alpha=1, align='center')

tick_locations = [value for value in x_axis]
plt.xticks(tick_locations, ['Capomulin', 'Ceftamin', 'Infubinol', 'Ketapril', 'Naftisol', 'Placebo', 'Propriva', 'Ramicane', 'Stelasyn', 'Zoniferol'], rotation='vertical')


plt.title("Total Mice Treated")
plt.xlabel("Drug Regimen")
plt.ylabel("Number of Mice")

plt.tight_layout()
plt.savefig("Total Mice Bar Plot.png")
plt.show()

<IPython.core.display.Javascript object>

In [14]:
# Generate a pie plot showing the distribution of female versus male mice using pandas
gender = pymaceuticals_clean.groupby(["Mouse ID","Sex"])
gender

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fb26765e9a0>

In [15]:
gender_df = pd.DataFrame(gender.size())
gender_df

Unnamed: 0_level_0,Unnamed: 1_level_0,0
Mouse ID,Sex,Unnamed: 2_level_1
a203,Female,10
a251,Female,10
a262,Female,10
a275,Female,10
a366,Female,7
...,...,...
z435,Female,3
z578,Male,10
z581,Female,10
z795,Female,10


In [16]:
gender_data = pd.DataFrame(gender_df.groupby(["Sex"]).count())
gender_data.columns = ["Total Count"]
gender_data

Unnamed: 0_level_0,Total Count
Sex,Unnamed: 1_level_1
Female,124
Male,125


In [17]:
gender_data["Gender by Percentage"] = round(100*(gender_data["Total Count"]/gender_data["Total Count"].sum()), 2)
gender_data

Unnamed: 0_level_0,Total Count,Gender by Percentage
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,124,49.8
Male,125,50.2


In [18]:
labels = ["Female", "Male"]
colors = ["pink", "lightblue"]
explode = (0.1, 0)

In [19]:
plot = gender_data.plot.pie(y='Total Count', explode=explode, labels=labels, colors=colors, autopct="%1.1f%%", shadow=True, startangle=140)
plt.show()
plt.savefig("Mice Male vs. Female Piechart.png")

<IPython.core.display.Javascript object>

# Pie Plot With Python

In [20]:
gender_dataframe = pd.DataFrame(pymaceuticals_clean.groupby(["Sex"]).count()).reset_index()
gender_dataframe.head()

Unnamed: 0,Sex,Mouse ID,Drug Regimen,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,Female,930,930,930,930,930,930,930
1,Male,958,958,958,958,958,958,958


In [21]:
gender_dataframe = gender_dataframe[["Sex","Mouse ID"]]
gender_dataframe = gender_dataframe.rename(columns={"Mouse ID": "Total Count"})
gender_dataframe.head()

Unnamed: 0,Sex,Total Count
0,Female,930
1,Male,958


In [22]:
plt.figure(figsize=(10,6))
ax1 = plt.subplot(121, aspect='equal')
gender_dataframe.plot(kind='pie', y = "Total Count", ax=ax1, autopct='%1.1f%%', 
startangle=90, shadow=False, labels=gender_dataframe['Sex'], legend = False)

<IPython.core.display.Javascript object>

<AxesSubplot:ylabel='Total Count'>

#  Quartiles, Outliers and Boxplots

In [23]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin
regimens_df = pymaceuticals_clean.loc[(pymaceuticals_clean["Drug Regimen"] == "Capomulin") | (pymaceuticals_clean["Drug Regimen"] == "Ramicane") | 
                                      (pymaceuticals_clean["Drug Regimen"] == "Infubinol") | (pymaceuticals_clean["Drug Regimen"] == "Ceftamin")]

# Start by getting the last (greatest) timepoint for each mouse
regimens_df = regimens_df.groupby('Mouse ID').max()['Timepoint']
regimens_df.head()

# Merge this group df with the original dataframe to get the tumor volume at the last timepoint
#Come back and round
tumor_df = pd.merge(pymaceuticals_data, regimens_df, on=("Mouse ID","Timepoint"),how="right")
tumor_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,a203,Infubinol,Female,20,23,45,67.973419,2
1,a251,Infubinol,Female,21,25,45,65.525743,1
2,a275,Ceftamin,Female,20,28,45,62.999356,3
3,a411,Ramicane,Male,3,22,45,38.407618,1
4,a444,Ramicane,Female,10,25,45,43.047543,0
...,...,...,...,...,...,...,...,...
95,y769,Ceftamin,Female,6,27,45,68.594745,4
96,y793,Capomulin,Male,17,17,45,31.896238,2
97,y865,Ceftamin,Male,23,26,45,64.729837,3
98,z578,Ramicane,Male,11,16,45,30.638696,0


In [24]:
# Put treatments into a list for for loop (and later for plot labels)
treatments = ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']

# Create empty list to fill with tumor vol data (for plotting)
tumor_vol_data = []

# Calculate the IQR and quantitatively determine if there are any potential outliers. 
# Locate the rows which contain mice on each drug and get the tumor volumes 
    # add subset 
    # Determine outliers using upper and lower bounds
    
    
for drug in treatments:
    tumor_vol = tumor_df.loc[tumor_df['Drug Regimen']==drug, 'Tumor Volume (mm3)']
    tumor_vol_data.append(tumor_vol)
    print(drug)
    quantiles = tumor_vol.quantile([.25, .5, .75])
    lowerq = quantiles[0.25]
    upperq = quantiles[0.75]
    iqr = upperq - lowerq
    
    lower_bound = lowerq - (1.5*iqr)
    upper_bound = upperq + (1.5*iqr)
    
    print(f"The lower quartile of is: {lowerq}")
    print(f"The upper quartile of is: {upperq}")
    print(f"The interquartile range is: {iqr}")
    print(f"The the median is: {quantiles[0.5]}")
    
    print(f"Values below {lower_bound} could be outliers.")
    print(f"Values above {upper_bound} could be outliers.")
    
    print(f"Outliers are: {tumor_vol.loc[(tumor_vol<lower_bound) | (tumor_vol>upper_bound)]}")
    print()
    print()
    print()

Capomulin
The lower quartile of is: 32.37735684
The upper quartile of is: 40.1592203
The interquartile range is: 7.781863460000004
The the median is: 38.125164399999996
Values below 20.70456164999999 could be outliers.
Values above 51.83201549 could be outliers.
Outliers are: Series([], Name: Tumor Volume (mm3), dtype: float64)



Ramicane
The lower quartile of is: 31.56046955
The upper quartile of is: 40.65900627
The interquartile range is: 9.098536719999998
The the median is: 36.56165229
Values below 17.912664470000003 could be outliers.
Values above 54.30681135 could be outliers.
Outliers are: Series([], Name: Tumor Volume (mm3), dtype: float64)



Infubinol
The lower quartile of is: 54.04860769
The upper quartile of is: 65.52574285
The interquartile range is: 11.477135160000003
The the median is: 60.16518046
Values below 36.83290494999999 could be outliers.
Values above 82.74144559000001 could be outliers.
Outliers are: 15    36.321346
Name: Tumor Volume (mm3), dtype: float64



Ce

In [25]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest
fig1, ax1 = plt.subplots()
ax1.set_title('Final Tumor Volume of Each Mouse')
ax1.set_ylabel('Tumor Vol')
ax1.boxplot(tumor_vol_data, labels=treatments)
plt.show()
plt.savefig("Final Tumor Volume of Each Mouse - Box Plots.png")

<IPython.core.display.Javascript object>

# Line and Scatter Plots

In [26]:
 # Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin
    # Locate Capomulin and Mouse ID and put into new DF
vol_data = tumor_df.loc[tumor_df["Drug Regimen"] == "Capomulin"]
vol_data = vol_data.reset_index()

mouse_df = vol_data.loc[vol_data["Mouse ID"] == "g288"]
mouse_df

Unnamed: 0,index,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
3,25,g288,Capomulin,Male,3,19,45,37.074024,1


In [30]:
#Locate the time point and tumor volume
mouse_df = mouse_df.loc[:, ["Timepoint", "Tumor Volume (mm3)"]]
mouse_df = mouse_df.reset_index(drop=True)

# Generate line plot
mouse_df.set_index('Timepoint').plot(figsize=(10, 8), linewidth=3, color='blue')
plt.savefig("Time Point VS. Tumor Volume - Line Plot.png")

<IPython.core.display.Javascript object>

In [37]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen
vol_data_reduced = vol_data.loc[:, ["Mouse ID", "Weight (g)", "Tumor Volume (mm3)"]]
vol_data_reduced

# Calculate average tumor volume for capomulin
avg_tumor_cap = pd.DataFrame(vol_data_reduced.groupby(["Mouse ID", "Weight (g)"])["Tumor Volume (mm3)"].mean()).reset_index()
avg_tumor_cap = avg_tumor_cap.rename(columns={"Tumor Volume (mm3)": "Average Volume"})
avg_tumor_cap = avg_tumor_cap.set_index('Mouse ID')
avg_tumor_cap.head()

Unnamed: 0_level_0,Weight (g),Average Volume
Mouse ID,Unnamed: 1_level_1,Unnamed: 2_level_1
b128,22,38.982878
b742,21,38.939633
f966,17,30.485985
g288,19,37.074024
g316,22,40.15922


In [39]:
# Plot the scatter plot
avg_tumor_cap.plot(kind="scatter", x="Weight (g)", y="Average Volume",figsize=(4,4),
              title="Mouse Weight VS. Average Tumor Volume (Capomulin Regimen)", marker="o")
plt.show()
plt.savefig("Mouse Weight VS. Average Tumor Volume (Capomulin Regimen) - Scatter Plot.png")

<IPython.core.display.Javascript object>

 # Correlation and Regression

In [46]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
weight = avg_tumor_cap.iloc[:,0]
tumor_vol = avg_tumor_cap.iloc[:,1]
correlation = st.pearsonr(weight,tumor_vol)
print(f"The Capomulin regimen has a strong positive correlation coefficient of: {correlation[0]}")

The Capomulin regimen has a strong positive correlation coefficient of: 0.8767060403658116


In [51]:
x_values = avg_tumor_cap['Weight (g)']
y_values = avg_tumor_cap['Average Volume']
(slope, intercept, rvalue, pvalue, stderr) = st.linregress(x_values, y_values)
regress_values = x_values * slope + intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))
plt.scatter(x_values,y_values)
plt.plot(x_values,regress_values,"r-")
plt.annotate(line_eq,(6,10),fontsize=15,color="red")
plt.xlabel('Mouse Weight')
plt.ylabel('Average Tumor Volume')
plt.show()