In [1]:
%matplotlib notebook
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as st
import numpy as np
mouse_df = pd.read_csv('./Resources/Mouse_metadata.csv')
result_df = pd.read_csv('./Resources/Study_results.csv')

In [2]:
mouse_results = pd.merge(result_df, mouse_df, on ="Mouse ID")
mouse_results

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,b128,0,45.000000,0,Capomulin,Female,9,22
1,b128,5,45.651331,0,Capomulin,Female,9,22
2,b128,10,43.270852,0,Capomulin,Female,9,22
3,b128,15,43.784893,0,Capomulin,Female,9,22
4,b128,20,42.731552,0,Capomulin,Female,9,22
...,...,...,...,...,...,...,...,...
1888,m601,25,33.118756,1,Capomulin,Male,22,17
1889,m601,30,31.758275,1,Capomulin,Male,22,17
1890,m601,35,30.834357,1,Capomulin,Male,22,17
1891,m601,40,31.378045,1,Capomulin,Male,22,17


In [3]:
unique1 = mouse_results.loc[mouse_results.duplicated(subset = ["Mouse ID", "Timepoint"]), "Mouse ID"].unique()
unique1

array(['g989'], dtype=object)

In [4]:
dropped_dupl = mouse_results[mouse_results["Mouse ID"] != "g989"]
dropped_dupl

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,b128,0,45.000000,0,Capomulin,Female,9,22
1,b128,5,45.651331,0,Capomulin,Female,9,22
2,b128,10,43.270852,0,Capomulin,Female,9,22
3,b128,15,43.784893,0,Capomulin,Female,9,22
4,b128,20,42.731552,0,Capomulin,Female,9,22
...,...,...,...,...,...,...,...,...
1888,m601,25,33.118756,1,Capomulin,Male,22,17
1889,m601,30,31.758275,1,Capomulin,Male,22,17
1890,m601,35,30.834357,1,Capomulin,Male,22,17
1891,m601,40,31.378045,1,Capomulin,Male,22,17


In [5]:
#Generate a summary statistics table consisting of the mean, median, variance, standard deviation, 
#and SEM of the tumor volume for each drug regimen.
drug_tumor = dropped_dupl[["Tumor Volume (mm3)", "Drug Regimen"]]
drug_df = drug_tumor.groupby(["Drug Regimen"])
mean_tumor = drug_df.mean()
mean_tumor = mean_tumor.rename(columns ={"Tumor Volume (mm3)": "Mean Tumor Volume (mm3)"})
mean_tumor

Unnamed: 0_level_0,Mean Tumor Volume (mm3)
Drug Regimen,Unnamed: 1_level_1
Capomulin,40.675741
Ceftamin,52.591172
Infubinol,52.884795
Ketapril,55.235638
Naftisol,54.331565
Placebo,54.033581
Propriva,52.32093
Ramicane,40.216745
Stelasyn,54.233149
Zoniferol,53.236507


In [6]:
median_tumor = drug_df.median()
median_tumor = median_tumor.rename(columns ={"Tumor Volume (mm3)": "Median Tumor Volume (mm3)"})
median_tumor

Unnamed: 0_level_0,Median Tumor Volume (mm3)
Drug Regimen,Unnamed: 1_level_1
Capomulin,41.557809
Ceftamin,51.776157
Infubinol,51.820584
Ketapril,53.698743
Naftisol,52.509285
Placebo,52.288934
Propriva,50.446266
Ramicane,40.673236
Stelasyn,52.431737
Zoniferol,51.818479


In [7]:
variance_tumor = drug_df.var()
variance_tumor = variance_tumor.rename(columns ={"Tumor Volume (mm3)": "Variance Tumor Volume (mm3)"})
variance_tumor

Unnamed: 0_level_0,Variance Tumor Volume (mm3)
Drug Regimen,Unnamed: 1_level_1
Capomulin,24.947764
Ceftamin,39.290177
Infubinol,43.128684
Ketapril,68.553577
Naftisol,66.173479
Placebo,61.168083
Propriva,43.852013
Ramicane,23.486704
Stelasyn,59.450562
Zoniferol,48.533355


In [8]:
std_tumor = drug_df.std()
std_tumor = std_tumor.rename(columns ={"Tumor Volume (mm3)": "Standard Deviation Tumor Volume (mm3)"})
std_tumor

Unnamed: 0_level_0,Standard Deviation Tumor Volume (mm3)
Drug Regimen,Unnamed: 1_level_1
Capomulin,4.994774
Ceftamin,6.268188
Infubinol,6.567243
Ketapril,8.279709
Naftisol,8.134708
Placebo,7.821003
Propriva,6.622085
Ramicane,4.846308
Stelasyn,7.710419
Zoniferol,6.966589


In [9]:
SEM_tumor = drug_df.sem()
SEM_tumor = SEM_tumor.rename(columns ={"Tumor Volume (mm3)": "SEM Tumor Volume (mm3)"})
SEM_tumor

Unnamed: 0_level_0,SEM Tumor Volume (mm3)
Drug Regimen,Unnamed: 1_level_1
Capomulin,0.329346
Ceftamin,0.469821
Infubinol,0.492236
Ketapril,0.60386
Naftisol,0.596466
Placebo,0.581331
Propriva,0.544332
Ramicane,0.320955
Stelasyn,0.573111
Zoniferol,0.516398


In [10]:
merged1 = pd.merge(std_tumor, SEM_tumor, on="Drug Regimen")
merged2 = pd.merge(median_tumor, merged1, on="Drug Regimen")
merged3 = pd.merge(mean_tumor, merged2, on= "Drug Regimen")
merged3

Unnamed: 0_level_0,Mean Tumor Volume (mm3),Median Tumor Volume (mm3),Standard Deviation Tumor Volume (mm3),SEM Tumor Volume (mm3)
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Capomulin,40.675741,41.557809,4.994774,0.329346
Ceftamin,52.591172,51.776157,6.268188,0.469821
Infubinol,52.884795,51.820584,6.567243,0.492236
Ketapril,55.235638,53.698743,8.279709,0.60386
Naftisol,54.331565,52.509285,8.134708,0.596466
Placebo,54.033581,52.288934,7.821003,0.581331
Propriva,52.32093,50.446266,6.622085,0.544332
Ramicane,40.216745,40.673236,4.846308,0.320955
Stelasyn,54.233149,52.431737,7.710419,0.573111
Zoniferol,53.236507,51.818479,6.966589,0.516398


In [43]:
#Generate a bar plot using both Pandas’s DataFrame.plot() and Matplotlib’s pyplot that shows the total number of measurements taken for each treatment regimen throughout the course of the study.
ID_group = dropped_dupl.groupby(["Drug Regimen"])
num_measurements = ID_group.count()
num_measurements = num_measurements.rename(columns = {"Timepoint": "Number of Measurements"})
num_measurements1 = num_measurements[["Number of Measurements"]]
measurement_chart = num_measurements1.plot(kind="bar", title = "Measurements for each speciment")

plt.show()
plt.tight_layout()

<IPython.core.display.Javascript object>

In [35]:
num_measurements

Unnamed: 0_level_0,Mouse ID,Number of Measurements,Tumor Volume (mm3),Metastatic Sites,Sex,Age_months,Weight (g)
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Capomulin,230,230,230,230,230,230,230
Ceftamin,178,178,178,178,178,178,178
Infubinol,178,178,178,178,178,178,178
Ketapril,188,188,188,188,188,188,188
Naftisol,186,186,186,186,186,186,186
Placebo,181,181,181,181,181,181,181
Propriva,148,148,148,148,148,148,148
Ramicane,228,228,228,228,228,228,228
Stelasyn,181,181,181,181,181,181,181
Zoniferol,182,182,182,182,182,182,182


In [47]:
num_measurements.reset_index(inplace=True)
num_measurements

Unnamed: 0,Drug Regimen,Mouse ID,Number of Measurements,Tumor Volume (mm3),Metastatic Sites,Sex,Age_months,Weight (g)
0,Capomulin,230,230,230,230,230,230,230
1,Ceftamin,178,178,178,178,178,178,178
2,Infubinol,178,178,178,178,178,178,178
3,Ketapril,188,188,188,188,188,188,188
4,Naftisol,186,186,186,186,186,186,186
5,Placebo,181,181,181,181,181,181,181
6,Propriva,148,148,148,148,148,148,148
7,Ramicane,228,228,228,228,228,228,228
8,Stelasyn,181,181,181,181,181,181,181
9,Zoniferol,182,182,182,182,182,182,182


In [49]:
%matplotlib notebook
drugs = num_measurements.iloc[:, 0]
num_mesur = num_measurements.iloc[:, 7]

plt.bar(drugs, num_mesur, color="b", align="center")
plt.tight_layout()

<IPython.core.display.Javascript object>

In [None]:
#Generate a pie plot using both Pandas’s DataFrame.plot() and Matplotlib’s pyplot that shows the distribution of female or male mice in the study.
%matplotlib notebook
gender_mouse = mouse_results.groupby(["Sex", "Mouse ID"])
gender = gender_mouse.count()
gender_count = gender.groupby(["Sex"]) 
counted = gender_count.count()
counted = counted[["Timepoint"]]
counted = counted.rename(columns = {"Timepoint": "Number of Mice"})
counted.reset_index ()


In [None]:
counted_pie = counted.plot(kind = "pie", y="Number of Mice", autopct='%1.1f%%', title= "Number by Gender")
plt.show()

In [None]:
%matplotlib notebook
sex=["Female", "Male"]
number = counted.iloc[:, 0]
explode = [0, 0]
chart2 = plt.pie(number, explode=explode, labels=sex, autopct="%1.1f%%", shadow=False, startangle=0)
plt.legend(loc='upper right')
plt.show()

In [None]:
#Calculate the final tumor volume of each mouse across four of the most promising treatment regimens: Capomulin, Ramicane, Infubinol, and Ceftamin.
#Calculate the quartiles and IQR and quantitatively determine if there are any potential outliers across all four treatment regimens.

#creating array with all the mouses
num_measurements.reset_index(inplace = True)
mouses= np.asarray(num_measurements["Mouse ID"])

final_tumor =[]
# per_mouse = dropped_dupl.loc[dropped_dupl["Mouse ID"] == "b128", :]
# max_timepoint = per_mouse["Timepoint"].max()
# final = per_mouse.loc[per_mouse["Timepoint"] == max_timepoint, :]
# final_tumor.append(final["Tumor Volume (mm3)"].item())
# per_mouse = dropped_dupl.loc[dropped_dupl["Mouse ID"] == "m601", :]
# max_timepoint = per_mouse["Timepoint"].max()
# final = per_mouse.loc[per_mouse["Timepoint"] == max_timepoint, :]
# final_tumor.append(final["Tumor Volume (mm3)"].item())
# final_tumor

for i in mouses:
    per_mouse = dropped_dupl.loc[dropped_dupl["Mouse ID"] == "i", :]
    max_timepoint = per_mouse["Timepoint"].max()
    final = per_mouse.loc[per_mouse["Timepoint"] == max_timepoint, :]
    tumors = final["Tumor Volume (mm3)"].item()
    flag = not np.any(final_tumor)
    if flag:
        final_tumor = np.asarray(final["Mouse ID"])
    else:
        final_tumor.append(tumors)
    per_mouse = per_mouse.iloc[0:0]
    final = final.iloc[0:0]
    max_timepoint = 0
    


In [None]:
#Using Matplotlib, generate a box and whisker plot of the final tumor volume for all four treatment regimens and highlight any potential outliers in the plot by changing their color and style.

In [13]:
%matplotlib notebook
#Select a mouse that was treated with Capomulin and generate a line plot of tumor volume vs. time point for that mouse.
b128 = dropped_dupl.loc[dropped_dupl["Mouse ID"]=="b128", :] 
b128 = b128[["Timepoint", "Tumor Volume (mm3)"]]
b128.plot(kind= "line", x = "Timepoint", title = "Mouse b128's Tumor over Time")

<IPython.core.display.Javascript object>

<AxesSubplot:title={'center':"Mouse b128's Tumor over Time"}, xlabel='Timepoint'>

In [14]:
#Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin treatment regimen.
drugs = dropped_dupl.loc[dropped_dupl["Drug Regimen"] == "Capomulin"]
weight_tumor = drugs.groupby(["Mouse ID"])
mean_tumor = weight_tumor.mean()
mean_tumor = mean_tumor.rename(columns={"Weight (g)": "Average Weight (g)", "Tumor Volume (mm3)": "Average Tumor Volume (mm3)"})
mean_tumor.plot(kind="scatter", x="Average Weight (g)", y="Average Tumor Volume (mm3)", grid=True, figsize=(8,8),
              title="Average Weight verus Tumor Volume")



<IPython.core.display.Javascript object>

<AxesSubplot:title={'center':'Average Weight verus Tumor Volume'}, xlabel='Average Weight (g)', ylabel='Average Tumor Volume (mm3)'>

In [15]:
#Calculate the correlation coefficient and linear regression model between mouse weight and average tumor volume for the Capomulin treatment. Plot the linear regression model on top of the previous scatter plot.
weight_a = mean_tumor.iloc[:, 4]
tumor_a = mean_tumor.iloc[:, 1]
correlation = st.pearsonr(weight_a, tumor_a)
print(f"The correlation between weight and tumor size is {round(correlation[0],2)}")

x_values = mean_tumor['Average Weight (g)']
y_values = mean_tumor['Average Tumor Volume (mm3)']
(slope, intercept, rvalue, pvalue, stderr) = st.linregress(x_values, y_values)
regress_values = x_values * slope + intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))
plt.scatter(x_values,y_values)
plt.plot(x_values,regress_values,"r-")
plt.annotate(line_eq,(6,10),fontsize=15,color="red")
plt.show

The correlation between weight and tumor size is 0.84


<function matplotlib.pyplot.show(block=None)>

In [None]:
#Look across all previously generated figures and tables and write at least three observations or inferences that can be made from the data. Include these observations at the top of notebook.