In [1]:
%matplotlib notebook

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random

In [3]:
mouse_metadata ="mouse_metadata.csv"
study_results ="study_results.csv"

mouse_metadata = pd.read_csv(mouse_metadata)
study_results = pd.read_csv(study_results)

In [4]:
data_df = pd.merge(study_results, mouse_metadata, how = "left", on="Mouse ID")
data_df.head()

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,b128,0,45.0,0,Capomulin,Female,9,22
1,f932,0,45.0,0,Ketapril,Male,15,29
2,g107,0,45.0,0,Ketapril,Female,2,29
3,a457,0,45.0,0,Ketapril,Female,11,30
4,c819,0,45.0,0,Ketapril,Male,21,25


In [5]:
mean = data_df.groupby('Drug Regimen').mean()["Tumor Volume (mm3)"]
median = data_df.groupby('Drug Regimen').median()["Tumor Volume (mm3)"]
variance = data_df.groupby('Drug Regimen').var()["Tumor Volume (mm3)"]
standard_deviation = data_df.groupby('Drug Regimen').std()["Tumor Volume (mm3)"]
sem = data_df.groupby('Drug Regimen').sem()["Tumor Volume (mm3)"]

data = { 
        'Mean': mean,
        'Median': median,
        'Variance': variance,
        'Standard Deviation': standard_deviation,
        'SEM': sem,
}  
                                          
                                          
summary_table = pd.DataFrame(data)

summary_table        
                                          

Unnamed: 0_level_0,Mean,Median,Variance,Standard Deviation,SEM
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,40.675741,41.557809,24.947764,4.994774,0.329346
Ceftamin,52.591172,51.776157,39.290177,6.268188,0.469821
Infubinol,52.884795,51.820584,43.128684,6.567243,0.492236
Ketapril,55.235638,53.698743,68.553577,8.279709,0.60386
Naftisol,54.331565,52.509285,66.173479,8.134708,0.596466
Placebo,54.033581,52.288934,61.168083,7.821003,0.581331
Propriva,52.322552,50.854632,42.35107,6.50777,0.512884
Ramicane,40.216745,40.673236,23.486704,4.846308,0.320955
Stelasyn,54.233149,52.431737,59.450562,7.710419,0.573111
Zoniferol,53.236507,51.818479,48.533355,6.966589,0.516398


In [6]:
grouped_df = pd.DataFrame(data_df.groupby(["Drug Regimen"]).count()).reset_index()
regimen_datapoints = grouped_df [["Drug Regimen", "Mouse ID"]]
regimen_datapoints = regimen_datapoints.rename(columns={"Mouse ID":"Count"})
regimen_datapoints = regimen_datapoints.set_index("Drug Regimen")

In [7]:
regimen_datapoints.plot(kind="bar", figsize=(5,3))

plt.title("Count Per Drug Regimen")
plt.ylabel("Count")
plt.legend(loc="best")

plt.show()
plt.tight_layout()

<IPython.core.display.Javascript object>

In [8]:
drug_regimens = summary_table.index.tolist()
drug_regimens

['Capomulin',
 'Ceftamin',
 'Infubinol',
 'Ketapril',
 'Naftisol',
 'Placebo',
 'Propriva',
 'Ramicane',
 'Stelasyn',
 'Zoniferol']

In [9]:
regimen_count = (data_df.groupby(["Drug Regimen"])["Age_months"].count()).tolist()
regimen_count

[230, 178, 178, 188, 186, 181, 161, 228, 181, 182]

In [10]:
x_axis = np.arange(len(regimen_count))

In [11]:
x_axis = drug_regimens

plt.figure(figsize = (10,3))
plt.bar(x_axis, regimen_count, color ='b', alpha=0.5, align="edge")
plt.tight_layout()

<IPython.core.display.Javascript object>

In [12]:
tick_locations = [value for value in x_axis]
plt.xticks(tick_locations, drug_regimens)

([<matplotlib.axis.XTick at 0x20ea55f2a30>,
  <matplotlib.axis.XTick at 0x20ea55f2a00>,
  <matplotlib.axis.XTick at 0x20ea5628ca0>,
  <matplotlib.axis.XTick at 0x20ea56371c0>,
  <matplotlib.axis.XTick at 0x20ea56376d0>,
  <matplotlib.axis.XTick at 0x20ea5637be0>,
  <matplotlib.axis.XTick at 0x20ea5637850>,
  <matplotlib.axis.XTick at 0x20ea563d130>,
  <matplotlib.axis.XTick at 0x20ea563d640>,
  <matplotlib.axis.XTick at 0x20ea563db50>],
 [Text(0.0, 0, 'Capomulin'),
  Text(1.0, 0, 'Ceftamin'),
  Text(2.0, 0, 'Infubinol'),
  Text(3.0, 0, 'Ketapril'),
  Text(4.0, 0, 'Naftisol'),
  Text(5.0, 0, 'Placebo'),
  Text(6.0, 0, 'Propriva'),
  Text(7.0, 0, 'Ramicane'),
  Text(8.0, 0, 'Stelasyn'),
  Text(9.0, 0, 'Zoniferol')])

In [13]:
gender_df = pd.DataFrame(data_df.groupby(["Sex"]).count()).reset_index()
gender_df.head()

Unnamed: 0,Sex,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Age_months,Weight (g)
0,Female,935,935,935,935,935,935,935
1,Male,958,958,958,958,958,958,958


In [14]:
gender_df = gender_df[["Sex", "Mouse ID"]]
gender_df = gender_df.rename(columns={"Mouse ID":"Count"})
gender_df.head()

Unnamed: 0,Sex,Count
0,Female,935
1,Male,958


In [15]:
plt.figure(figsize=(8,4))
ax1 = plt.subplot(121, aspect='equal')
gender_df.plot(kind='pie', y = "Count", ax=ax1, autopct='%1.1f%%',startangle=90, shadow=False, labels=gender_df['Sex'], legend=False, fontsize=16)

<IPython.core.display.Javascript object>

<AxesSubplot:ylabel='Count'>

In [16]:
gender_count = (data_df.groupby(["Sex"])["Age_months"].count()).tolist()
gender_count

[935, 958]

In [17]:
labels =["Females", "Males"]
colors = ["Pink", "blue"]
explode = (0.1, 0)

In [18]:
plt.pie(gender_count,explode=explode, labels=labels, colors=colors, autopct="%1.1f%%", shadow=True, startangle =140)

# plt.savefig("../Images/PyPies.png")
# plt.show()

([<matplotlib.patches.Wedge at 0x20ea56eb040>,
  <matplotlib.patches.Wedge at 0x20ea56eba00>],
 [Text(-0.7887477729166414, -0.904365496201087, 'Females'),
  Text(0.7230187918402547, 0.8290017048509963, 'Males')],
 [Text(-0.46010286753470747, -0.527546539450634, '49.4%'),
  Text(0.3943738864583208, 0.4521827481005434, '50.6%')])

In [19]:
plt.axis("equal")

(-1.1874943525713364,
 1.1105268056883157,
 -1.1938032889323704,
 1.112116577938631)

In [20]:
data_df.head()

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,b128,0,45.0,0,Capomulin,Female,9,22
1,f932,0,45.0,0,Ketapril,Male,15,29
2,g107,0,45.0,0,Ketapril,Female,2,29
3,a457,0,45.0,0,Ketapril,Female,11,30
4,c819,0,45.0,0,Ketapril,Male,21,25


In [21]:
new_df = data_df.drop_duplicates()


In [22]:
sort_df = new_df.sort_values(["Drug Regimen", "Mouse ID", "Timepoint"], ascending=True)

max_df = sort_df.loc[sort_df["Timepoint"] == 45]
max_df.head().reset_index()

Unnamed: 0,index,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,1764,b128,45,38.982878,2,Capomulin,Female,9,22
1,1829,b742,45,38.939633,0,Capomulin,Male,7,21
2,1777,g288,45,37.074024,1,Capomulin,Male,3,19
3,1791,g316,45,40.15922,2,Capomulin,Female,22,22
4,1878,i557,45,47.685963,1,Capomulin,Female,1,24


In [23]:
cap_data_df = max_df[max_df["Drug Regimen"].isin(["Capomulin"])]
cap_data_df.head().reset_index()

Unnamed: 0,index,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,1764,b128,45,38.982878,2,Capomulin,Female,9,22
1,1829,b742,45,38.939633,0,Capomulin,Male,7,21
2,1777,g288,45,37.074024,1,Capomulin,Male,3,19
3,1791,g316,45,40.15922,2,Capomulin,Female,22,22
4,1878,i557,45,47.685963,1,Capomulin,Female,1,24


In [24]:
cap_obj = cap_data_df.drop_duplicates()

In [25]:
cap_obj = cap_data_df.sort_values(["Tumor Volume (mm3)"],ascending=True).reset_index()
cap_obj = cap_obj["Tumor Volume (mm3)"]
cap_obj

0     23.343598
1     28.430964
2     28.484033
3     31.023923
4     31.896238
5     32.377357
6     33.329098
7     34.455298
8     36.041047
9     37.074024
10    37.311846
11    38.125164
12    38.846876
13    38.939633
14    38.982878
15    40.159220
16    40.658124
17    40.728578
18    41.483008
19    41.581521
20    47.685963
Name: Tumor Volume (mm3), dtype: float64

In [26]:
quartiles = cap_obj.quantile([.25,.5,.75])
upperq = quartiles[0.75]
lowerq = quartiles[0.25]
iqr = upperq - lowerq

In [27]:
print(f"The lower quartile of temprature is: {lowerq}")
print(f"The upper quartile of temprature is: {upperq}")
print(f"The interquartile range of temprature is: {iqr}")
print(f"The median of temprature is: {quartiles[0.5]}")

lower_bound = lowerq - (1.5*iqr)
upper_bound = upperq + (1.5*iqr)

print(f"Values below {lower_bound} could be outliers.")
print(f"Values above {upper_bound} could be outliers.")



The lower quartile of temprature is: 32.37735684
The upper quartile of temprature is: 40.1592203
The interquartile range of temprature is: 7.781863460000004
The median of temprature is: 37.31184577
Values below 20.70456164999999 could be outliers.
Values above 51.83201549 could be outliers.


In [28]:
fig1, ax1 = plt.subplots()
ax1.set_title("Final Tumor Volume in Capomulin Regimen")
ax1.set_ylabel("Final Tumor Volume(mm3)")
ax1.boxplot(cap_obj)
plt.show()

<IPython.core.display.Javascript object>

In [29]:
ram_df = cap_data_df = max_df[max_df["Drug Regimen"].isin(["Ramicane"])]
ram_df.head().reset_index()

Unnamed: 0,index,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,1773,a411,45,38.407618,1,Ramicane,Male,3,22
1,1836,a444,45,43.047543,0,Ramicane,Female,10,25
2,1857,a520,45,38.810366,1,Ramicane,Male,13,21
3,1879,a644,45,32.978522,1,Ramicane,Female,7,17
4,1822,c758,45,33.397653,1,Ramicane,Male,9,17


In [30]:
ram_obj = ram_df.sort_values(["Tumor Volume (mm3)"], ascending=True).reset_index()
ram_obj["Tumor Volume (mm3)"]

0     22.050126
1     29.128472
2     30.276232
3     30.564625
4     30.638696
5     31.095335
6     31.560470
7     32.978522
8     33.397653
9     33.562402
10    36.134852
11    36.374510
12    37.225650
13    37.311236
14    38.407618
15    38.810366
16    40.659006
17    40.667713
18    43.047543
19    45.220869
Name: Tumor Volume (mm3), dtype: float64

In [31]:
quartiles = cap_obj.quantile([.25,.5,.75])
upperq = quartiles[0.75]
lowerq = quartiles[0.25]
iqr = upperq - lowerq

In [32]:
print(f"The lower quartile of temprature is: {lowerq}")
print(f"The upper quartile of temprature is: {upperq}")
print(f"The interquartile range of temprature is: {iqr}")
print(f"The median of temprature is: {quartiles[0.5]}")

lower_bound = lowerq - (1.5*iqr)
upper_bound = upperq + (1.5*iqr)

print(f"Values below {lower_bound} could be outliers.")
print(f"Values above {upper_bound} could be outliers.")


The lower quartile of temprature is: 32.37735684
The upper quartile of temprature is: 40.1592203
The interquartile range of temprature is: 7.781863460000004
The median of temprature is: 37.31184577
Values below 20.70456164999999 could be outliers.
Values above 51.83201549 could be outliers.


In [33]:
fig1, ax1 = plt.subplots()
ax1.set_title("Final Tumor Volume in Ramicane Regimen")
ax1.set_ylabel("Final Tumor Volume(mm3)")
ax1.boxplot(cap_obj)
plt.show()

<IPython.core.display.Javascript object>

In [34]:
infu_df = max_df[max_df["Drug Regimen"].isin(["Infubinol"])]
infu_df.head().reset_index()

Unnamed: 0,index,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,1804,a203,45,67.973419,2,Infubinol,Female,20,23
1,1786,a251,45,65.525743,1,Infubinol,Female,21,25
2,1794,a685,45,66.083066,3,Infubinol,Male,8,30
3,1781,c139,45,72.226731,2,Infubinol,Male,11,28
4,1802,e476,45,62.435404,1,Infubinol,Male,23,26


In [35]:
infu_obj = infu_df.sort_values(["Tumor Volume (mm3)"], ascending=True).reset_index()
infu_obj = infu_obj["Tumor Volume (mm3)"]
infu_obj

0    60.918767
1    62.435404
2    62.754451
3    65.525743
4    66.083066
5    66.196912
6    67.685569
7    67.973419
8    72.226731
Name: Tumor Volume (mm3), dtype: float64

In [36]:
quartiles = infu_obj.quantile([.25,.5,.75])
upperq = quartiles[0.75]
lowerq = quartiles[0.25]
iqr = upperq - lowerq

In [37]:
print(f"The lower quartile of temprature is: {lowerq}")
print(f"The upper quartile of temprature is: {upperq}")
print(f"The interquartile range of temprature is: {iqr}")
print(f"The median of temprature is: {quartiles[0.5]}")

lower_bound = lowerq - (1.5*iqr)
upper_bound = upperq + (1.5*iqr)

print(f"Values below {lower_bound} could be outliers.")
print(f"Values above {upper_bound} could be outliers.")


The lower quartile of temprature is: 62.75445141
The upper quartile of temprature is: 67.68556862
The interquartile range of temprature is: 4.9311172099999965
The median of temprature is: 66.08306589
Values below 55.35777559500001 could be outliers.
Values above 75.08224443499999 could be outliers.


In [38]:
fig1, ax1 = plt.subplots()
ax1.set_title("Final Tumor Volume in Ramicane Regimen")
ax1.set_ylabel("Final Tumor Volume(mm3)")
ax1.boxplot(cap_obj)
plt.show()

<IPython.core.display.Javascript object>

In [39]:
cert_df = max_df[max_df["Drug Regimen"].isin(["Ceftamin"])]
cert_df.head().reset_index()

Unnamed: 0,index,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,1856,a275,45,62.999356,3,Ceftamin,Female,20,28
1,1772,j296,45,61.849023,3,Ceftamin,Female,24,30
2,1858,k210,45,68.923185,3,Ceftamin,Male,15,28
3,1810,l471,45,67.748662,1,Ceftamin,Female,7,28
4,1765,l661,45,59.851956,3,Ceftamin,Male,18,26


In [40]:
cert_obj = cert_df.sort_values(["Tumor Volume (mm3)"], ascending=True).reset_index()
cert_obj = cert_obj["Tumor Volume (mm3)"]
cert_obj

0     59.741901
1     59.851956
2     61.386660
3     61.433892
4     61.849023
5     62.999356
6     64.299830
7     64.634949
8     64.729837
9     67.527482
10    67.748662
11    68.594745
12    68.923185
Name: Tumor Volume (mm3), dtype: float64

In [41]:
quartiles = cert_obj.quantile([.25,.5,.75])
upperq = quartiles[0.75]
lowerq = quartiles[0.25]
iqr = upperq - lowerq

In [42]:
print(f"The lower quartile of temprature is: {lowerq}")
print(f"The upper quartile of temprature is: {upperq}")
print(f"The interquartile range of temprature is: {iqr}")
print(f"The median of temprature is: {quartiles[0.5]}")

lower_bound = lowerq - (1.5*iqr)
upper_bound = upperq + (1.5*iqr)

print(f"Values below {lower_bound} could be outliers.")
print(f"Values above {upper_bound} could be outliers.")

The lower quartile of temprature is: 61.43389223
The upper quartile of temprature is: 67.52748237
The interquartile range of temprature is: 6.093590140000003
The median of temprature is: 64.29983003
Values below 52.29350701999999 could be outliers.
Values above 76.66786758 could be outliers.


In [43]:
fig1, ax1 = plt.subplots()
ax1.set_title("Final Tumor Volume in Ramicane Regimen")
ax1.set_ylabel("Final Tumor Volume(mm3)")
ax1.boxplot(cap_obj)
plt.show()

<IPython.core.display.Javascript object>

In [44]:
capomulin_df = new_df.loc[new_df["Drug Regimen"] == "Capomulin"]
caponulin_df = capomulin_df.reset_index()
capomulin_df.head()

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,b128,0,45.0,0,Capomulin,Female,9,22
226,j246,0,45.0,0,Capomulin,Female,21,21
227,r554,0,45.0,0,Capomulin,Female,8,17
228,s185,0,45.0,0,Capomulin,Female,3,17
229,b742,0,45.0,0,Capomulin,Male,7,21


In [45]:
cap_mouse = capomulin_df.loc[capomulin_df["Mouse ID"] =="s185"]
cap_mouse

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
228,s185,0,45.0,0,Capomulin,Female,3,17
447,s185,5,43.878496,0,Capomulin,Female,3,17
671,s185,10,37.614948,0,Capomulin,Female,3,17
714,s185,15,38.177232,0,Capomulin,Female,3,17
968,s185,20,36.866876,0,Capomulin,Female,3,17
1196,s185,25,33.94994,0,Capomulin,Female,3,17
1377,s185,30,32.959671,1,Capomulin,Female,3,17
1479,s185,35,28.328531,1,Capomulin,Female,3,17
1652,s185,40,25.472143,1,Capomulin,Female,3,17
1849,s185,45,23.343598,1,Capomulin,Female,3,17


In [48]:
cap_mouse = cap_mouse.loc[:, ["Timepoint", "Tumor Volume (mm3)"]]
cap_mouse = cap_mouse.reset_index(drop=True)
cap_mouse.set_index("Timepoint").plot(figsize=(5,4), linewidth=2.5, color="blue")

<IPython.core.display.Javascript object>

<AxesSubplot:xlabel='Timepoint'>

In [49]:
capomulin_df.head()

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,b128,0,45.0,0,Capomulin,Female,9,22
226,j246,0,45.0,0,Capomulin,Female,21,21
227,r554,0,45.0,0,Capomulin,Female,8,17
228,s185,0,45.0,0,Capomulin,Female,3,17
229,b742,0,45.0,0,Capomulin,Male,7,21


In [50]:
weight_df = capomulin_df.loc[:, ["Mouse ID", "Weight (g)", "Tumor Volume (mm3)"]]
weight_df.head()

Unnamed: 0,Mouse ID,Weight (g),Tumor Volume (mm3)
0,b128,22,45.0
226,j246,21,45.0
227,r554,17,45.0
228,s185,17,45.0
229,b742,21,45.0


In [51]:
avg_capo = pd.DataFrame(weight_df.groupby(["Mouse ID", "Weight (g)"])["Tumor Volume (mm3)"].mean()).reset_index()
avg_capo.head()

Unnamed: 0,Mouse ID,Weight (g),Tumor Volume (mm3)
0,b128,22,41.963636
1,b742,21,40.083699
2,f966,17,36.505973
3,g288,19,41.990097
4,g316,22,44.613344


In [52]:
avg_capo = avg_capo.rename(columns={"Tumor Volume (mm3)": "Average Volume"})
avg_capo.head()

Unnamed: 0,Mouse ID,Weight (g),Average Volume
0,b128,22,41.963636
1,b742,21,40.083699
2,f966,17,36.505973
3,g288,19,41.990097
4,g316,22,44.613344


In [53]:
avg_capo.plot(kind="scatter", x="Weight (g)", y="Average Volume", grid=True, figsize=(6,4), title="Weight vs Average Tumor Volume")
plt.show()

<IPython.core.display.Javascript object>

In [54]:
plt.clf()
plt.cla()
plt.close()

In [55]:
mouse_weight=avg_capo.iloc[:,0]
avg_tumor_volume = avg_capo.iloc[:,1]

correlation =(mouse_weight,avg_tumor_volume)
# print(f"The correlation between the two factors is{round(correlation[0],2)}")

In [56]:
from scipy.stats import linregress

x_values = avg_capo["Weight (g)"]
y_values = avg_capo["Average Volume"]
(slope, intercept, rvalues,pvalue, stderr) = linregress(x_values, y_values)
line_eq = "y =" + "x + " + str(round(intercept,2))
plt.scatter(x_values, y_values, marker="o",facecolors="red", edgecolors="black", alpha=0.75)
plt.plot(x_values, "r-")
plt.annotate(line_eq,(6,10), fontsize=15, color="red")
plt.xlabel("Mouse Weight")
plt.ylabel("Average Tumor Volume")
plt.scatter(x_values, y_values)


<matplotlib.collections.PathCollection at 0x20ea9577c10>

In [57]:
plt.ylim(0, 1)

(0.0, 1.0)

In [58]:
plt.xlim(0, 100)

(0.0, 100.0)

In [59]:
plt.show()