# Final Analysis

### Based on the data presented below, Capomulin appears to be the best choice of drug for treating tumors, as it was highly successful in reducing tumor sizes.

### The box plot for Infubinol shows one value that is much lower than the rest of its data; that measurement may need to be checked again for its accuracy. Considering that, of the 4 regimens that were box plotted, Infubinol seemed to have the highest tumor volumes, I believe that value is an outlier.

### I believe Ramicane may also make for a suitable drug regimen, as it seemed comparable to Capomulin based on the box plots below.

In [None]:
#Import Dependencies
import pandas as pd
import scipy.stats as st
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import linregress

In [None]:
# Load the mouse metadata and the study results, then join them per mouse

# Source CSV files
file1 = "Resources/Mouse_metadata.csv"
file2 = "Resources/Study_results.csv"

# Read both files
mouse_data, study_results = pd.read_csv(file1), pd.read_csv(file2)

# DataFrame handles used by the cells below
mouse_df = pd.DataFrame(mouse_data)
results_df = pd.DataFrame(study_results)

# Join on "Mouse ID" with the study results as the left-hand table
merged_df = results_df.merge(mouse_df, on="Mouse ID")

In [None]:
# Count the distinct mouse IDs present in the merged data
rats = merged_df["Mouse ID"].drop_duplicates().tolist()
len(rats)

In [None]:
# Find duplicate mouse data: rows that repeat an earlier (Mouse ID, Timepoint) pair
duplicate_rats = merged_df.loc[
    merged_df.duplicated(subset=["Mouse ID", "Timepoint"])
]

# Get all the data for every duplicated mouse. The IDs are taken from the
# duplicate rows found above rather than hard-coded, so the cell stays correct
# if the input files change.
duplicate_ids = duplicate_rats["Mouse ID"].unique()
all_duplicated_rat = merged_df.loc[merged_df["Mouse ID"].isin(duplicate_ids)]
all_duplicated_rat

In [None]:
# Create the clean DataFrame by dropping every mouse that has a repeated
# (Mouse ID, Timepoint) row. The IDs are derived from the data here instead of
# being hard-coded, so the cleaning step follows whatever the input contains.
duplicated_ids = merged_df.loc[
    merged_df.duplicated(subset=["Mouse ID", "Timepoint"]), "Mouse ID"
].unique()
cleaned_df = merged_df.loc[~merged_df["Mouse ID"].isin(duplicated_ids)]

In [None]:
# Re-count the distinct mouse IDs after the drop
cleaned_rats = pd.unique(cleaned_df["Mouse ID"]).tolist()
len(cleaned_rats)

In [None]:
# Per-regimen statistics of the tumor volume column
regimen_groupby = cleaned_df.groupby("Drug Regimen")[["Tumor Volume (mm3)"]]

# Full describe() table (count, mean, std, min, quartiles, max)
regimen_table = regimen_groupby.describe()

# Central tendency
regimen_mean, regimen_median = regimen_groupby.mean(), regimen_groupby.median()

# Spread
regimen_variance, regimen_deviation = regimen_groupby.var(), regimen_groupby.std()

# Standard error of the mean
regimen_standard = regimen_groupby.sem()

In [None]:
# Assemble the per-regimen statistics into one summary table.
# The table is built from the individual Series instead of wrapping
# regimen_mean and renaming/inserting in place: pd.DataFrame(existing_frame)
# is not guaranteed to copy, so in-place edits could leak back into
# regimen_mean.
tumor_col = "Tumor Volume (mm3)"
regimen_df = pd.DataFrame({
    "Mean (mm3)": regimen_mean[tumor_col],
    "Median (mm3)": regimen_median[tumor_col],
    "Variance (mm3)": regimen_variance[tumor_col],
    "Standard Deviation (mm3)": regimen_deviation[tumor_col],
    "Standard Error of Measure (mm3)": regimen_standard[tumor_col],
})
regimen_df

In [None]:
# Build the same summary table with a single aggregation
regimen_sum = cleaned_df.groupby("Drug Regimen")

# Select the tumor volume column BEFORE aggregating. Aggregating every column
# first also tries to take the mean of text columns, which raises a TypeError
# on pandas >= 2.0.
summary_table = regimen_sum[["Tumor Volume (mm3)"]].agg(
    ["mean", "median", "var", "std", "sem"]
)
summary_table

In [None]:
# Bar chart (pandas plotting): number of recorded timepoints per drug regimen
bar_data = cleaned_df.groupby("Drug Regimen")["Timepoint"].count()

# One x position per regimen; sized from the data, not from the length of the
# column-name string
x_axis = np.arange(len(bar_data))

bar_data.plot.bar(figsize=(12, 10), fontsize=14)

plt.xlabel("Drug Regimen", fontsize=14)
plt.ylabel("Number of Timepoints", fontsize=14)
plt.title("Total Timepoints for each Regimen", fontsize=16)
plt.tight_layout()
plt.savefig("Images/bar1.jpeg")
plt.show()

In [None]:
#Set data to a list
# Guarded so that running this cell a second time (when bar_data is already a
# list) does not fail on the missing .tolist() attribute
if not isinstance(bar_data, list):
    bar_data = bar_data.tolist()

In [None]:
# Same bar chart drawn with pyplot
x_axis = np.arange(len(bar_data))

# Regimen names taken from the data, in the sorted order that
# groupby("Drug Regimen") emits, which is how the counts in bar_data were built
regimen_labels = cleaned_df.groupby("Drug Regimen").size().index.tolist()

fig1, ax1 = plt.subplots(figsize=(12, 10))

plt.bar(x_axis, bar_data, color='c', align='center')

tick_locations = list(x_axis)
plt.xticks(tick_locations, regimen_labels)

# Bars are centred on 0..n-1, so keep the same margin on both sides
plt.xlim(-0.75, len(x_axis) - 0.25)

plt.ylim(0, max(bar_data) + 10)

plt.xlabel("Drug Regimen", fontsize=14)
plt.ylabel("Number of Timepoints", fontsize=14)
plt.title("Total Timepoints for each Regimen", fontsize=16)
plt.tight_layout()
plt.savefig("Images/bar2.jpeg")
plt.show()

In [None]:
# Data for the pie charts: how many distinct mice of each sex

# One group per (Mouse ID, Sex) pair
gender_data = cleaned_df.groupby(["Mouse ID", "Sex"])

# Row count of each pair, as a one-column frame indexed by (Mouse ID, Sex)
gender_df = pd.DataFrame(gender_data.size())

# Number of pairs, i.e. mice, per sex
rat_gender = gender_df.groupby(level="Sex").count()
rat_gender.columns = ["Total Rats"]

# Share of each sex as a percentage of all mice
all_rats = rat_gender["Total Rats"].sum()
rat_gender["Percentage of Rats"] = 100 * (rat_gender["Total Rats"] / all_rats)

In [None]:
# Pie chart of the sex split, drawn through the pandas plotting API
colors = ["coral", "dodgerblue"]
explode = [0.1, 0]
labels = ["Female", "Male"]
size = rat_gender.loc[:, "Total Rats"].tolist()

pie_axes = rat_gender.plot(
    kind="pie",
    y="Total Rats",
    figsize=(8, 8),
    explode=explode,
    autopct="%1.1f%%",
    colors=colors,
)
plt.tight_layout()
plt.savefig("Images/pie1.jpeg")
plt.show()

In [None]:
# Pie chart of the sex split, drawn with pyplot
# Labels come from the table's index so they always line up with the sizes
labels = rat_gender.index.tolist()
size = rat_gender["Total Rats"].tolist()
colors = ["salmon", "dodgerblue"]
explode = [0.1, 0]

plt.pie(size, explode=explode, labels=labels, colors=colors,
        autopct="%1.1f%%", startangle=140)
# tight_layout must be called; the bare attribute reference did nothing
plt.tight_layout()
plt.savefig("Images/pie2.jpeg")
plt.show()

In [None]:
# Final tumor volume per mouse for the regimens of interest

# Drop the six regimens that are not analysed further
excluded_regimens = ['Ketapril', 'Naftisol', 'Stelasyn',
                     'Zoniferol', 'Propriva', 'Placebo']
keep_mask = ~cleaned_df["Drug Regimen"].isin(excluded_regimens)
stats_df = cleaned_df[keep_mask]

# Last recorded timepoint of every mouse (indexed by Mouse ID and Drug Regimen)
last_time = stats_df.groupby(["Mouse ID", "Drug Regimen"])[["Timepoint"]].max()
last_df = pd.DataFrame(last_time)

# Right-join back onto the measurements so only the row at each mouse's last
# timepoint is kept
plot_df = stats_df.merge(last_df, on=["Mouse ID", "Timepoint"], how="right")

In [None]:
# Treatments in order of first appearance in stats_df
treatments = stats_df["Drug Regimen"].unique().tolist()

# Final tumor volumes per treatment, selected with boolean masks instead of a
# row-by-row iterrows() loop.
# NOTE(review): the list names assume treatments comes out as Capomulin,
# Infubinol, Ramicane, Ceftamin; the values themselves are always paired with
# treatments[i], whatever that order turns out to be.
regimen_of_row = plot_df["Drug Regimen"]
final_volume = plot_df["Tumor Volume (mm3)"]

capomulin_data = final_volume[regimen_of_row == treatments[0]].tolist()
infubinol_data = final_volume[regimen_of_row == treatments[1]].tolist()
ramicane_data = final_volume[regimen_of_row == treatments[2]].tolist()

# Catch-all for every row not matched above
ceftamin_data = final_volume[~regimen_of_row.isin(treatments[:3])].tolist()

In [None]:
# Create a DataFrame with one column of final tumor volumes per treatment
treatment_data = {
    treatments[0]: capomulin_data,
    treatments[1]: infubinol_data,
    treatments[2]: ramicane_data,
    treatments[3]: ceftamin_data,
}

# Wrap each list in a Series so treatments with different numbers of mice are
# padded with NaN instead of making the DataFrame constructor raise
treatment_df = pd.DataFrame({
    name: pd.Series(volumes, dtype="float64")
    for name, volumes in treatment_data.items()
})

In [None]:
def _report_quartiles(name, volumes):
    """Print the quartile summary of one treatment and return the statistics.

    name    -- treatment label used in the printed lines
    volumes -- pandas Series of tumor volumes for that treatment

    Returns (quartiles, lowerq, upperq, iqr, lower_bound, upper_bound), where
    the bounds are the quartiles pushed out by 1.5 * IQR.
    """
    quartiles = volumes.quantile([.25, .5, .75])
    lowerq = quartiles[0.25]
    upperq = quartiles[0.75]
    iqr = upperq - lowerq

    print(f"The lower quartile of {name} is: {lowerq}")
    print(f"The upper quartile of {name} is: {upperq}")
    print(f"The interquartile range of {name} is: {iqr}")
    print(f"The median of {name} is: {quartiles[0.5]} ")

    lower_bound = lowerq - (1.5*iqr)
    upper_bound = upperq + (1.5*iqr)
    print(f"Values below {lower_bound} could be outliers.")
    print(f"Values above {upper_bound} could be outliers.")

    return quartiles, lowerq, upperq, iqr, lower_bound, upper_bound


def _print_separator():
    """Print the divider shown between two treatment summaries."""
    print(" ")
    print("=============================================================")
    print(" ")


# Stats for treatments[0]
(quartiles1, lowerq1, upperq1, iqr1,
 lower_bound1, upper_bound1) = _report_quartiles(
    treatments[0], treatment_df[treatments[0]])

_print_separator()

# Stats for treatments[1]
(quartiles2, lowerq2, upperq2, iqr2,
 lower_bound2, upper_bound2) = _report_quartiles(
    treatments[1], treatment_df[treatments[1]])

_print_separator()

# Stats for treatments[2]
(quartiles3, lowerq3, upperq3, iqr3,
 lower_bound3, upper_bound3) = _report_quartiles(
    treatments[2], treatment_df[treatments[2]])

_print_separator()

# Stats for treatments[3]
(quartiles4, lowerq4, upperq4, iqr4,
 lower_bound4, upper_bound4) = _report_quartiles(
    treatments[3], treatment_df[treatments[3]])

In [None]:
# Box plot of the final tumor volumes, one box per treatment column
treatment_df.boxplot()

plt.title("Boxplot of Regimens")
plt.xlabel("Drug Regimen")
plt.ylabel("Tumor Volume in mm3")
# tight_layout must be called; the bare attribute reference did nothing
plt.tight_layout()
plt.show()

In [None]:
# Line plot of tumor volume over time for a single mouse on Capomulin
# Select the regimen by name: treatments[0] only equals "Capomulin" when that
# regimen happens to appear first in the data.
capomulin_df = stats_df[stats_df["Drug Regimen"] == "Capomulin"]
b128_data = capomulin_df[capomulin_df["Mouse ID"] == "b128"]

x_values = b128_data["Timepoint"]
y_values = b128_data["Tumor Volume (mm3)"]
plt.plot(x_values, y_values)
plt.title("Tumor Volume over Days")
plt.xlabel("Timepoint in Days")
plt.ylabel("Tumor Volume in mm3")
# tight_layout must be called; the bare attribute reference did nothing
plt.tight_layout()
plt.show()

In [None]:
# Scatter plot of each mouse's weight against its average tumor volume, with a
# fitted regression line

# Per-mouse averages. numeric_only=True skips the text columns, which would
# otherwise make mean() raise on pandas >= 2.0.
capomulin_summary = capomulin_df.groupby("Mouse ID").mean(numeric_only=True)

#Scatter Plot
# Weight goes on x and tumor volume on y so the data matches the axis labels
x_values = capomulin_summary["Weight (g)"]
y_values = capomulin_summary["Tumor Volume (mm3)"]
plt.scatter(x_values, y_values)
plt.title("Tumor Volume compared to Average Weight")
plt.xlabel("Weight in Grams (g)")
plt.ylabel("Tumor Volume in mm3")

#Linear regression of tumor volume on weight
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept
line_eq = "y = " + str(round(slope, 2)) + "x + " + str(round(intercept, 2))
plt.plot(x_values, regress_values, "r-")
# tight_layout must be called; the bare attribute reference did nothing
plt.tight_layout()
plt.show()

In [None]:
# Pearson correlation coefficient of the two series plotted above
correlation = st.pearsonr(x_values, y_values)[0]
print(f"The correlation coefficient between Weight (g) and Tumor Volume (mm3) is {round(correlation,2)}")