# Pymaceuticals Inc.
---

### Analysis

- Add your analysis here.
 

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Locations of the two study CSV files.
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load both tables. NOTE: `study_result` (singular) holds the mouse metadata,
# while `study_results` (plural) holds the study results.
study_results = pd.read_csv(study_results_path)
study_result = pd.read_csv(mouse_metadata_path)

# Left-join the metadata onto the study results by "Mouse ID", so every row
# of the study results is kept.
pymerged_df = study_results.merge(study_result, how="left", on="Mouse ID")

# Preview the first rows of the combined table.
pymerged_df.head()

In [None]:
# Count the distinct "Mouse ID" values in the merged table.
# dropna=False mirrors len(unique()), which would also count a missing ID once.
mousecount = pymerged_df["Mouse ID"].nunique(dropna=False)
mousecount

In [None]:
# Flag every row whose (Mouse ID, Timepoint) pair repeats an earlier row,
# keep only those rows, and show which mouse IDs they belong to.
is_repeat = pymerged_df.duplicated(subset=["Mouse ID", "Timepoint"])
dupe_df = pymerged_df[is_repeat]
dupe_df["Mouse ID"]

In [None]:
# Optional: Get all the data for the duplicate mouse ID.
# The ID is derived from the data instead of being hard-coded: a mouse counts
# as a duplicate when any of its (Mouse ID, Timepoint) pairs occurs more than once.
dupe_pair_mask = pymerged_df.duplicated(subset=["Mouse ID", "Timepoint"], keep=False)
dupe_ids = pymerged_df.loc[dupe_pair_mask, "Mouse ID"].unique()

# Every row (not only the repeated ones) recorded for the duplicated mouse ID(s).
dupemi_df = pymerged_df.loc[pymerged_df["Mouse ID"].isin(dupe_ids)]
dupemi_df

In [None]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
# The mice to drop are detected from the data rather than hard-coded: any mouse
# with a repeated (Mouse ID, Timepoint) pair is removed entirely, because its
# measurements cannot be trusted.
repeated_pair_mask = pymerged_df.duplicated(subset=["Mouse ID", "Timepoint"], keep=False)
duplicate_mouse_ids = pymerged_df.loc[repeated_pair_mask, "Mouse ID"].unique()

cleanmice_df = pymerged_df.loc[~pymerged_df["Mouse ID"].isin(duplicate_mouse_ids)]
cleanmice_df.head()

In [None]:
# Count the distinct "Mouse ID" values that remain in the cleaned table.
# dropna=False mirrors len(unique()), which would also count a missing ID once.
cleanmousecount = cleanmice_df["Mouse ID"].nunique(dropna=False)
cleanmousecount

## Summary Statistics

In [None]:
# Summary statistics of tumor volume per drug regimen (mean, median, variance,
# standard deviation, SEM), assembled step by step over the following cells.
# Step 1: describe() gives count, mean, std, min, the quartiles and max per
# regimen; the columns that are not needed are dropped in a later cell.
tumor_by_regimen = cleanmice_df.groupby("Drug Regimen")["Tumor Volume (mm3)"]
regimen_df = pd.DataFrame(tumor_by_regimen.describe())


In [None]:
# Median tumor volume per regimen, as a one-column frame that keeps the
# original column name "Tumor Volume (mm3)".
regimen2_df = (
    cleanmice_df.groupby("Drug Regimen")["Tumor Volume (mm3)"]
    .median()
    .to_frame()
)


In [None]:
# Attach the per-regimen median to the describe() table, matching rows on
# the shared "Drug Regimen" key.
drugmerged_df = regimen_df.merge(regimen2_df, how="left", on="Drug Regimen")

In [None]:
# Discard the describe() columns that are not part of the summary table.
unused_describe_columns = ["count", "min", "25%", "50%", "75%", "max"]
drugmerged2_df = drugmerged_df.drop(columns=unused_describe_columns)

In [None]:
# Give the remaining columns their presentation names.
summary_column_names = {
    "mean": "Mean Tumor Volume",
    "std": "Tumor Volume Std.Dev",
    "Tumor Volume (mm3)": "Median Tumor Volume",
}
drugmerged3_df = drugmerged2_df.rename(columns=summary_column_names)
drugmerged3_df

In [None]:
# Put the columns in the order mean, median, standard deviation.
ordered_columns = ["Mean Tumor Volume", "Median Tumor Volume", "Tumor Volume Std.Dev"]
drugmerged4_df = drugmerged3_df.loc[:, ordered_columns]

In [None]:
# Variance of tumor volume per regimen, as a one-column frame that is
# already labelled for the summary table.
variance_df = (
    cleanmice_df.groupby("Drug Regimen")["Tumor Volume (mm3)"]
    .var()
    .to_frame(name="Tumor Volume Variance")
)

In [None]:
# Standard error of the mean (SEM) of tumor volume per regimen.
# The column is labelled "Tumor Volume Std.Err", so it must be computed with
# sem(); using std() here would just repeat the standard deviation column.
# NOTE: the variable keeps the name `std_df` because a later cell merges it
# by that name.
std_df = (
    cleanmice_df.groupby("Drug Regimen")["Tumor Volume (mm3)"]
    .sem()
    .to_frame(name="Tumor Volume Std.Err")
)

In [None]:
# Add the variance column to the summary table, matching on "Drug Regimen".
drugmerged5_df = drugmerged4_df.merge(variance_df, how="left", on="Drug Regimen")

In [None]:
# Add the "Tumor Volume Std.Err" column to the summary table, matching on
# "Drug Regimen".
drugmerged6_df = drugmerged5_df.merge(std_df, how="left", on="Drug Regimen")

In [None]:
# Final summary table, with the columns in presentation order.
final_column_order = [
    "Mean Tumor Volume",
    "Median Tumor Volume",
    "Tumor Volume Variance",
    "Tumor Volume Std.Dev",
    "Tumor Volume Std.Err",
]
drfinal_df = drugmerged6_df.loc[:, final_column_order]
drfinal_df

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, 
# and SEM of the tumor volume for each regimen

# Using the aggregation method, produce the same summary statistics in a single line.


## Bar and Pie Charts

In [None]:
# Bar plot (pandas version), step 1: count the recorded timepoints per drug
# regimen and keep the result as a one-column frame named "Timepoint".
cleanmice2_df = (
    cleanmice_df.groupby("Drug Regimen")["Timepoint"]
    .count()
    .to_frame()
)


In [None]:
# Bar plot (pandas version), step 2: draw the per-regimen counts without a
# legend and label the y-axis on the axes pandas returns.
regimen_axes = cleanmice2_df.plot.bar(legend=False)
regimen_axes.set_ylabel("Number of Mice Tested")
plt.show()

In [None]:
# Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using pyplot.
# The regimen names and counts are computed from the cleaned data instead of
# being typed in by hand, so the chart stays correct if the data or the
# cleaning step changes. groupby() sorts its keys, so the bars appear in
# alphabetical regimen order.
regimen_counts = cleanmice_df.groupby("Drug Regimen")["Timepoint"].count()
xdrugs = regimen_counts.index.tolist()
micetot = regimen_counts.tolist()

plt.bar(xdrugs, micetot, color="b", align="center")
plt.xticks(rotation="vertical")
plt.ylabel("Number of Mice Tested")
plt.show()

In [None]:
# Pie plot of the female versus male distribution, drawn with pandas.
sex_tally = cleanmice_df.groupby("Sex")["Sex"].count()

# The group key and the counted column are both called "Sex", so the index is
# given the separate label "Gender" when it is turned back into a column.
gender_df = sex_tally.to_frame().reset_index(names="Gender")

# Largest slice first, with the gender labels as the index.
sortedgen_df = gender_df.sort_values(["Sex"], ascending=False).set_index("Gender")

gender_pie = sortedgen_df.plot(kind="pie", x="Gender", y="Sex", legend=0, autopct="%1.1f%%")
plt.show()

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
# The totals are counted from the cleaned data rather than hard-coded, so they
# cannot drift out of sync with the dataset. reindex() fixes the slice order
# to match `gender`; fill_value=0 covers a label that is absent from the data.
gender = ["Male", "Female"]
sextot = cleanmice_df["Sex"].value_counts().reindex(gender, fill_value=0).tolist()

plt.pie(sextot, labels=gender, autopct="%1.1f%%")
plt.show()

## Quartiles, Outliers and Boxplots

In [None]:
# Final tumor volume of each mouse (used for the Capomulin, Ramicane,
# Infubinol and Ceftamin comparison in the following cells).

# Greatest timepoint recorded for each mouse.
last_timepoints = cleanmice_df.groupby("Mouse ID")["Timepoint"].max()
time_df = last_timepoints.to_frame()

# Join back to the cleaned data on (Mouse ID, Timepoint) so that only each
# mouse's last measurement row is picked up, then index the result by mouse.
mergtime_df = time_df.merge(
    cleanmice_df, how="left", on=["Mouse ID", "Timepoint"]
).set_index("Mouse ID")
mergtime_df

In [None]:
# All regimens present in the final-timepoint table.
drugs = mergtime_df["Drug Regimen"].unique()

# The four treatments to compare (also usable as plot labels).
treatments = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

# Final tumor volumes per treatment, filled by the loop below.
tumor_volumes = {}

# Calculate the IQR and quantitatively determine if there are any potential outliers.
for treatment in treatments:
    # Locate the rows for this treatment and take their final tumor volumes.
    volumes = mergtime_df.loc[
        mergtime_df["Drug Regimen"] == treatment, "Tumor Volume (mm3)"
    ]
    tumor_volumes[treatment] = volumes.tolist()

    # Quartiles and inter-quartile range of this treatment's volumes.
    lower_q, upper_q = volumes.quantile([0.25, 0.75])
    iqr = upper_q - lower_q

    # Determine outliers using upper and lower bounds (1.5 * IQR rule).
    lower_bound = lower_q - 1.5 * iqr
    upper_bound = upper_q + 1.5 * iqr
    outliers = volumes[(volumes < lower_bound) | (volumes > upper_bound)]

    print(
        f"{treatment}: IQR = {iqr:.2f}, "
        f"bounds = ({lower_bound:.2f}, {upper_bound:.2f}), "
        f"potential outliers = {outliers.tolist()}"
    )

# Per-treatment lists of final tumor volumes (for plotting).
captumorvol = tumor_volumes["Capomulin"]
ramtumorvol = tumor_volumes["Ramicane"]
inftumorvol = tumor_volumes["Infubinol"]
ceftumorvol = tumor_volumes["Ceftamin"]

In [None]:
# Generate a box plot that shows the distribution of the final tumor volume
# for each treatment group.
# The volumes are selected directly from mergtime_df (one row per mouse at its
# last timepoint), so this cell does not depend on any list built elsewhere.
box_treatments = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]
box_volumes = [
    mergtime_df.loc[
        mergtime_df["Drug Regimen"] == treatment_name, "Tumor Volume (mm3)"
    ].to_numpy()
    for treatment_name in box_treatments
]

fig1, ax1 = plt.subplots()
# One box per treatment; outlier markers are drawn in red so they stand out.
ax1.boxplot(box_volumes, flierprops={"markerfacecolor": "red", "markersize": 10})
ax1.set_xticklabels(box_treatments)
ax1.set_ylabel("Final Tumor Volume (mm3)")
plt.show()

## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin
# Start from the cleaned data (duplicate mouse removed) so this section is
# consistent with the rest of the analysis.
Capoprep_df = cleanmice_df.loc[cleanmice_df["Drug Regimen"] == "Capomulin"]
Capoprep_df = Capoprep_df.sort_values(["Mouse ID"], ascending=False)

# Build the mask from Capoprep_df itself; a mask taken from a different frame
# only works through implicit index alignment.
b128_df = Capoprep_df.loc[Capoprep_df["Mouse ID"] == "b128"]

# Index by mouse and order the measurements chronologically for plotting.
Capomulin_df = b128_df.set_index("Mouse ID").sort_values(["Timepoint"])

In [None]:
# Line plot of tumor volume against timepoint, using the rows held in
# Capomulin_df.
timepoints = Capomulin_df["Timepoint"]
tumor_volume = Capomulin_df["Tumor Volume (mm3)"]

cmplot = plt.plot(timepoints, tumor_volume)
plt.title("Capomulin treatment of mouse b128")
plt.xlabel("Timepoint (days)")
plt.ylabel("Tumor Volume (mm3)")
plt.show()

In [None]:
# Scatter plot of average tumor volume vs. mouse weight (Capomulin), step 1:
# mean tumor volume of each mouse in Capoprep_df, as a one-column frame.
# NOTE: this reassigns `cleanmice2_df`, which an earlier cell used for the
# per-regimen timepoint counts.
cleanmice2_df = (
    Capoprep_df.groupby("Mouse ID")["Tumor Volume (mm3)"]
    .mean()
    .to_frame()
)

In [None]:
# Step 2: mean of the "Weight (g)" column for each mouse in Capoprep_df,
# as a one-column frame.
cleanmice3_df = (
    Capoprep_df.groupby("Mouse ID")["Weight (g)"]
    .mean()
    .to_frame()
)

In [None]:
# Step 3: one row per mouse holding both its mean tumor volume and its
# mean weight, matched on "Mouse ID".
cleanmicemerge_df = cleanmice2_df.merge(cleanmice3_df, how="left", on="Mouse ID")

In [None]:
# Step 4: scatter the per-mouse mean tumor volume against mouse weight.
mouse_weight = cleanmicemerge_df["Weight (g)"]
mean_tumor_volume = cleanmicemerge_df["Tumor Volume (mm3)"]

plt.scatter(mouse_weight, mean_tumor_volume)
plt.xlabel("Weight (g)")
plt.ylabel("Tumor Volume (mm3)")
plt.show()

## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model
# for mouse weight and average tumor volume for the Capomulin regimen
weight = cleanmicemerge_df["Weight (g)"]
tumvol = cleanmicemerge_df["Tumor Volume (mm3)"]

# Pearson correlation between weight and average tumor volume.
correlation = st.pearsonr(weight, tumvol)

# Least-squares fit of tumor volume on weight. linregress lives in
# scipy.stats, which this notebook imports as `st`.
regression = st.linregress(weight, tumvol)
regress_values = weight * regression.slope + regression.intercept
line_eq = f"y = {regression.slope:.2f}x + {regression.intercept:.2f}"

plt.scatter(weight, tumvol)
plt.plot(weight, regress_values, "r-")
# Place the equation in axes-fraction coordinates so it stays inside the
# plot whatever the data range is.
plt.annotate(line_eq, xy=(0.05, 0.9), xycoords="axes fraction", fontsize=15, color="red")
plt.xlabel("Weight (g)")
plt.ylabel("Tumor Volume (mm3)")
print(f"The correlation between mouse weight and the average tumor volume is {round(correlation[0],2)}")
plt.show()