## Observations and Insights 

In [None]:
## SAVE TEST

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Locations of the two study CSV files (relative to the notebook)
mouse_metadata_path, study_results_path = (
    "data/Mouse_metadata.csv",
    "data/Study_results.csv",
)

# Load both files, in the same order as the paths above, into DataFrames
mouse_metadata, study_results = map(
    pd.read_csv, (mouse_metadata_path, study_results_path)
)

In [None]:
# Preview the first rows of the mouse metadata table loaded from CSV.
mouse_metadata.head()

In [None]:
# Preview the first rows of the study results table loaded from CSV.
study_results.head()

In [None]:
# Join the per-mouse metadata with the study measurements on the shared
# "Mouse ID" column to get one combined dataset
merged_df = mouse_metadata.merge(study_results, on="Mouse ID")

# Show the first rows of the combined table as a preview
merged_df.head()

In [None]:
# Count the distinct Mouse ID values in the merged data and print the total.
mouse_ids = merged_df["Mouse ID"]
subjectCt = mouse_ids.nunique()
print(subjectCt)

In [None]:
# Mark every row whose (Mouse ID, Timepoint) pair occurs more than once
# (keep=False flags all copies, not just the later ones), then keep the
# Mouse ID of each flagged row.
is_repeated = merged_df.duplicated(subset=['Mouse ID', 'Timepoint'], keep=False)
duplicates = merged_df.loc[is_repeated, 'Mouse ID']

In [None]:
# Print the Mouse ID of every row flagged as a duplicate (one entry per flagged row).
print(duplicates)

## <span style="color: black;">Fig 1.1 All Data for Duplicate Mouse ID</span>

In [None]:
# List all rows (every column) recorded for Mouse ID 'g989'.
# NOTE(review): the ID is hard-coded; presumably it is the ID reported by the
# duplicate check above -- confirm if the input data changes.
merged_df.loc[merged_df['Mouse ID'] == 'g989', :]

In [None]:
## Pull each relevant column of the merged DataFrame into its own Series
tPt, tVol, sex, dReg, wt = (
    merged_df[column]
    for column in ('Timepoint', 'Tumor Volume (mm3)', 'Sex', 'Drug Regimen', 'Weight (g)')
)

## Collect the distinct treatments (drug regimens) found in the data
scrip_list = dReg.unique()
scrip_list

In [None]:
# Drop all the rows for the mouse with duplicates.
# The affected Mouse ID(s) are derived from the data rather than hard-coding
# row labels 908-920: the cell keeps working if the row order of the input
# changes, and re-running it is a no-op instead of a KeyError from drop().
dup_mouse_ids = merged_df.loc[
    merged_df.duplicated(subset=['Mouse ID', 'Timepoint'], keep=False), 'Mouse ID'
].unique()
dup_rows = merged_df.index[merged_df['Mouse ID'].isin(dup_mouse_ids)]
merged_df.drop(index=dup_rows, inplace=True)

## <span style="color: black;">Corrected Subject Count</span>

In [None]:
# Recount the distinct Mouse ID values now that rows have been dropped, and print it.
remaining_ids = merged_df["Mouse ID"]
subjectCt = remaining_ids.nunique()
print(subjectCt)

In [None]:
# Summary statistics of tumor volume per drug regimen: mean, median, variance,
# standard deviation and standard error of the mean (SEM).

# Group once by regimen, narrow to the tumor-volume column, then apply each
# statistic to that same grouped selection.
drug_reg = merged_df.groupby(['Drug Regimen'])
tumor_by_regimen = drug_reg[['Tumor Volume (mm3)']]
avgVol = tumor_by_regimen.mean()
medVol = tumor_by_regimen.median()
varVol = tumor_by_regimen.var()
stdVol = tumor_by_regimen.std()
semVol = tumor_by_regimen.sem()

## <span style="color: black;">Average Tumor Volume Summary</span>

In [None]:
# Show up to the first 10 rows of the per-regimen mean tumor volume.
avgVol.head(10)

## <span style="color: black;">Median Tumor Volume Summary</span>

In [None]:
# Show up to the first 10 rows of the per-regimen median tumor volume.
medVol.head(10)

## <span style="color: black;">Variance Tumor Volume Summary</span>

In [None]:
# Show up to the first 10 rows of the per-regimen tumor volume variance.
varVol.head(10)

## <span style="color: black;">Standard Deviation Tumor Volume Summary</span>

In [None]:
# Show up to the first 11 rows of the per-regimen tumor volume standard deviation.
stdVol.head(11)

## <span style="color: black;">Standard Error of the Mean Tumor Volume Summary</span>

In [None]:
# Show up to the first 11 rows of the per-regimen tumor volume standard error of the mean.
semVol.head(11)

In [None]:
# Using the aggregation method, produce the same summary statistics in a single line

## <span style="color: black;">Summary Table of the Above Data Using the Aggregate Method</span>

In [None]:
# Build all five tumor-volume statistics per regimen in one aggregation call.
summary_stats = ["mean", "median", "var", "std", "sem"]
drug_reg.agg({'Tumor Volume (mm3)': summary_stats})

## Bar and Pie Charts

In [None]:
# Narrow the regimen grouping to the Timepoint column and count its entries
# per regimen; the result feeds the bar charts.
timepoint_only = drug_reg[['Timepoint']]
drug_tPt = timepoint_only.count()

In [None]:
# Bar chart of the per-regimen timepoint counts, drawn via the pandas plotting accessor.
tPts_bar = drug_tPt.plot.bar()

In [None]:
# Display the per-regimen Timepoint counts table.
drug_tPt

In [None]:
# Display the array of distinct drug regimens built earlier from merged_df.
scrip_list

In [None]:
# Take the Timepoint counts out of the one-column table as a Series and display it.
points = drug_tPt.loc[:, 'Timepoint']
points

In [None]:
# Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using pyplot.
# The x labels are taken from the index of `points` (the grouped counts), so
# every bar is labelled with the regimen it was counted for. The former
# `scrip_list` came from unique() on the raw data and is ordered differently
# from the groupby result, which put labels on the wrong bars.
plt.bar(points.index, points.values, facecolor='blue')

plt.title("Timepoints")
plt.xlabel("Drug Regimen")
plt.xticks(rotation="vertical")
plt.show()

In [None]:
# Pie chart of the female/male split, drawn with the pandas plotting accessor
sex_counts = merged_df["Sex"].value_counts()
sex_counts.plot.pie()

In [None]:
# Set Variables for Pie Chart
# Labels are read from the value_counts() index so each wedge is always named
# after the category it was counted for; a hard-coded ['Male', 'Female'] list
# is only right while 'Male' happens to be the more frequent value.
sizes = merged_df['Sex'].value_counts()
labels = sizes.index.tolist()
colors = ['blue', 'yellow']
explode = [0, 0.1]  # pull the second wedge slightly out of the pie

In [None]:
# Display the per-sex row counts used as the pie wedge sizes.
sizes

In [None]:
# Draw the sex-distribution pie with pyplot using the variables prepared above:
# percentage text on each wedge, a drop shadow, and the first wedge starting at 0 degrees.
plt.pie(
    sizes,
    explode=explode,
    labels=labels,
    colors=colors,
    autopct="%1.1f%%",
    shadow=True,
    startangle=0,
)
# Equal axis scaling so the pie is drawn as a circle
plt.axis('equal')
plt.show()

## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:
# Step 1 - find each mouse's greatest timepoint in the cleaned DataFrame.
# The Timepoint column is selected *before* aggregating so max() is not also
# computed over every other column (including the string columns) and then discarded.
max_tPts = merged_df.groupby('Mouse ID')['Timepoint'].max()

In [None]:
# Merge max timepoint data with orig clean dataframe on Subject ID and Timepoint.
# how='left' keeps exactly the (Mouse ID, max Timepoint) pairs, i.e. one final
# measurement per mouse. The former how='outer' also brought back every other
# timepoint row of merged_df, so the "final" tumor volumes used downstream
# actually contained all measurements.
max_tPts = pd.merge(max_tPts, merged_df, on=['Mouse ID', 'Timepoint'], how='left')

In [None]:
# Show only the first row of the merged max-timepoint table as a quick check.
max_tPts.head(1)

In [None]:
# Regimens of priority: Capomulin, Ramicane, Infubinol, and Ceftamin
prScrips = ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']

# Empty containers filled by later cells: two lists and a summary DataFrame
maxVList, outlierLst = [], []
maxVol_df = pd.DataFrame()

In [None]:
#Create truncated df for Boxplot
# The names must match the 'Drug Regimen' values exactly: the previous
# 'Ramincane' misspelling matched nothing, so every Ramicane row was
# silently left out of box_df.
box_df = merged_df.loc[merged_df['Drug Regimen'].isin(['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin'])]
box_df

In [None]:
# Collect, for each priority regimen, the tumor volumes of its rows in max_tPts.
# NOTE: `index` is not used inside the loop body, but `index`, `scrip` and
# `lastVol` keep their last-iteration values afterwards and are read by later
# cells. Re-running this cell appends to maxVList again.
for index, scrip in enumerate(prScrips):
    
    # Select the max_tPts rows that belong to this regimen
    lastVol = max_tPts.loc[max_tPts['Drug Regimen'] == scrip, :]
    
    # Store this regimen's tumor volume Series; list order follows prScrips
    maxVList.append(lastVol['Tumor Volume (mm3)'])

In [None]:
## Plot max volumes, for each of the 4 regimens of interest in box plots

In [None]:
# Box plot of the Capomulin tumor volumes collected in maxVList.
# plt.subplots() returns a (figure, axes) pair, so it has to be unpacked;
# chained assignment bound the whole tuple to both names.
fig1, ax1 = plt.subplots()
ax1.set_title("Capomulin")
ax1.set_ylabel("Max. Volumes")
# maxVList follows the order of prScrips, whose first entry is 'Capomulin'
ax1.boxplot(maxVList[0])
plt.show()

In [None]:
    # Parse each priority regimen's volumes into quartiles and record them in
    # the summary table, one row per regimen. Previously only the regimen left
    # over from the last iteration of the collection loop was filled in.
    # `scrip` and `quartile` still end on the last regimen, as later cells expect.
    for scrip, regimenVols in zip(prScrips, maxVList):
        quartile = regimenVols.quantile([.25, .5, .75])
        maxVol_df.loc[scrip, 'Qu.1'] = quartile[.25]
        # NOTE(review): this column holds the median (0.5 quantile) despite its name
        maxVol_df.loc[scrip, 'H_Spread'] = quartile[.5]
        maxVol_df.loc[scrip, 'Qu.3'] = quartile[.75]
        # Interquartile range: third quartile minus first quartile
        maxVol_df.loc[scrip, 'H_Width'] = quartile[.75] - quartile[.25]
    

In [None]:
    # Outlier fences sit 1.5 interquartile ranges below the first quartile and
    # above the third quartile of the current `quartile` values.
    q1, q3 = quartile[.25], quartile[.75]
    iqr = q3 - q1
    lFence = q1 - 1.5 * iqr
    uFence = q3 + 1.5 * iqr
    # Tumor volumes of the current `lastVol` selection, to be tested against the fences
    ifOut = lastVol['Tumor Volume (mm3)']

In [None]:
# Display the tumor volume Series that will be checked against the outlier fences.
ifOut

# <span style="color: blue;">CURRENTLY</span> <span style="color: green;">WORKING</span> <span style="color: red;">HERE</span>

In [None]:
    # Record the 1.5 * IQR outlier fences for the current regimen (`scrip`).
    iqr = quartile[.75] - quartile[.25]
    lowerFence = quartile[.25] - 1.5 * iqr
    upperFence = quartile[.75] + 1.5 * iqr  # was a copy of the lower-fence formula
    maxVol_df.loc[scrip, 'Lower Fence'] = lowerFence
    maxVol_df.loc[scrip, 'Upper Fence'] = upperFence

    # Per-column count of the rows whose tumor volume falls outside the fences.
    # Each comparison is parenthesised because `|` binds tighter than `<` / `>`.
    fenceVols = lastVol['Tumor Volume (mm3)']
    outlier = lastVol.loc[(fenceVols > upperFence) | (fenceVols < lowerFence), :].count()

In [None]:
    # Store how many of the current regimen's tumor volumes lie outside the
    # 1.5 * IQR fences (above the upper fence or below the lower fence).
    countVols = lastVol['Tumor Volume (mm3)']
    countIqr = quartile[.75] - quartile[.25]
    isOutlier = (countVols > quartile[.75] + 1.5 * countIqr) | (countVols < quartile[.25] - 1.5 * countIqr)
    # Summing a boolean mask counts its True entries
    maxVol_df.loc[scrip, 'Outlier Count'] = int(isOutlier.sum())

In [None]:
    # Fill new dataframe of outliers: the rows of the current `lastVol`
    # selection whose tumor volume is below lFence or above uFence.
    # NOTE(review): the original statement was left unfinished (`outliers =`);
    # confirm this is the intended content.
    outlierVols = lastVol['Tumor Volume (mm3)']
    outliers = lastVol.loc[(outlierVols < lFence) | (outlierVols > uFence), :]

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest
# One panel per regimen, sharing the y axis so the distributions are comparable.
fig, (ax0, ax1, ax2, ax3) = plt.subplots(ncols=4, sharey=True)

# Outlier ("flier") markers: purple filled circles on every panel
flier_style = {'markerfacecolor': 'purple', 'marker': 'o'}
panel_names = ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']

# maxVList holds one volume Series per regimen, in the same order as the names
for panel, panel_name, panel_vols in zip((ax0, ax1, ax2, ax3), panel_names, maxVList):
    panel.boxplot(panel_vols, flierprops=flier_style)
    # Each panel has a single box, hence a single tick to label
    panel.set_xticklabels([panel_name])

plt.show()

## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin
# Plot the charts and apply some styling
## BASIC danger_drop, = plt.plot(time, danger_drop_speeds, color="indigo", alpha=(0.6), label="Danger Drop")
## BASIC railgun, = plt.plot(time, railgun_speeds, color="blue", label="RailGun")

In [None]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen
# Keep only Capomulin rows, then average weight and tumor volume per mouse so
# each mouse contributes a single point.
capomulin_avg = (
    merged_df.loc[merged_df['Drug Regimen'] == 'Capomulin']
    .groupby('Mouse ID')[['Weight (g)', 'Tumor Volume (mm3)']]
    .mean()
)
capomulin_avg.plot(kind='scatter', x='Weight (g)', y='Tumor Volume (mm3)')
## BASIC plt.scatter(x_axis, data, marker="o", facecolors="red", edgecolors="black", s=x_axis, alpha=0.75)

## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
