## Observations and Insights 

In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

In [2]:
mouse_metadata.head(1)

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g)
0,k403,Ramicane,Male,21,16


In [3]:
study_results.head(1)

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites
0,b128,0,45.0,0


In [4]:
# Combine the data into a single dataset
merged_df = pd.merge(mouse_metadata, study_results, on="Mouse ID")

# Display the data table for preview
merged_df.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [5]:
# Checking the number of mice.
subjectCt = (merged_df["Mouse ID"]).nunique()
print(subjectCt)

249


In [6]:
## C/O Sanoo Singh

In [7]:
duplicate_mice = merged_df.loc[merged_df.duplicated(subset=['Mouse ID', 'Timepoint',]),'Mouse ID']

In [8]:
duplicates = merged_df.loc[merged_df.duplicated(subset=['Mouse ID', 'Timepoint'], keep=False),'Mouse ID']

In [9]:
print(duplicates)

908    g989
909    g989
910    g989
911    g989
912    g989
913    g989
914    g989
915    g989
916    g989
917    g989
Name: Mouse ID, dtype: object


## <span style="color: black;">   Fig 1.1  All Data for Duplicate Mouse ID

In [10]:
#List all data for duplicate mice by ID
merged_df.loc[merged_df['Mouse ID'] == 'g989', :]

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
908,g989,Propriva,Female,21,26,0,45.0,0
909,g989,Propriva,Female,21,26,0,45.0,0
910,g989,Propriva,Female,21,26,5,48.786801,0
911,g989,Propriva,Female,21,26,5,47.570392,0
912,g989,Propriva,Female,21,26,10,51.745156,0
913,g989,Propriva,Female,21,26,10,49.880528,0
914,g989,Propriva,Female,21,26,15,51.325852,1
915,g989,Propriva,Female,21,26,15,53.44202,0
916,g989,Propriva,Female,21,26,20,55.326122,1
917,g989,Propriva,Female,21,26,20,54.65765,1


In [11]:
## Assign Variables to each relevant Series in the DataFrame
tPt = merged_df['Timepoint']
tVol = merged_df['Tumor Volume (mm3)']
sex = merged_df['Sex']
dReg = merged_df['Drug Regimen']
wt = merged_df['Weight (g)']

## Assign Treatments to a List
scrip_list = merged_df['Drug Regimen'].unique()
scrip_list

array(['Ramicane', 'Capomulin', 'Infubinol', 'Placebo', 'Ceftamin',
       'Stelasyn', 'Zoniferol', 'Ketapril', 'Propriva', 'Naftisol'],
      dtype=object)

In [12]:
# Drop all the rows for the mouse with duplicates
# merged_df.drop(index=[908 to 920], inplace = True)
merged_df.drop(index=[908,909,910,911,912,913,914,915,916,917,918,919,920], inplace=True)

In [13]:
# Check the number of mice in the cleaned up DataFrame.
subjectCt = (merged_df["Mouse ID"]).nunique()
print(subjectCt)

248


In [14]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Use groupby and summary statistical methods to calculate the following properties of each drug regimen:
drug_reg = merged_df.groupby(['Drug Regimen'])
avgVol = drug_reg[['Tumor Volume (mm3)']].mean()
medVol = drug_reg[['Tumor Volume (mm3)']].median()
varVol = drug_reg[['Tumor Volume (mm3)']].var()
stdVol = drug_reg[['Tumor Volume (mm3)']].std()
semVol = drug_reg[['Tumor Volume (mm3)']].sem()

## <span style="color: black;">Average Tumor Volume Summary

In [None]:
avgVol.head(10)

## <span style="color: black;">Median Tumor Volume Summary

In [None]:
medVol.head(10)

## <span style="color: black;">Variance Tumor Volume Summary

In [None]:
varVol.head(10)

## <span style="color: black;">Standard Deviation Tumor Volume Summary

In [None]:
stdVol.head(11)

## <span style="color: black;">Standard Error of the Mean Tumor Volume Summary

In [None]:
semVol.head(11)

In [None]:
# Using the aggregation method, produce the same summary statistics in a single line
## ??   summary_table = drug_reg.aggregate(func='mean', 'median', 'var', 'std', 'sem'), ['Tumor Volume (mm3)']

# <span style="color: blue;">CURRENTLY</span> <span style="color: green;">WORKING</span> <span style="color: red;">HERE</span>

## Bar and Pie Charts

In [25]:
drug_reg.plot(kind='bar', x=(drug_reg['Drug Regimen']), y=(drug_reg['Timepoint']).count(), figsize=(8, 11), title="Number of Timepoints")

  result = np.asarray(values, dtype=dtype)
  result = np.asarray(values, dtype=dtype)


TypeError: unhashable type: 'Series'

In [None]:
(merged_df['Timepoint']).value_counts().plot(kind='bar')


In [None]:
# Set Bar Data Parameters
#total_tPts = (tPt).count
x_axis = np.arange(len(drug_reg['Timepoint']))
tick_loc = [value for value in x_axis]

In [None]:
# Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using Pandas.
## BASIC -- plt.bar(x_axis, users, color='r', alpha=0.5, align="center")
# FAIL plt.bar(scrip_list, tPt, color="indigo", alpha=(0.6), align="center")
plt.figure(figsize = (11, 8))
plt.bar(x_axis, drug_reg['Timepoint'], color='indigo', alpha=0.6, align='center')
plt.xticks(tick_loc, drug_reg['Regimen'], rotation='vertical')

In [None]:
# Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using pyplot.
total_tPts = (tPt).count
x_axis = np.arange(len(total_tPts))

plt.bar(x_axis, total_tPts, color="indigo", alpha=(0.6), align="center")

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using Pandas
merged_df["Sex"].value_counts().plot(kind='pie')
## BASIC plt.pie(sizes, explode=explode, labels=labels, colors=colors, autopct="%1.1f%%", shadow=True, startangle=140)

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot

## BASIC plt.pie(sizes, explode=explode, labels=labels, colors=colors, autopct="%1.1f%%", shadow=True, startangle=140)

## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [None]:
# Put treatments into a list for for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin
# Plot the charts and apply some styling
## BASIC danger_drop, = plt.plot(time, danger_drop_speeds, color="indigo", alpha=(0.6), label="Danger Drop")
## BASIC railgun, = plt.plot(time, railgun_speeds, color="blue", label="RailGun")

In [None]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen
## BASIC plt.scatter(x_axis, data, marker="o", facecolors="red", edgecolors="black", s=x_axis, alpha=0.75)

## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
