## Observations and Insights 

In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

In [2]:
# Combine the data into a single dataset

#Need to look at the files so I know how to combine them
mouse_metadata.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g)
0,k403,Ramicane,Male,21,16
1,s185,Capomulin,Female,3,17
2,x401,Capomulin,Female,16,15
3,m601,Capomulin,Male,22,17
4,g791,Ramicane,Male,11,16


In [3]:
study_results.head()

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites
0,b128,0,45.0,0
1,f932,0,45.0,0
2,g107,0,45.0,0
3,a457,0,45.0,0
4,c819,0,45.0,0


In [4]:
# Display the data table for preview
mice_tumor_all_df= pd.merge(mouse_metadata,study_results, how= 'outer', on= 'Mouse ID' )
mice_tumor_all_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [5]:
# Checking the number of mice.
mice_count = len(mice_tumor_all_df["Mouse ID"].unique())
mice_count

249

In [6]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
# Optional: Get all the data for the duplicate mouse ID. 
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
# Checking the number of mice in the clean DataFrame.

In [7]:
#this is a CLEAN dataframe to use- has only 1 count per 'Mouse ID'
mice_tumor_df= mice_tumor_all_df.groupby('Mouse ID').head(1)
mice_tumor_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
10,s185,Capomulin,Female,3,17,0,45.0,0
20,x401,Capomulin,Female,16,15,0,45.0,0
30,m601,Capomulin,Male,22,17,0,45.0,0
40,g791,Ramicane,Male,11,16,0,45.0,0
...,...,...,...,...,...,...,...,...
1858,z314,Stelasyn,Female,21,28,0,45.0,0
1860,z435,Propriva,Female,12,26,0,45.0,0
1863,z581,Infubinol,Female,24,25,0,45.0,0
1873,z795,Naftisol,Female,13,29,0,45.0,0


In [8]:
#seeing how many have 2 timepoints
mice_tumor_2_df= mice_tumor_df.groupby('Mouse ID').head(2)
mice_tumor_2_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
10,s185,Capomulin,Female,3,17,0,45.0,0
20,x401,Capomulin,Female,16,15,0,45.0,0
30,m601,Capomulin,Male,22,17,0,45.0,0
40,g791,Ramicane,Male,11,16,0,45.0,0
...,...,...,...,...,...,...,...,...
1858,z314,Stelasyn,Female,21,28,0,45.0,0
1860,z435,Propriva,Female,12,26,0,45.0,0
1863,z581,Infubinol,Female,24,25,0,45.0,0
1873,z795,Naftisol,Female,13,29,0,45.0,0


In [9]:
mice_tumor_all_df["Mouse_Count"] = pd.DataFrame(mice_tumor_all_df.groupby('Mouse ID').cumcount())
mice_tumor_all_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites,Mouse_Count
0,k403,Ramicane,Male,21,16,0,45.000000,0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0,1
2,k403,Ramicane,Male,21,16,10,35.014271,1,2
3,k403,Ramicane,Male,21,16,15,34.223992,1,3
4,k403,Ramicane,Male,21,16,20,32.997729,1,4
...,...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2,5
1889,z969,Naftisol,Male,9,30,30,65.841013,3,6
1890,z969,Naftisol,Male,9,30,35,69.176246,4,7
1891,z969,Naftisol,Male,9,30,40,70.314904,4,8


In [10]:
mice_tumor_not_first_df = mice_tumor_all_df.groupby('Mouse_Count')
#this tells me how many have extra data
mice_tumor_not_first_df.filter(lambda x: (x['Mouse_Count'] >0).any())
mice_tumor_not_first_df.count()

Unnamed: 0_level_0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
Mouse_Count,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,249,249,249,249,249,249,249,249
1,237,237,237,237,237,237,237,237
2,222,222,222,222,222,222,222,222
3,206,206,206,206,206,206,206,206
4,194,194,194,194,194,194,194,194
5,183,183,183,183,183,183,183,183
6,171,171,171,171,171,171,171,171
7,154,154,154,154,154,154,154,154
8,143,143,143,143,143,143,143,143
9,131,131,131,131,131,131,131,131


In [11]:
#mice_tumor_not_first_df.filter(lambda x: (x['Mouse Count'] >0).any())
mice_tumor_all_df.groupby('Mouse_Count').count()[['Mouse ID']]


Unnamed: 0_level_0,Mouse ID
Mouse_Count,Unnamed: 1_level_1
0,249
1,237
2,222
3,206
4,194
5,183
6,171
7,154
8,143
9,131


In [12]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
# Optional: Get all the data for the duplicate mouse ID. 
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
# Checking the number of mice in the clean DataFrame.

In [19]:
#Mouse ID new and Timepoint '0'
mice_tumor_timepoint_0=pd.DataFrame(mice_tumor_all_df[mice_tumor_all_df.Timepoint == 0])
mice_tumor_timepoint_0

#250 count

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites,Mouse_Count
0,k403,Ramicane,Male,21,16,0,45.0,0,0
10,s185,Capomulin,Female,3,17,0,45.0,0,0
20,x401,Capomulin,Female,16,15,0,45.0,0,0
30,m601,Capomulin,Male,22,17,0,45.0,0,0
40,g791,Ramicane,Male,11,16,0,45.0,0,0
...,...,...,...,...,...,...,...,...,...
1858,z314,Stelasyn,Female,21,28,0,45.0,0,0
1860,z435,Propriva,Female,12,26,0,45.0,0,0
1863,z581,Infubinol,Female,24,25,0,45.0,0,0
1873,z795,Naftisol,Female,13,29,0,45.0,0,0


In [17]:
mice_tumor_all_df[mice_tumor_all_df.Mouse_Count == 0]
# This gives me 249 count

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites,Mouse_Count
0,k403,Ramicane,Male,21,16,0,45.0,0,0
10,s185,Capomulin,Female,3,17,0,45.0,0,0
20,x401,Capomulin,Female,16,15,0,45.0,0,0
30,m601,Capomulin,Male,22,17,0,45.0,0,0
40,g791,Ramicane,Male,11,16,0,45.0,0,0
...,...,...,...,...,...,...,...,...,...
1858,z314,Stelasyn,Female,21,28,0,45.0,0,0
1860,z435,Propriva,Female,12,26,0,45.0,0,0
1863,z581,Infubinol,Female,24,25,0,45.0,0,0
1873,z795,Naftisol,Female,13,29,0,45.0,0,0


In [21]:
mice_tumor_timepoint_count=pd.DataFrame(mice_tumor_timepoint_0[mice_tumor_timepoint_0.Mouse_Count == 1])
mice_tumor_timepoint_count
#find the odd ball tha has two time points of '0'

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites,Mouse_Count
909,g989,Propriva,Female,21,26,0,45.0,0,1


In [24]:
# FOUND you, now want to see you
mice_tumor_timepoint_count2=pd.DataFrame(mice_tumor_timepoint_0[mice_tumor_timepoint_0['Mouse ID'] == 'g989'])
mice_tumor_timepoint_count2

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites,Mouse_Count
908,g989,Propriva,Female,21,26,0,45.0,0,0
909,g989,Propriva,Female,21,26,0,45.0,0,1


In [25]:
mice_tumor_timepoint_not0=pd.DataFrame(mice_tumor_all_df[mice_tumor_all_df.Timepoint != 0])
mice_tumor_timepoint_not0
#1643 + 249 + 1(duplicate '0' g898)=1893 
#Gives all of the 'other' data

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites,Mouse_Count
1,k403,Ramicane,Male,21,16,5,38.825898,0,1
2,k403,Ramicane,Male,21,16,10,35.014271,1,2
3,k403,Ramicane,Male,21,16,15,34.223992,1,3
4,k403,Ramicane,Male,21,16,20,32.997729,1,4
5,k403,Ramicane,Male,21,16,25,33.464577,1,5
...,...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2,5
1889,z969,Naftisol,Male,9,30,30,65.841013,3,6
1890,z969,Naftisol,Male,9,30,35,69.176246,4,7
1891,z969,Naftisol,Male,9,30,40,70.314904,4,8


## Summary Statistics

In [26]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Use groupby and summary statistical methods to calculate the following properties of each drug regimen: 
# mean, median, variance, standard deviation, and SEM of the tumor volume. 
# Assemble the resulting series into a single summary dataframe.


#first find out how many different counts are in Drug Regimen.
mice_tumor_df["Drug Regimen"].unique()

array(['Ramicane', 'Capomulin', 'Infubinol', 'Placebo', 'Ceftamin',
       'Stelasyn', 'Zoniferol', 'Ketapril', 'Propriva', 'Naftisol'],
      dtype=object)

In [27]:
#TEST -Just to see what describe() will output
mice_tumor_df.describe()

Unnamed: 0,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
count,249.0,249.0,249.0,249.0,249.0
mean,12.730924,26.116466,0.0,45.0,0.0
std,7.228687,3.697003,0.0,0.0,0.0
min,1.0,15.0,0.0,45.0,0.0
25%,6.0,25.0,0.0,45.0,0.0
50%,13.0,27.0,0.0,45.0,0.0
75%,19.0,29.0,0.0,45.0,0.0
max,24.0,30.0,0.0,45.0,0.0


In [None]:
def 

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Using the aggregation method, produce the same summary statistics in a single line


## Bar and Pie Charts

In [None]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pandas.



In [None]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pyplot.



In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas



In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot



## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [None]:
# Put treatments into a list for for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin


In [None]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen


## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
