## Observations and Insights 

In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
from scipy.stats import sem

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

In [2]:
# Combine the data into a single dataset

#Need to look at the files so I know how to combine them
mouse_metadata.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g)
0,k403,Ramicane,Male,21,16
1,s185,Capomulin,Female,3,17
2,x401,Capomulin,Female,16,15
3,m601,Capomulin,Male,22,17
4,g791,Ramicane,Male,11,16


In [3]:
study_results.head()

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites
0,b128,0,45.0,0
1,f932,0,45.0,0
2,g107,0,45.0,0
3,a457,0,45.0,0
4,c819,0,45.0,0


In [4]:
# Display the data table for preview
mice_tumor_all_df= pd.merge(mouse_metadata,study_results, how= 'outer', on= 'Mouse ID' )
mice_tumor_all_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [5]:
# Checking the number of mice.
mice_count = len(mice_tumor_all_df["Mouse ID"].unique())
mice_count

249

In [6]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
# Optional: Get all the data for the duplicate mouse ID. 
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
# Checking the number of mice in the clean DataFrame.

In [7]:
#Mouse ID no duplicates
mice_ID_list = mice_tumor_all_df["Mouse ID"].unique()
mice_ID_list

array(['k403', 's185', 'x401', 'm601', 'g791', 's508', 'f966', 'm546',
       'z578', 'j913', 'u364', 'n364', 'y793', 'r554', 'm957', 'c758',
       't565', 'a644', 'i177', 'j989', 'i738', 'a520', 'w914', 'r811',
       'g288', 'i334', 'q610', 'd251', 'l897', 'c458', 'b742', 'b128',
       'j246', 'a411', 'j119', 'w150', 'v923', 'g316', 's710', 'l509',
       'r944', 'e662', 'u196', 'q597', 'a444', 'i557', 'r921', 'w678',
       'y449', 'a203', 'a251', 'a262', 'a275', 'a366', 'a401', 'a457',
       'a492', 'a577', 'a685', 'a699', 'a788', 'a818', 'a897', 'a963',
       'b313', 'b447', 'b487', 'b559', 'b759', 'b879', 'c139', 'c264',
       'c282', 'c302', 'c326', 'c402', 'c559', 'c580', 'c757', 'c766',
       'c819', 'c832', 'c895', 'c927', 'd133', 'd164', 'd474', 'e213',
       'e227', 'e291', 'e476', 'e584', 'f129', 'f234', 'f278', 'f345',
       'f394', 'f436', 'f545', 'f932', 'f993', 'g107', 'g296', 'g497',
       'g558', 'g570', 'g867', 'g989', 'h246', 'h333', 'h428', 'h531',
      

In [8]:
#this is a CLEAN dataframe to use- has only 1 count per 'Mouse ID'
mice_tumor_df= mice_tumor_all_df.groupby('Mouse ID').head(1)
mice_tumor_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
10,s185,Capomulin,Female,3,17,0,45.0,0
20,x401,Capomulin,Female,16,15,0,45.0,0
30,m601,Capomulin,Male,22,17,0,45.0,0
40,g791,Ramicane,Male,11,16,0,45.0,0
...,...,...,...,...,...,...,...,...
1858,z314,Stelasyn,Female,21,28,0,45.0,0
1860,z435,Propriva,Female,12,26,0,45.0,0
1863,z581,Infubinol,Female,24,25,0,45.0,0
1873,z795,Naftisol,Female,13,29,0,45.0,0


In [9]:
mice_tumor_all_df["Mouse_Count"] = pd.DataFrame(mice_tumor_all_df.groupby('Mouse ID').cumcount())
mice_tumor_all_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites,Mouse_Count
0,k403,Ramicane,Male,21,16,0,45.000000,0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0,1
2,k403,Ramicane,Male,21,16,10,35.014271,1,2
3,k403,Ramicane,Male,21,16,15,34.223992,1,3
4,k403,Ramicane,Male,21,16,20,32.997729,1,4
...,...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2,5
1889,z969,Naftisol,Male,9,30,30,65.841013,3,6
1890,z969,Naftisol,Male,9,30,35,69.176246,4,7
1891,z969,Naftisol,Male,9,30,40,70.314904,4,8


In [10]:
mice_tumor_not_first_df = mice_tumor_all_df.groupby('Mouse_Count')
#this tells me how many have extra data
mice_tumor_not_first_df.filter(lambda x: (x['Mouse_Count'] >0).any())
mice_tumor_not_first_df.count()

Unnamed: 0_level_0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
Mouse_Count,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,249,249,249,249,249,249,249,249
1,237,237,237,237,237,237,237,237
2,222,222,222,222,222,222,222,222
3,206,206,206,206,206,206,206,206
4,194,194,194,194,194,194,194,194
5,183,183,183,183,183,183,183,183
6,171,171,171,171,171,171,171,171
7,154,154,154,154,154,154,154,154
8,143,143,143,143,143,143,143,143
9,131,131,131,131,131,131,131,131


In [11]:
#mice_tumor_not_first_df.filter(lambda x: (x['Mouse Count'] >0).any())
mice_tumor_all_df.groupby('Mouse_Count').count()[['Mouse ID']]


Unnamed: 0_level_0,Mouse ID
Mouse_Count,Unnamed: 1_level_1
0,249
1,237
2,222
3,206
4,194
5,183
6,171
7,154
8,143
9,131


In [12]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
# Optional: Get all the data for the duplicate mouse ID. 
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
# Checking the number of mice in the clean DataFrame.

In [13]:
#Mouse ID new and Timepoint '0'
mice_tumor_timepoint_0=pd.DataFrame(mice_tumor_all_df[mice_tumor_all_df.Timepoint == 0])
mice_tumor_timepoint_0

#250 count

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites,Mouse_Count
0,k403,Ramicane,Male,21,16,0,45.0,0,0
10,s185,Capomulin,Female,3,17,0,45.0,0,0
20,x401,Capomulin,Female,16,15,0,45.0,0,0
30,m601,Capomulin,Male,22,17,0,45.0,0,0
40,g791,Ramicane,Male,11,16,0,45.0,0,0
...,...,...,...,...,...,...,...,...,...
1858,z314,Stelasyn,Female,21,28,0,45.0,0,0
1860,z435,Propriva,Female,12,26,0,45.0,0,0
1863,z581,Infubinol,Female,24,25,0,45.0,0,0
1873,z795,Naftisol,Female,13,29,0,45.0,0,0


In [14]:
mice_tumor_all_df[mice_tumor_all_df.Mouse_Count == 0]
# This gives me 249 count

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites,Mouse_Count
0,k403,Ramicane,Male,21,16,0,45.0,0,0
10,s185,Capomulin,Female,3,17,0,45.0,0,0
20,x401,Capomulin,Female,16,15,0,45.0,0,0
30,m601,Capomulin,Male,22,17,0,45.0,0,0
40,g791,Ramicane,Male,11,16,0,45.0,0,0
...,...,...,...,...,...,...,...,...,...
1858,z314,Stelasyn,Female,21,28,0,45.0,0,0
1860,z435,Propriva,Female,12,26,0,45.0,0,0
1863,z581,Infubinol,Female,24,25,0,45.0,0,0
1873,z795,Naftisol,Female,13,29,0,45.0,0,0


In [15]:
mice_tumor_timepoint_count=pd.DataFrame(mice_tumor_timepoint_0[mice_tumor_timepoint_0.Mouse_Count == 1])
mice_tumor_timepoint_count
#find the odd ball tha has two time points of '0'

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites,Mouse_Count
909,g989,Propriva,Female,21,26,0,45.0,0,1


In [16]:
# FOUND YOU, -DUplicate Data- now want to see you
mice_tumor_timepoint_count2=pd.DataFrame(mice_tumor_timepoint_0[mice_tumor_timepoint_0['Mouse ID'] == 'g989'])
mice_tumor_timepoint_count2

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites,Mouse_Count
908,g989,Propriva,Female,21,26,0,45.0,0,0
909,g989,Propriva,Female,21,26,0,45.0,0,1


In [17]:
mice_tumor_timepoint_not0=pd.DataFrame(mice_tumor_all_df[mice_tumor_all_df.Timepoint != 0])
mice_tumor_timepoint_not0
#1643 + 249 + 1(duplicate '0' g898)=1893 
#Gives all of the 'other' data

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites,Mouse_Count
1,k403,Ramicane,Male,21,16,5,38.825898,0,1
2,k403,Ramicane,Male,21,16,10,35.014271,1,2
3,k403,Ramicane,Male,21,16,15,34.223992,1,3
4,k403,Ramicane,Male,21,16,20,32.997729,1,4
5,k403,Ramicane,Male,21,16,25,33.464577,1,5
...,...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2,5
1889,z969,Naftisol,Male,9,30,30,65.841013,3,6
1890,z969,Naftisol,Male,9,30,35,69.176246,4,7
1891,z969,Naftisol,Male,9,30,40,70.314904,4,8


## Summary Statistics

In [18]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Use groupby and summary statistical methods to calculate the following properties of each drug regimen: 
# mean, median, variance, standard deviation, and SEM of the tumor volume. 
# Assemble the resulting series into a single summary dataframe.


#first find out how many different counts are in Drug Regimen.
drug_names = mice_tumor_df["Drug Regimen"].unique()
drug_names

array(['Ramicane', 'Capomulin', 'Infubinol', 'Placebo', 'Ceftamin',
       'Stelasyn', 'Zoniferol', 'Ketapril', 'Propriva', 'Naftisol'],
      dtype=object)

In [19]:
#TEST -Just to see what describe() will output
mice_tumor_df["Drug Regimen"].describe()

count          249
unique          10
top       Ketapril
freq            25
Name: Drug Regimen, dtype: object

In [20]:
 #tumor_size_array = mice_tumor_df.groupby('Drug Regimen').index.get_level_values('Tumor Volume (mm3)')
 #tumor_size_array   
    

In [21]:
#How to pull out a specific set of information!!!

tumor_size_array = pd.DataFrame(mice_tumor_df[mice_tumor_df['Drug Regimen'] == "Placebo"])
tumor_size_array

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
474,a262,Placebo,Female,17,29,0,45.0,0
568,a897,Placebo,Male,7,28,0,45.0,0
648,c282,Placebo,Male,12,27,0,45.0,0
697,c757,Placebo,Male,9,27,0,45.0,0
707,c766,Placebo,Female,13,26,0,45.0,0
775,e227,Placebo,Male,1,30,0,45.0,0
950,i477,Placebo,Female,3,30,0,45.0,0
963,i669,Placebo,Female,18,27,0,45.0,0
979,j235,Placebo,Male,6,30,0,45.0,0
1163,l872,Placebo,Male,19,30,0,45.0,0


In [22]:
tumorsize= tumor_size_array["Tumor Volume (mm3)"]
tumorsize

474     45.0
568     45.0
648     45.0
697     45.0
707     45.0
775     45.0
950     45.0
963     45.0
979     45.0
1163    45.0
1229    45.0
1261    45.0
1295    45.0
1417    45.0
1435    45.0
1476    45.0
1493    45.0
1499    45.0
1552    45.0
1572    45.0
1617    45.0
1646    45.0
1671    45.0
1778    45.0
1824    45.0
Name: Tumor Volume (mm3), dtype: float64

In [23]:
tumor_size_data_df = pd.DataFrame([[0],
                                      [0],
                                      [0],
                                      [0],
                                      [0]],
                                  index= ['Mean', 'Median', 'Variance', 'St Dev', 'SEM'],
                                  columns=[]) 
tumor_size_data_df

Mean
Median
Variance
St Dev
SEM


In [24]:
# mean, median, variance, standard deviation, and SEM of the tumor volume. 



for x in drug_names:
    tumor_size_array = pd.DataFrame(mice_tumor_df[mice_tumor_df['Drug Regimen'] == x])
    tumorsize= tumor_size_array["Tumor Volume (mm3)"]
    
    mean_tumor = np.mean(tumorsize)
    median_tumor = np.median(tumorsize)
    var_tumor = np.var(tumorsize)
    st_tumor = np.std(tumorsize)
    sem_tumor = sem(tumorsize)
    
    tumor_size_data_df[x] = pd.DataFrame([[mean_tumor],
                                          [median_tumor],
                                          [var_tumor],
                                          [st_tumor],
                                          [sem_tumor]],
                                        index= ['Mean', 'Median', 'Variance', 'St Dev', 'SEM'],
                                        columns=[x])

#used to test my calculations    
#print(f"The mean temperature at the LAX airport is {sem_tumor}")
tumor_size_data_df

Unnamed: 0,Ramicane,Capomulin,Infubinol,Placebo,Ceftamin,Stelasyn,Zoniferol,Ketapril,Propriva,Naftisol
Mean,45.0,45.0,45.0,45.0,45.0,45.0,45.0,45.0,45.0,45.0
Median,45.0,45.0,45.0,45.0,45.0,45.0,45.0,45.0,45.0,45.0
Variance,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
St Dev,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SEM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
tumor_size_all_data_df = pd.DataFrame([[0],
                                      [0],
                                      [0],
                                      [0],
                                      [0]],
                                  index= ['Mean', 'Median', 'Variance', 'St Dev', 'SEM'],
                                  columns=[]) 
tumor_size_all_data_df

Mean
Median
Variance
St Dev
SEM


In [29]:
# mean, median, variance, standard deviation, and SEM of the tumor volume. 

#This DF is for ALL data


for x in drug_names:
    tumor_size_array = pd.DataFrame(mice_tumor_all_df[mice_tumor_all_df['Drug Regimen'] == x])
    tumorsize= tumor_size_array["Tumor Volume (mm3)"]
    
    mean_tumor = round(np.mean(tumorsize),2)
    median_tumor = round(np.median(tumorsize),2)
    var_tumor = round(np.var(tumorsize),2)
    st_tumor = round(np.std(tumorsize),2)
    sem_tumor = round(sem(tumorsize),2)
    
    tumor_size_all_data_df[x] = pd.DataFrame([[mean_tumor],
                                          [median_tumor],
                                          [var_tumor],
                                          [st_tumor],
                                          [sem_tumor]],
                                        index= ['Mean', 'Median', 'Variance', 'St Dev', 'SEM'],
                                        columns=[x])

#used to test my calculations    
#print(f"The mean temperature at the LAX airport is {sem_tumor}")
tumor_size_all_data_df

Unnamed: 0,Ramicane,Capomulin,Infubinol,Placebo,Ceftamin,Stelasyn,Zoniferol,Ketapril,Propriva,Naftisol
Mean,40.22,40.68,52.88,54.03,52.59,54.23,53.24,55.24,52.32,54.33
Median,40.67,41.56,51.82,52.29,51.78,52.43,51.82,53.7,50.85,52.51
Variance,23.38,24.84,42.89,60.83,39.07,59.12,48.27,68.19,42.09,65.82
St Dev,4.84,4.98,6.55,7.8,6.25,7.69,6.95,8.26,6.49,8.11
SEM,0.32,0.33,0.49,0.58,0.47,0.57,0.52,0.6,0.51,0.6


IndentationError: unexpected indent (<ipython-input-28-aa5e8586df68>, line 6)

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Using the aggregation method, produce the same summary statistics in a single line


## Bar and Pie Charts

In [None]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pandas.



In [None]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pyplot.



In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas



In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot



## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [None]:
# Put treatments into a list for for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin


In [None]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen


## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
