In [63]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Locations of the two input CSV files (relative to the notebook's working directory)
mouse_metadata_path = "Mouse_metadata.csv"
study_results_path = "Study_results.csv"

# Load each file into its own DataFrame
mouse_metadata, study_results = (
    pd.read_csv(csv_path)
    for csv_path in (mouse_metadata_path, study_results_path)
)

# Join the per-mouse metadata onto the study measurements using the shared
# 'Mouse ID' column (default inner join)
mouse_df = mouse_metadata.merge(study_results, on='Mouse ID')

# Preview the first rows of the combined table
mouse_df.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [64]:
# Print a summary of the merged frame: index range, per-column non-null
# counts, column dtypes and memory usage
mouse_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1893 entries, 0 to 1892
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Mouse ID            1893 non-null   object 
 1   Drug Regimen        1893 non-null   object 
 2   Sex                 1893 non-null   object 
 3   Age_months          1893 non-null   int64  
 4   Weight (g)          1893 non-null   int64  
 5   Timepoint           1893 non-null   int64  
 6   Tumor Volume (mm3)  1893 non-null   float64
 7   Metastatic Sites    1893 non-null   int64  
dtypes: float64(1), int64(4), object(3)
memory usage: 133.1+ KB


In [65]:
# Count the distinct mice present in the merged data
mouse_ids = mouse_df['Mouse ID']
num_mice = mouse_ids.nunique()
num_mice

249

In [66]:
# Flag every row that repeats an earlier row in all columns, then tally the flags
# NOTE(review): this compares whole rows only; repeated 'Mouse ID' / 'Timepoint'
# pairs whose measurements differ would not be counted here. Confirm whether a
# key-based check (subset=['Mouse ID', 'Timepoint']) is also needed.
is_repeated_row = mouse_df.duplicated()
duplicate = is_repeated_row.value_counts()
duplicate

False    1892
True        1
dtype: int64

In [67]:
# Show the rows that are exact repeats (in every column) of an earlier row
repeat_mask = mouse_df.duplicated()
duplicated = mouse_df.loc[repeat_mask]
duplicated

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
909,g989,Propriva,Female,21,26,0,45.0,0


In [71]:
# Remove exact repeat rows, retaining the first occurrence of each
# (keeping the first occurrence is the drop_duplicates default)
mouse_df = mouse_df.drop_duplicates()

# Re-tally the duplicate flags on the cleaned frame; only False should remain
remaining_flags = mouse_df.duplicated()
duplicates_new = remaining_flags.value_counts()
duplicates_new

False    1892
dtype: int64

In [72]:
# Print the frame summary again after dropping the duplicate row, to confirm
# the new row count and that every column is still fully populated
mouse_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1892 entries, 0 to 1892
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Mouse ID            1892 non-null   object 
 1   Drug Regimen        1892 non-null   object 
 2   Sex                 1892 non-null   object 
 3   Age_months          1892 non-null   int64  
 4   Weight (g)          1892 non-null   int64  
 5   Timepoint           1892 non-null   int64  
 6   Tumor Volume (mm3)  1892 non-null   float64
 7   Metastatic Sites    1892 non-null   int64  
dtypes: float64(1), int64(4), object(3)
memory usage: 133.0+ KB


In [74]:
# Summary Statistics (all regimens pooled)
#
# This cell previously held only commented-out code while still carrying a
# recorded output, so a fresh "Run All" could not reproduce it. The
# computation is restored under `overall_*` names so it does not collide with
# the per-regimen `tumor_volume_mean` / `tumor_volume_median` Series assigned
# in the following cell.

# Mean tumor volume over every observation in the study
overall_tumor_volume_mean = mouse_df['Tumor Volume (mm3)'].mean()

# Median tumor volume over every observation in the study
overall_tumor_volume_median = mouse_df['Tumor Volume (mm3)'].median()

# Display both values together as a (mean, median) tuple
overall_tumor_volume_mean, overall_tumor_volume_median

(50.45126032511628, 48.95469674)

In [75]:
# Group the observations by treatment so each statistic is computed per regimen
drug_groupby = mouse_df.groupby('Drug Regimen')

# Select the tumor-volume column of the grouped data once and reuse it
tumor_volume_by_drug = drug_groupby['Tumor Volume (mm3)']

# Per-regimen mean, median and variance of tumor volume
tumor_volume_mean = tumor_volume_by_drug.mean()
tumor_volume_median = tumor_volume_by_drug.median()
tumor_volume_variance = tumor_volume_by_drug.var()

# Show the three per-regimen Series together
tumor_volume_mean, tumor_volume_median, tumor_volume_variance

Drug Regimen
Capomulin    40.675741
Ceftamin     52.591172
Infubinol    52.884795
Ketapril     55.235638
Naftisol     54.331565
Placebo      54.033581
Propriva     52.368318
Ramicane     40.216745
Stelasyn     54.233149
Zoniferol    53.236507
Name: Tumor Volume (mm3), dtype: float64