In [1]:
# Add Dependencies and Setup
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as st

In [2]:
# Locations of the two raw CSV inputs (relative to the notebook's working directory)
mouse_metadata_path = "raw_data/mouse_metadata.csv"
study_results_path = "raw_data/study_results.csv"

# Load both files into DataFrames, mouse metadata first and study results second
mouse_metadata, study_results = (
    pd.read_csv(csv_path)
    for csv_path in (mouse_metadata_path, study_results_path)
)

In [3]:
# Show the first five rows of mouse_metadata
mouse_metadata.head(n=5)

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g)
0,k403,Ramicane,Male,21,16
1,s185,Capomulin,Female,3,17
2,x401,Capomulin,Female,16,15
3,m601,Capomulin,Male,22,17
4,g791,Ramicane,Male,11,16


In [4]:
# Show the first five rows of study_results
study_results.head(n=5)

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites
0,b128,0,45.0,0
1,f932,0,45.0,0
2,g107,0,45.0,0
3,a457,0,45.0,0
4,c819,0,45.0,0


In [5]:
# Join each mouse's metadata onto its study measurements (inner join keyed on Mouse ID).
# NOTE: despite the variable name, no drug-regimen filter is applied here.
capomulin_study_data = mouse_metadata.merge(study_results, how="inner", on="Mouse ID")
capomulin_study_data.head(n=5)

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [7]:
# Data-quality check: a mouse should have at most one row per Timepoint.
# keep=False flags every row of a repeated (Mouse ID, Timepoint) pair, not just the later ones.
duplicate_timepoints = capomulin_study_data.loc[
    lambda frame: frame.duplicated(subset=["Mouse ID", "Timepoint"], keep=False)
]

if duplicate_timepoints.empty:
    print("No duplicate Timepoints found in capomulin_study_data")
else:
    print("Duplicate Timepoints found in capomulin_study_data")
    print(duplicate_timepoints)

Duplicate Timepoints found in capomulin_study_data
    Mouse ID Drug Regimen     Sex  Age_months  Weight (g)  Timepoint  \
908     g989     Propriva  Female          21          26          0   
909     g989     Propriva  Female          21          26          0   
910     g989     Propriva  Female          21          26          5   
911     g989     Propriva  Female          21          26          5   
912     g989     Propriva  Female          21          26         10   
913     g989     Propriva  Female          21          26         10   
914     g989     Propriva  Female          21          26         15   
915     g989     Propriva  Female          21          26         15   
916     g989     Propriva  Female          21          26         20   
917     g989     Propriva  Female          21          26         20   

     Tumor Volume (mm3)  Metastatic Sites  
908           45.000000                 0  
909           45.000000                 0  
910           48.786801 

In [15]:
# Average the repeated Tumor Volume (mm3) and Metastatic Sites readings
# for every (Mouse ID, Timepoint) pair found in duplicate_timepoints
group_duplicate_timepoints = (
    duplicate_timepoints
    .groupby(["Mouse ID", "Timepoint"])[["Tumor Volume (mm3)", "Metastatic Sites"]]
    .mean()
    .reset_index()
)
print(group_duplicate_timepoints)

  Mouse ID  Timepoint  Tumor Volume (mm3)  Metastatic Sites
0     g989          0           45.000000               0.0
1     g989          5           48.178597               0.0
2     g989         10           50.812842               0.0
3     g989         15           52.383936               0.5
4     g989         20           54.991886               1.0


In [21]:
# Build the negated per-(Mouse ID, Timepoint) means of Tumor Volume (mm3) and Metastatic Sites.
# The negated values are derived from duplicate_timepoints on every run instead of multiplying
# group_duplicate_timepoints by -1 in place: the in-place version flipped the sign back each
# time the cell was re-run, which silently turned the later "sum" into first + mean.
group_duplicate_timepoints = (
    duplicate_timepoints
    .groupby(["Mouse ID", "Timepoint"])[["Tumor Volume (mm3)", "Metastatic Sites"]]
    .mean()
    .mul(-1)  # negate so that adding these to the first readings yields (first - mean)
    .reset_index()
)
print(group_duplicate_timepoints)

  Mouse ID  Timepoint  Tumor Volume (mm3)  Metastatic Sites
0     g989          0          -45.000000              -0.0
1     g989          5          -48.178597              -0.0
2     g989         10          -50.812842              -0.0
3     g989         15          -52.383936              -0.5
4     g989         20          -54.991886              -1.0


In [20]:
# Take the first recorded row for each (Mouse ID, Timepoint) pair in duplicate_timepoints,
# keeping only the key and measurement columns, for comparison with group_duplicate_timepoints
first_instance = (
    duplicate_timepoints
    .drop(columns=["Drug Regimen", "Sex", "Age_months", "Weight (g)"])
    .drop_duplicates(subset=["Mouse ID", "Timepoint"], keep="first")
)
print(first_instance)


    Mouse ID  Timepoint  Tumor Volume (mm3)  Metastatic Sites
908     g989          0           45.000000                 0
910     g989          5           48.786801                 0
912     g989         10           51.745156                 0
914     g989         15           51.325852                 1
916     g989         20           55.326122                 1


In [22]:
# Pair each first reading with the group_duplicate_timepoints value for the same
# (Mouse ID, Timepoint), add the two measurements together, and keep only the sums
duplicate_test = (
    first_instance
    .merge(group_duplicate_timepoints, on=["Mouse ID", "Timepoint"], suffixes=("_first", "_mean"))
    .assign(**{
        "Tumor Volume (mm3)": lambda paired: paired["Tumor Volume (mm3)_first"] + paired["Tumor Volume (mm3)_mean"],
        "Metastatic Sites": lambda paired: paired["Metastatic Sites_first"] + paired["Metastatic Sites_mean"],
    })
    .drop(columns=[
        "Tumor Volume (mm3)_first",
        "Metastatic Sites_first",
        "Tumor Volume (mm3)_mean",
        "Metastatic Sites_mean",
    ])
)
print(duplicate_test)


  Mouse ID  Timepoint  Tumor Volume (mm3)  Metastatic Sites
0     g989          0            0.000000               0.0
1     g989          5            0.608205               0.0
2     g989         10            0.932314               0.0
3     g989         15           -1.058084               0.5
4     g989         20            0.334236               0.0


In [23]:
# Keep the rows where at least one of Tumor Volume (mm3) / Metastatic Sites is non-zero
# (equivalently: discard rows where both are exactly 0).
# NOTE(review): this is an exact comparison against 0 on float values — confirm that is acceptable.
duplicate_test = duplicate_test.loc[
    ~((duplicate_test["Tumor Volume (mm3)"] == 0) & (duplicate_test["Metastatic Sites"] == 0))
]
print(duplicate_test)


  Mouse ID  Timepoint  Tumor Volume (mm3)  Metastatic Sites
1     g989          5            0.608205               0.0
2     g989         10            0.932314               0.0
3     g989         15           -1.058084               0.5
4     g989         20            0.334236               0.0


In [24]:
# Collect the distinct Mouse ID values that remain in duplicate_test
duplicate_mice = pd.unique(duplicate_test["Mouse ID"])
print(duplicate_mice)

['g989']


In [25]:
# Remove every row belonging to a mouse listed in duplicate_mice, then renumber the index from 0
capomulin_study_data_clean = (
    capomulin_study_data
    .loc[lambda frame: ~frame["Mouse ID"].isin(duplicate_mice)]
    .reset_index(drop=True)
)
capomulin_study_data_clean.head(n=5)

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [None]:
# If capomulin_study_data had contained exact duplicate entries (every field identical across the repeated rows), this is where I would drop the duplicates and keep the first entry.

In [26]:
# Re-run the data-quality check on the cleaned frame: no (Mouse ID, Timepoint) pair should repeat.
# keep=False flags every row of a repeated pair, not just the later ones.
duplicate_timepoints_2 = capomulin_study_data_clean.loc[
    lambda frame: frame.duplicated(subset=["Mouse ID", "Timepoint"], keep=False)
]

if duplicate_timepoints_2.empty:
    print("No duplicate Timepoints found in capomulin_study_data_clean")
else:
    print("Duplicate Timepoints found in capomulin_study_data_clean")
    print(duplicate_timepoints_2)

No duplicate Timepoints found in capomulin_study_data_clean
