# Pymaceuticals Inc.
---
#### Assignment Contents
This assignment is broken down into the following tasks:
Prepare the data // Generate summary statistics // Create bar charts and pie charts // Calculate quartiles, find outliers, and create a box plot // Create a line plot and a scatter plot // Calculate correlation and regression // Submit Final analysis

## Prepare the Data 

Necessary Requirements:
The datasets are merged into a single DataFrame. (6 points)
The number of mice is shown from the merged DataFrame. (2 points)
Each duplicate mouse is found based on the Mouse ID and Timepoint. (6 points)
A clean DataFrame is created with the duplicate mice dropped. (4 points)
The number of mice is shown from the clean DataFrame. (2 points)

In [23]:
# Run the provided package dependency and data imports, and then merge the mouse_metadata and study_results DataFrames into a single DataFrame

# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Locations of the two input data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse metadata and the study results into pandas DataFrames
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single DataFrame.
# The left join keeps every study_results row and attaches the matching
# mouse_metadata columns; the join key only needs to be named once.
mouse_study_data_complete_df = pd.merge(study_results, mouse_metadata, how="left", on="Mouse ID")

# Display the first rows of the merged table as a preview
mouse_study_data_complete_df.head()

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,b128,0,45.0,0,Capomulin,Female,9,22
1,f932,0,45.0,0,Ketapril,Male,15,29
2,g107,0,45.0,0,Ketapril,Female,2,29
3,a457,0,45.0,0,Ketapril,Female,11,30
4,c819,0,45.0,0,Ketapril,Male,21,25


In [26]:
# Non-null entries per column; every column reporting 1893 means no information is missing
non_null_counts_per_column = mouse_study_data_complete_df.count()
non_null_counts_per_column

Mouse ID              1893
Timepoint             1893
Tumor Volume (mm3)    1893
Metastatic Sites      1893
Drug Regimen          1893
Sex                   1893
Age_months            1893
Weight (g)            1893
dtype: int64

In [51]:
# Checking the number of mice

# Select the ID column once and reuse it for every tally below
mouse_id_column = mouse_study_data_complete_df["Mouse ID"]

# Number of non-null "Mouse ID" entries, i.e. one per measurement row (1893)
total_Mouse_ID_count = mouse_id_column.count()

# Distinct mouse IDs: the array of IDs themselves, and how many there are
unique_Mouse_ID = mouse_id_column.unique()
unique_Mouse_ID_count = mouse_id_column.nunique()

# Echo the number of distinct mice
unique_Mouse_ID_count

249

In [82]:
# Check for any mouse ID that has more than one record at the same timepoint

# First, establish which timepoints exist and how many there are
# (10 are expected, so each mouse should appear at most 10 times)
unique_Timepoints = mouse_study_data_complete_df["Timepoint"].unique()
unique_Timepoint_count = mouse_study_data_complete_df["Timepoint"].nunique()

# Next, verify that no unique Mouse ID is repeated within a timepoint:
# flag every row whose (Mouse ID, Timepoint) pair already appeared in an earlier row,
# then collect the distinct mouse IDs those flagged rows belong to
repeated_pair_mask = mouse_study_data_complete_df.duplicated(subset=["Mouse ID", "Timepoint"])
duplicate_Mouse_ID_Timepoint = mouse_study_data_complete_df.loc[repeated_pair_mask, "Mouse ID"].unique()
duplicate_Mouse_ID_Timepoint

array(['g989'], dtype=object)

In [90]:
# Optional: Get all the data for the duplicate mouse ID(s).
# The IDs come from the duplicate check (duplicate_Mouse_ID_Timepoint, i.e. 'g989')
# instead of being hard-coded, so this keeps working if the detected IDs change.
duplicate_Mouse_ID_data = mouse_study_data_complete_df[mouse_study_data_complete_df["Mouse ID"].isin(duplicate_Mouse_ID_Timepoint)]
duplicate_Mouse_ID_data
# NOTE: this is every row recorded for the affected mouse, not only the repeated rows.
# The rows shown hold two records for each of timepoints 0, 5, 10, 15 and 20,
# so one record per timepoint (5 rows in total) is redundant and needs to be removed.

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
107,g989,0,45.0,0,Propriva,Female,21,26
137,g989,0,45.0,0,Propriva,Female,21,26
329,g989,5,48.786801,0,Propriva,Female,21,26
360,g989,5,47.570392,0,Propriva,Female,21,26
620,g989,10,51.745156,0,Propriva,Female,21,26
681,g989,10,49.880528,0,Propriva,Female,21,26
815,g989,15,51.325852,1,Propriva,Female,21,26
869,g989,15,53.44202,0,Propriva,Female,21,26
950,g989,20,55.326122,1,Propriva,Female,21,26
1111,g989,20,54.65765,1,Propriva,Female,21,26


In [105]:
# Create a clean DataFrame: for every (Mouse ID, Timepoint) pair keep only the
# first record, then renumber the rows from 0.
# NOTE(review): this keeps the duplicate mouse with one record per timepoint rather
# than removing the mouse entirely - confirm this is the intended clean-up.
duplicate_key_columns = ['Mouse ID', 'Timepoint']
cleaned_mouse_study_data_complete_df = mouse_study_data_complete_df.drop_duplicates(
    subset=duplicate_key_columns,
    keep='first',
    ignore_index=True,
)
display(cleaned_mouse_study_data_complete_df)

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,b128,0,45.000000,0,Capomulin,Female,9,22
1,f932,0,45.000000,0,Ketapril,Male,15,29
2,g107,0,45.000000,0,Ketapril,Female,2,29
3,a457,0,45.000000,0,Ketapril,Female,11,30
4,c819,0,45.000000,0,Ketapril,Male,21,25
...,...,...,...,...,...,...,...,...
1883,r944,45,41.581521,2,Capomulin,Male,12,25
1884,u364,45,31.023923,3,Capomulin,Male,18,17
1885,p438,45,61.433892,1,Ceftamin,Female,11,26
1886,x773,45,58.634971,4,Placebo,Female,21,30


In [107]:
# Verify the modifications were carried out correctly.
# Only the last expression of a cell is echoed, so the first check is shown
# explicitly with display(); otherwise its result would be silently discarded.
display(cleaned_mouse_study_data_complete_df.count())  # should be 1888 per column: original count (1893) minus the 5 removed rows
cleaned_mouse_study_data_complete_df["Mouse ID"].nunique()  # should be 249: same as the original, since only repeated rows were removed

249