## Observations and Insights 

In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
pd.set_option("display.precision", 2)

# Locations of the two study CSV files
mouse_metadata_path = "Resources/Mouse_metadata.csv"
study_results_path = "Resources/Study_results.csv"

# Load the mouse metadata and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Join both tables into a single dataset
# (no key is given, so pandas joins on the columns the two frames share)
merged_data = mouse_metadata.merge(study_results)

# Shorter alias used by the rest of the notebook
df = merged_data

# Preview of the data table
# (not rendered: a cell only displays its last expression)
df.head()

# Number of rows recorded for each mouse
df['Mouse ID'].value_counts()

# g989 has 13 rows, 3 more than any other mouse, which suggests duplicated records


g989    13
x402    10
q633    10
h246    10
a963    10
        ..
h428     1
f932     1
v199     1
l872     1
d133     1
Name: Mouse ID, Length: 249, dtype: int64

In [2]:
# Count the number of unique mice in the merged (not yet cleaned) data.

# One representative row per mouse: the first row of each ID after sorting by ID.
df_uniq = df.sort_values('Mouse ID').drop_duplicates(subset='Mouse ID', keep='first')

# nunique() counts the distinct IDs directly, without building a de-duplicated frame first.
num_unique_mice = df['Mouse ID'].nunique()

print(f"There were {num_unique_mice} mice used in this experiment")

There were 249 mice used in this experiment


In [3]:
# Find mice that have more than one record for the same (Mouse ID, Timepoint) pair,
# drop every row belonging to those mice, and verify the resulting counts.

key_cols = ['Mouse ID', 'Timepoint']
df_sorted = df.sort_values(by=key_cols)

# Rows that repeat an earlier (Mouse ID, Timepoint) pair. duplicated() defaults to
# keep='first', so the first occurrence of each pair is not flagged.
dupes = df_sorted[df_sorted.duplicated(key_cols)]

# Derive the affected mice from the data instead of hard-coding an ID,
# so the cleaning step stays correct if the input files change.
duplicate_mouse_ids = dupes['Mouse ID'].unique()
is_duplicate_mouse = df['Mouse ID'].isin(duplicate_mouse_ids)

# Every row (duplicated or not) that belongs to an affected mouse
duplicated_mouse_data = df[is_duplicate_mouse]

# Remove ALL rows for the affected mice, not only the repeated ones.
# .copy() makes the cleaned frame independent of df, so later column
# assignments do not raise SettingWithCopyWarning.
filtered_df = df[~is_duplicate_mouse].copy()

# One row per remaining mouse, and the number of mice left in the experiment
new_df_uniq = filtered_df.drop_duplicates(subset='Mouse ID', keep='first')
new_unique_mice = filtered_df['Mouse ID'].nunique()

# Row counts used to check that only the affected mice were removed
original_num = df['Mouse ID'].count()
num_dups = dupes['Mouse ID'].count()
num_dup_rows = duplicated_mouse_data['Mouse ID'].count()
filtered_num = filtered_df['Mouse ID'].count()

# Validate the cleaning step instead of relying on reading the printed numbers
assert not filtered_df.duplicated(key_cols).any(), "duplicate (Mouse ID, Timepoint) rows remain"
assert filtered_num == original_num - num_dup_rows, "unexpected number of rows removed"

dup_id_label = ", ".join(map(str, duplicate_mouse_ids))

print(f"There were originally {num_unique_mice} mice used in this experiment")
print(f"After dups removed there are {new_unique_mice} mice represented in the data")

print(f"original total of observations = {original_num}")
print(f"number duplicated rows identified = {num_dups}")
print(f"number of rows from the problem mouse ({dup_id_label}) = {num_dup_rows}")
print(f"number of observations remaining after {dup_id_label} removed = {filtered_num}")


There were originally 249 mice used in this experiment
After dups removed there are 248 mice represented in the data
original total of observations = 1893
number duplicated rows identified = 5
number of rows from the problem mouse (g989) = 13
number of observations remaining after g989 removed = 1880


## Summary Statistics

In [27]:
# Generate a summary statistics table of mean, median, variance, standard deviation,
# and SEM of the tumor volume for each regimen

clean = filtered_df

# Group the cleaned data by drug regimen (reused by the aggregation cell below)
gby_regimen = clean.groupby('Drug Regimen')

# Compute every statistic in one aggregation so the drug labels and the values
# come from the same grouped result and cannot get out of step with each other.
# This also avoids a variable named `vars`, which would shadow the Python builtin.
stats_dict_df = (
    gby_regimen['Tumor Volume (mm3)']
    .agg(['mean', 'median', 'std', 'var', 'sem'])
    .rename(columns={'mean': 'Mean',
                     'median': 'Median',
                     'std': 'St. Dev',
                     'var': 'Variance',
                     'sem': 'SEM'})
    .rename_axis('Drug')
    .reset_index()
)

# List of drug names, in the same order as the rows of the table
reg = stats_dict_df['Drug'].tolist()

stats_dict_df.style.set_caption("Descriptive Statistics for Tumor Volume Following Various Drug Treatments")

    

Unnamed: 0,Drug,Mean,Median,St. Dev,Variance,SEM
0,Capomulin,40.68,41.56,4.99,24.95,0.33
1,Ceftamin,52.59,51.78,6.27,39.29,0.47
2,Infubinol,52.88,51.82,6.57,43.13,0.49
3,Ketapril,55.24,53.7,8.28,68.55,0.6
4,Naftisol,54.33,52.51,8.13,66.17,0.6
5,Placebo,54.03,52.29,7.82,61.17,0.58
6,Propriva,52.32,50.45,6.62,43.85,0.54
7,Ramicane,40.22,40.67,4.85,23.49,0.32
8,Stelasyn,54.23,52.43,7.71,59.45,0.57
9,Zoniferol,53.24,51.82,6.97,48.53,0.52


In [5]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen




In [19]:
# Using the aggregation method, produce the same summary statistics in a single line
gby_regimen[['Tumor Volume (mm3)']].agg(['mean', 'median', 'std', 'var', 'sem'])

# Note: I found this method through my own searching before reading the instructions, which suggest the same approach.



Unnamed: 0_level_0,Tumor Volume (mm3),Tumor Volume (mm3),Tumor Volume (mm3),Tumor Volume (mm3),Tumor Volume (mm3)
Unnamed: 0_level_1,mean,median,std,var,sem
Drug Regimen,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Capomulin,40.68,41.56,4.99,24.95,0.33
Ceftamin,52.59,51.78,6.27,39.29,0.47
Infubinol,52.88,51.82,6.57,43.13,0.49
Ketapril,55.24,53.7,8.28,68.55,0.6
Naftisol,54.33,52.51,8.13,66.17,0.6
Placebo,54.03,52.29,7.82,61.17,0.58
Propriva,52.32,50.45,6.62,43.85,0.54
Ramicane,40.22,40.67,4.85,23.49,0.32
Stelasyn,54.23,52.43,7.71,59.45,0.57
Zoniferol,53.24,51.82,6.97,48.53,0.52


## Bar and Pie Charts

In [6]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pandas.



In [7]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pyplot.



In [8]:
# Generate a pie plot showing the distribution of female versus male mice using pandas



In [9]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot



## Quartiles, Outliers and Boxplots

In [10]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [11]:
# Put treatments into a list for the for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [12]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [13]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin


In [14]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen


## Correlation and Regression

In [15]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
