## Observations and Insights 

In [121]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
mouse_df = pd.DataFrame(mouse_metadata)
mouse_df.head()
study_results = pd.read_csv(study_results_path)
results_df =pd.DataFrame(study_results)
results_df.head()

#Combine the data into a single dataset
merged_df = pd.merge(left=mouse_df, right=results_df)
merged_df.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [122]:
# Checking the number of mice in the DataFrame.
mouse_count = merged_df['Mouse ID'].count()  
mouse_count #1893


1893

In [153]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
mouse_time_df = merged_df.loc[:, ['Mouse ID', 'Timepoint']]
boolean = mouse_time_df['Mouse ID'].duplicated().any() # True
mouse_id_duplicates = mouse_time_df[mouse_time_df.duplicated('Mouse ID')]
mouse_id_duplicates


Unnamed: 0,Mouse ID,Timepoint
1,k403,5
2,k403,10
3,k403,15
4,k403,20
5,k403,25
...,...,...
1888,z969,25
1889,z969,30
1890,z969,35
1891,z969,40


# Optional: Get all the data for the duplicate mouse ID. 
id_duplicates_df = merged_df[merged_df.duplicated('Mouse ID')]
id_duplicates_df

In [160]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
id_duplicates_df = merged_df[merged_df.duplicated('Mouse ID')]
id_duplicates_df.head(12)

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
5,k403,Ramicane,Male,21,16,25,33.464577,1
6,k403,Ramicane,Male,21,16,30,31.099498,1
7,k403,Ramicane,Male,21,16,35,26.546993,1
8,k403,Ramicane,Male,21,16,40,24.365505,1
9,k403,Ramicane,Male,21,16,45,22.050126,1
11,s185,Capomulin,Female,3,17,5,43.878496,0


In [156]:
# Checking the number of mice in the clean DataFrame.
mouse_check = id_duplicates_df['Mouse ID'].unique()
print(len(mouse_check)) #237

237


## Summary Statistics

In [167]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# This method is the most straighforward, creating multiple series and putting them all together at the end.
grouped_tumor_mean = id_duplicates_df.groupby(["Drug Regimen"]).mean()["Tumor Volume (mm3)"].rename('Tumor Volume Mean')
grouped_tumor_median = id_duplicates_df.groupby(["Drug Regimen"]).median()["Tumor Volume (mm3)"].rename('Tumor Volume Median')
grouped_tumor_var = id_duplicates_df.groupby(["Drug Regimen"]).var()["Tumor Volume (mm3)"].rename('Tumor Volume Variance')
grouped_tumor_stdev = id_duplicates_df.groupby(["Drug Regimen"]).std()["Tumor Volume (mm3)"].rename('Tumor Volume StDev')
grouped_tumor_sem = id_duplicates_df.groupby(["Drug Regimen"]).sem()["Tumor Volume (mm3)"].rename('Tumor Volume SEM')


In [169]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
# Initialize Dictionary of lists
Regimen_Summary = {'Tumor Volume Mean': grouped_tumor_mean,
                   'Tumor Volume Median': grouped_tumor_median,
                   'Tumor Volume Variance': grouped_tumor_var,
                   'Tumor Volume StDev':grouped_tumor_stdev,
                   'Tumor Volume SEM': grouped_tumor_sem}

Regimen_Summary_df = pd.DataFrame(Regimen_Summary)
Regimen_Summary_df

Unnamed: 0_level_0,Tumor Volume Mean,Tumor Volume Median,Tumor Volume Variance,Tumor Volume StDev,Tumor Volume SEM
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,40.148392,40.716428,25.434058,5.043219,0.352234
Ceftamin,53.831559,52.847559,34.725752,5.892856,0.476409
Infubinol,54.17316,53.244458,38.326085,6.190806,0.500497
Ketapril,56.805521,56.098998,60.485143,7.777219,0.609159
Naftisol,55.780565,54.234502,60.794406,7.797077,0.614496
Placebo,55.48127,53.354528,55.762387,7.467422,0.597872
Propriva,53.668609,52.663801,38.438982,6.199918,0.531639
Ramicane,39.627674,39.769552,23.213127,4.818,0.338157
Stelasyn,55.644586,54.328317,53.476333,7.312751,0.583621
Zoniferol,54.548052,53.287287,43.708157,6.611214,0.527632


## Bar Plots

In [9]:
# Generate a bar plot showing the number of mice per time point for each treatment throughout the course of the study using pandas. 


In [10]:
# Generate a bar plot showing the number of mice per time point for each treatment throughout the course of the study using pyplot.

plt.bar(x_axis, users, color='red', alpha=0.75, align="center")

# Give our chart some labels and a tile
plt.title("TITLE")
plt.xlabel("X-LABLE")
plt.ylabel("Y-LABLE")



## Pie Plots

In [11]:
# Generate a pie plot showing the distribution of female versus male mice using pandas

In [12]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot

# Labels for the sections of our pie chart
labels = ["Humans", "Smurfs", "Hobbits", "Ninjas"]
# The values of each section of the pie chart
sizes = [220, 95, 80, 100]
# The colors of each section of the pie chart
colors = ["red", "orange", "lightcoral", "lightskyblue"]
# Tells matplotlib to seperate the "Humans" section from the others
explode = (0.2, 0, 0, 0)

# Creates the pie chart based upon the values above
# Automatically finds the percentages of each part of the pie chart
plt.pie(sizes, explode=explode, labels=labels, colors=colors,
        autopct="%1.1f%%", shadow=True, startangle=140)

## Quartiles, Outliers and Boxplots

In [13]:
# Calculate the final tumor volume of each mouse across four of the most promising treatment regimens. Calculate the IQR and quantitatively determine if there are any potential outliers. 


In [14]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest



## Line and Scatter Plots

In [15]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin

time = []
tumor_vol = []
# Plot the line
plt.plot(time, tumor_vol)
plt.show()


In [16]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen

# The maximum x value for our chart will be 100
x_limit = 100
# List of values from 0 to 100 each value being 1 greater than the last
x_axis = np.arange(0, x_limit, 1)
# Create a random array of data that we will use for our y values
data = [random.random() for value in x_axis]

# Tells matplotlib that we want to make a scatter plot
# The size of each point on our plot is determined by their x value
plt.scatter(x_axis, data, marker="o", facecolors="red", edgecolors="black",
            s=x_axis, alpha=0.75)

## Correlation and Regression

In [17]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
