## Observations and Insights 

In [242]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
mouse_df = pd.DataFrame(mouse_metadata)
mouse_df
study_results = pd.read_csv(study_results_path)
results_df =pd.DataFrame(study_results)
#results_df.head()

#Combine the data into a single dataset
merged_df = pd.merge(mouse_df, results_df,how="left",on="Mouse ID")
merged_df.head()


Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [243]:
# Checking the number of mice in the DataFrame.
mouse_count = merged_df['Mouse ID'].unique()  
print(len(mouse_count)) #249


249


In [244]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 

mouse_time_df = merged_df.loc[:, ['Mouse ID', 'Timepoint']]
dup_mouse_time = mouse_time_df.duplicated(subset=['Mouse ID','Timepoint'])
#mouse_time_boolean_df = pd.DataFrame(dup_mouse_time)
duplicate_mouse_ids = merged_df.loc[dup_mouse_time,'Mouse ID']
duplicate_mouse_ids.unique()


array(['g989'], dtype=object)

# Optional: Get all the data for the duplicate mouse ID.

In [245]:
duplicate_mouse_data = merged_df.loc[merged_df["Mouse ID"] == "g989"]
duplicate_mouse_data

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
908,g989,Propriva,Female,21,26,0,45.0,0
909,g989,Propriva,Female,21,26,0,45.0,0
910,g989,Propriva,Female,21,26,5,48.786801,0
911,g989,Propriva,Female,21,26,5,47.570392,0
912,g989,Propriva,Female,21,26,10,51.745156,0
913,g989,Propriva,Female,21,26,10,49.880528,0
914,g989,Propriva,Female,21,26,15,51.325852,1
915,g989,Propriva,Female,21,26,15,53.44202,0
916,g989,Propriva,Female,21,26,20,55.326122,1
917,g989,Propriva,Female,21,26,20,54.65765,1


In [262]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
clean = merged_df.drop_duplicates(["Mouse ID", "Timepoint"])
clean_df = pd.DataFrame(clean)
clean_df


Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [260]:
# Checking the number of mice in the clean DataFrame.
mouse_check = clean_df['Mouse ID'].unique()
print(len(mouse_check)) #248

249


## Summary Statistics

In [248]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# This method is the most straighforward, creating multiple series and putting them all together at the end.
grouped_tumor_mean = id_duplicates_df.groupby(["Drug Regimen"]).mean()["Tumor Volume (mm3)"].rename('Tumor Volume Mean')
grouped_tumor_median = id_duplicates_df.groupby(["Drug Regimen"]).median()["Tumor Volume (mm3)"].rename('Tumor Volume Median')
grouped_tumor_var = id_duplicates_df.groupby(["Drug Regimen"]).var()["Tumor Volume (mm3)"].rename('Tumor Volume Variance')
grouped_tumor_stdev = id_duplicates_df.groupby(["Drug Regimen"]).std()["Tumor Volume (mm3)"].rename('Tumor Volume StDev')
grouped_tumor_sem = id_duplicates_df.groupby(["Drug Regimen"]).sem()["Tumor Volume (mm3)"].rename('Tumor Volume SEM')


In [249]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
# Initialize Dictionary of lists
Regimen_Summary = {'Tumor Volume Mean': grouped_tumor_mean,
                   'Tumor Volume Median': grouped_tumor_median,
                   'Tumor Volume Variance': grouped_tumor_var,
                   'Tumor Volume StDev':grouped_tumor_stdev,
                   'Tumor Volume SEM': grouped_tumor_sem}

Regimen_Summary_df = pd.DataFrame(Regimen_Summary)
Regimen_Summary_df

Unnamed: 0_level_0,Tumor Volume Mean,Tumor Volume Median,Tumor Volume Variance,Tumor Volume StDev,Tumor Volume SEM
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,40.148392,40.716428,25.434058,5.043219,0.352234
Ceftamin,53.831559,52.847559,34.725752,5.892856,0.476409
Infubinol,54.17316,53.244458,38.326085,6.190806,0.500497
Ketapril,56.805521,56.098998,60.485143,7.777219,0.609159
Naftisol,55.780565,54.234502,60.794406,7.797077,0.614496
Placebo,55.48127,53.354528,55.762387,7.467422,0.597872
Propriva,53.668609,52.663801,38.438982,6.199918,0.531639
Ramicane,39.627674,39.769552,23.213127,4.818,0.338157
Stelasyn,55.644586,54.328317,53.476333,7.312751,0.583621
Zoniferol,54.548052,53.287287,43.708157,6.611214,0.527632


## Bar Plots

In [250]:
# Generate a bar plot showing the number of mice per time point for each treatment throughout the course of the study using pandas. 

# Filter the DataFrame down only to those columns to chart
state_and_inches = rain_df[["State","Inches"]]

# Set the index to be "State" so they will be used as labels
state_and_inches = state_and_inches.set_index("State")

# Use DataFrame.plot() in order to create a bar chart of the data
state_and_inches.plot(kind="bar", figsize=(20,3))

# Set a title for the chart
plt.title("Average Rain Per State")

plt.show()
plt.tight_layout()


NameError: name 'rain_df' is not defined

In [None]:
# Generate a bar plot showing the number of mice per time point for each treatment throughout the course of the study using pyplot.

plt.bar(x_axis, users, color='red', alpha=0.75, align="center")

# Give our chart some labels and a tile
plt.title("TITLE")
plt.xlabel("X-LABLE")
plt.ylabel("Y-LABLE")



## Pie Plots

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot

# Labels for the sections of our pie chart
labels = ["Humans", "Smurfs", "Hobbits", "Ninjas"]
# The values of each section of the pie chart
sizes = [220, 95, 80, 100]
# The colors of each section of the pie chart
colors = ["red", "orange", "lightcoral", "lightskyblue"]
# Tells matplotlib to seperate the "Humans" section from the others
explode = (0.2, 0, 0, 0)

# Creates the pie chart based upon the values above
# Automatically finds the percentages of each part of the pie chart
plt.pie(sizes, explode=explode, labels=labels, colors=colors,
        autopct="%1.1f%%", shadow=True, startangle=140)

## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the most promising treatment regimens. Calculate the IQR and quantitatively determine if there are any potential outliers. 


In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest



## Line and Scatter Plots

In [None]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin

time = []
tumor_vol = []
# Plot the line
plt.plot(time, tumor_vol)
plt.show()


In [None]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen

# The maximum x value for our chart will be 100
x_limit = 100
# List of values from 0 to 100 each value being 1 greater than the last
x_axis = np.arange(0, x_limit, 1)
# Create a random array of data that we will use for our y values
data = [random.random() for value in x_axis]

# Tells matplotlib that we want to make a scatter plot
# The size of each point on our plot is determined by their x value
plt.scatter(x_axis, data, marker="o", facecolors="red", edgecolors="black",
            s=x_axis, alpha=0.75)

## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
