## Observations and Insights 

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Study data files (paths are relative to the notebook's working directory)
mouse_metadata_path = 'data/Mouse_metadata.csv'
study_results_path = 'data/Study_results.csv'

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset.
# The join key is the "Mouse ID" column, named once. how="left" keeps every
# row of mouse_metadata and attaches the matching study_results rows.
mouse_results = pd.merge(mouse_metadata, study_results, how="left", on="Mouse ID")

# Display the data table for preview
mouse_results

In [None]:
# Checking the number of mice.
# count() reports how many non-null "Mouse ID" entries the metadata table holds.
mouse_id_column = mouse_metadata["Mouse ID"]
print(mouse_id_column.count())

In [None]:
# Display the number of unique mouse IDs in the merged data.
unique_mouse_count = mouse_results["Mouse ID"].nunique()
print(f"Unique mouse IDs: {unique_mouse_count}")

# List the distinct timepoints recorded in the study; duplicate
# (Mouse ID, Timepoint) pairs are checked in a later cell.
timepoint_unique = mouse_results["Timepoint"].unique()
timepoint_unique

In [None]:
# Flag every row whose (Mouse ID, Timepoint) pair repeats an earlier row,
# then collect the distinct mouse IDs that own those flagged rows.
is_repeat_row = mouse_results.duplicated(subset=['Mouse ID', 'Timepoint'])
duplicate_mouse_ids = mouse_results.loc[is_repeat_row, 'Mouse ID'].unique()
duplicate_mouse_ids


In [None]:
# Display the data associated with the duplicate mouse ID(s).
# Select by the IDs held in duplicate_mouse_ids rather than a hard-coded ID,
# so the preview stays correct if the input data changes.
is_duplicate_mouse = mouse_results["Mouse ID"].isin(duplicate_mouse_ids)
duplicate_mouse_data = mouse_results.loc[is_duplicate_mouse]
duplicate_mouse_data


In [None]:
# Create a new DataFrame where the duplicate-mouse data is removed.
# Use this cleaned DataFrame for the remaining steps.
# Every row belonging to an ID in duplicate_mouse_ids is dropped, so no
# mouse ID needs to be hard-coded here.
is_duplicate_mouse = mouse_results["Mouse ID"].isin(duplicate_mouse_ids)
clean_mouse_data = mouse_results.loc[~is_duplicate_mouse]
clean_mouse_data


## Summary Statistics

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, 
# and SEM of the tumour volume for each regimen

# Use groupby and summary statistical methods to calculate the following properties of each drug regimen: 
# mean, median, variance, standard deviation, and SEM of the tumour volume. 
# Assemble the resulting series into a single summary dataframe.

In [None]:
# Display the cleaned dataset (duplicate-timepoint mouse removed) before computing summary statistics
clean_mouse_data

In [None]:
# SUMMARY STATISTICS DATAFRAMES 1

# Group the tumour-volume column by Drug Regimen once and reuse the grouped
# selection for each statistic below.
tumour_by_regimen = clean_mouse_data.groupby("Drug Regimen")["Tumour Volume (mm3)"]

# Mean, median, variance, standard deviation, and SEM of the tumour volume
# for each regimen (each result is a Series indexed by Drug Regimen).
tumour_mean = tumour_by_regimen.mean()
tumour_median = tumour_by_regimen.median()
tumour_variance = tumour_by_regimen.var()
tumour_stdev = tumour_by_regimen.std()
tumour_sem = tumour_by_regimen.sem()

# Assemble the five Series into one summary table, one column per statistic
summary_columns = {
    "Mean": tumour_mean,
    "Median": tumour_median,
    "Variance": tumour_variance,
    "Std Dev": tumour_stdev,
    "SEM": tumour_sem,
}
treatment_group_df = pd.DataFrame(summary_columns)

treatment_group_df


In [None]:
# SUMMARY STATISTICS DATAFRAMES 2

# Produce the same summary statistics with a single aggregation call.
# The dict form keeps the two-level column header:
# ("Tumour Volume (mm3)", <statistic name>).
summary_stats = ["mean", "median", "var", "std", "sem"]
summary_table = (
    clean_mouse_data
    .groupby("Drug Regimen")
    .agg({"Tumour Volume (mm3)": summary_stats})
)
summary_table


## Bar and Pie Charts

In [None]:
# Bar plot (pandas plotting API) of the number of recorded rows per drug regimen.
# NOTE(review): value_counts() counts rows (one per recorded timepoint), not
# distinct mice - confirm the y-axis label wording is what is intended.
regimen_counts = clean_mouse_data["Drug Regimen"].value_counts()
bar_plot = regimen_counts.plot.bar(width=0.7, zorder=3)

# Grid first: the bars were given a higher z-order so they sit in front of it
bar_plot.grid(zorder=0)

# Axis labels and title
bar_plot.set_title("Tumour Response to Drug Regimen at Different Timepoints")
bar_plot.set_xlabel("Drug Regimen")
bar_plot.set_ylabel("Number of Mice Tested")


In [None]:
# Same per-regimen bar chart, drawn directly with pyplot.
# NOTE(review): value_counts() counts rows (one per recorded timepoint), not
# distinct mice - confirm the y-axis label wording is what is intended.
counts = clean_mouse_data['Drug Regimen'].value_counts()

# pyplot needs explicit x (regimen names) and y (row counts) sequences
regimen_names = counts.index.values
row_totals = counts.values
plt.bar(regimen_names, row_totals)

# Rotate the regimen names so they do not overlap
plt.xticks(rotation=90)
plt.xlabel("Drug Regimen")
plt.ylabel("Number of Mice Tested")
plt.show()


In [None]:
# Pie chart (pandas plotting API) of how the rows split between the values
# of the "Sex" column; each wedge is labelled with its percentage.
counts = clean_mouse_data["Sex"].value_counts()
counts.plot.pie(autopct='%1.1f%%')
plt.show()


In [None]:
# Same "Sex" distribution pie chart, drawn directly with pyplot.
counts = clean_mouse_data["Sex"].value_counts()

# pyplot takes the wedge sizes and their labels as separate sequences
sex_labels = counts.index.values
sex_totals = counts.values
plt.pie(sex_totals, labels=sex_labels, autopct='%1.1f%%')
plt.ylabel("Sex")
plt.show()


## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumour volume of each mouse across four of the
# treatment regimens: Capomulin, Ramicane, Infubinol, and Ceftamin

# Step 1: one row per mouse holding its last (greatest) timepoint
max_tumour = (
    clean_mouse_data
    .groupby(["Mouse ID"])['Timepoint']
    .max()
    .reset_index()
)

# Step 2: join back onto the cleaned data on (Mouse ID, Timepoint) so each
# mouse keeps only the measurements recorded at that last timepoint
merged_data = pd.merge(
    max_tumour,
    clean_mouse_data,
    on=['Mouse ID', 'Timepoint'],
    how="left",
)


In [None]:
# Treatments of interest (iterated below and reused later as plot labels)
treatment_list = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

# Collects one Series of final tumour volumes per treatment (for plotting)
tumour_vol_list = []

# For each treatment, compute the IQR and report values outside the
# 1.5 * IQR fences as potential outliers.
for drug in treatment_list:

    # Final tumour volumes of the mice on this drug
    on_this_drug = merged_data["Drug Regimen"] == drug
    final_tumour_vol = merged_data.loc[on_this_drug, 'Tumour Volume (mm3)']
    tumour_vol_list.append(final_tumour_vol)

    # Quartiles and interquartile range
    quartiles = final_tumour_vol.quantile([.25, .5, .75])
    lowerq, upperq = quartiles[0.25], quartiles[0.75]
    iqr = upperq - lowerq

    # Fences sit 1.5 * IQR beyond the lower and upper quartiles
    fence_width = 1.5 * iqr
    lower_bound = lowerq - fence_width
    upper_bound = upperq + fence_width

    # Anything strictly outside the fences is a potential outlier
    below_fence = final_tumour_vol < lower_bound
    above_fence = final_tumour_vol > upper_bound
    outliers = final_tumour_vol.loc[below_fence | above_fence]
    print(f"{drug}'s potential outliers: {outliers}")


In [None]:
# Box plot of the final tumour volume of each mouse, one box per regimen of interest

# Marker style for the outlier points (despite the variable name, the
# marker face colour is red)
orange_out = {'markerfacecolor': 'red', 'markersize': 12}

plt.boxplot(
    tumour_vol_list,
    labels=treatment_list,
    flierprops=orange_out,
)
plt.ylabel('Final Tumour Volume (mm3)')
plt.show()


## Line and Scatter Plots

In [None]:
# Line plot of tumour volume vs. time point for one Capomulin-treated mouse (l509)

# Narrow the cleaned data to the Capomulin regimen, then to the chosen mouse
is_capomulin = clean_mouse_data['Drug Regimen'] == "Capomulin"
capomulin_table = clean_mouse_data.loc[is_capomulin]
is_chosen_mouse = capomulin_table['Mouse ID'] == 'l509'
mousedata = capomulin_table.loc[is_chosen_mouse]

timepoints = mousedata['Timepoint']
tumour_volumes = mousedata['Tumour Volume (mm3)']
plt.plot(timepoints, tumour_volumes)
plt.title('Capomulin treatment of mouse l509')
plt.xlabel('Timepoint (days)')
plt.ylabel('Tumour Volume (mm3)')
plt.show()


In [None]:
# Generate a scatter plot of average tumour volume vs. mouse weight for the Capomulin regimen

capomulin_table = clean_mouse_data.loc[clean_mouse_data['Drug Regimen'] == "Capomulin"]

# Per-mouse averages. numeric_only=True restricts the mean to numeric columns;
# without it, pandas 2.x raises a TypeError when it tries to average the
# string columns of the table.
capomulin_average = capomulin_table.groupby(['Mouse ID']).mean(numeric_only=True)

plt.scatter(capomulin_average['Weight (g)'], capomulin_average['Tumour Volume (mm3)'])
plt.xlabel('Weight (g)')
plt.ylabel('Average Tumour Volume (mm3)')
plt.show()



## Correlation and Regression

In [None]:
# Correlation coefficient and linear regression model for mouse weight vs.
# average tumour volume on the Capomulin regimen

weights = capomulin_average['Weight (g)']
average_volumes = capomulin_average['Tumour Volume (mm3)']

# pearsonr returns (coefficient, p-value); only the coefficient is reported
pearson_result = st.pearsonr(weights, average_volumes)
corr = round(pearson_result[0], 2)
print(f"The correlation between mouse weight and the average tumor volume is {corr}")

# Least-squares fit; the first two fields of the result are slope and intercept
model = st.linregress(weights, average_volumes)
slope, intercept = model[0], model[1]

# Fitted values along the regression line, drawn over the scatter in red
y_values = weights * slope + intercept
plt.scatter(weights, average_volumes)
plt.plot(weights, y_values, color="red")
plt.xlabel('Weight (g)')
plt.ylabel('Average Tumour Volume (mm3)')
plt.show()
