In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

In [None]:
# Locations of the two input CSV files, relative to the notebook
mouse_metadata_path = "Mouse_metadata.csv"
study_results_path = "Study_results.csv"

# Read each file into its own DataFrame (metadata first, then results)
mouse_metadata, study_results = (
    pd.read_csv(csv_path)
    for csv_path in (mouse_metadata_path, study_results_path)
)

In [None]:
# Preview the first five rows of the study results
study_results.head(n=5)

In [None]:
# Preview the first five rows of the mouse metadata
mouse_metadata.head(n=5)

In [None]:
# Combine the mouse metadata and the study results into a single dataset.
# The left join keeps every row of mouse_metadata; "Mouse ID" is the only
# join key, so it is passed once rather than as a repeated list entry.
animal_study = pd.merge(mouse_metadata, study_results, how="left", on="Mouse ID")
animal_study

In [None]:
# Number of distinct mice in the combined dataset (missing IDs are not counted)
animal_study.loc[:, "Mouse ID"].nunique(dropna=True)

In [None]:
 # Creating a clean DataFrame by dropping the duplicate mouse by its ID.
# A mouse counts as duplicated when the same (Mouse ID, Timepoint) pair
# appears on more than one row; keep=False flags every row of such a pair.
duplicate_rows = animal_study.duplicated(subset=["Mouse ID", "Timepoint"], keep=False)
duplicate_mouse_ids = animal_study.loc[duplicate_rows, "Mouse ID"].unique()

# drop_duplicates() returns a new frame, so calling it without assigning the
# result changes nothing. Remove every row belonging to a duplicated mouse and
# bind the cleaned frame back to the name the rest of the notebook uses.
animal_study = animal_study[~animal_study["Mouse ID"].isin(duplicate_mouse_ids)]
animal_study

# Summary Statistics

In [None]:
# Summary table of tumor volume per drug regimen: mean, median, variance,
# standard deviation and standard error of the mean (SEM).
tumor_by_regimen = animal_study.groupby('Drug Regimen')['Tumor Volume (mm3)']

mean = tumor_by_regimen.mean()
median = tumor_by_regimen.median()
variance = tumor_by_regimen.var()
std = tumor_by_regimen.std()
sem = tumor_by_regimen.sem()

# Place the five statistics side by side, one column per statistic
summary_df = pd.concat(
    [mean, median, variance, std, sem],
    axis=1,
    keys=["Mean", "Median", "Variance", "Standard Deviation", "Standard Error"],
)
summary_df

# Bar and Pie Charts
* Generate a bar plot that shows the total number of measurements taken for each drug regimen, once with pyplot and once with pandas.
* Generate a pie plot showing the distribution of female versus male mice, once with pyplot and once with pandas.


Bar Chart: Pyplot Method

In [None]:
# First, take the regimen names from the index of the summary table
regimen = list(summary_df.index)
regimen

In [None]:
# Second, count the rows recorded for each regimen and turn the
# 'Mouse ID' counts into a plain list with tolist()
count = animal_study.groupby('Drug Regimen').count()
total_measurement = count.loc[:, 'Mouse ID'].tolist()
total_measurement

In [None]:
# Bar chart of the number of measurements per regimen (pyplot interface)
plt.figure(figsize=(10, 4))
plt.bar(regimen, total_measurement, color="purple", align="center")

# Chart title and axis labels; the y-axis was previously unlabelled, so the
# figure did not say what the bar heights measure
plt.title("Total Measurement for each Treatment")
plt.xlabel("Drug Regimen")
plt.ylabel("Number of Measurements")

# Save the image
plt.savefig("PyPlotBar.png")

Bar Chart: Pandas Method

In [None]:
# Rows per regimen, ordered alphabetically by regimen name
regimen_counts = animal_study['Drug Regimen'].value_counts()
regimen_p = regimen_counts.sort_index()
regimen_p

In [None]:
# Bar chart of the number of measurements per regimen (pandas interface).
# Series.plot returns the Axes it drew on; labelling through that Axes keeps
# the title and labels attached to this specific chart.
ax = regimen_p.plot(kind="bar", facecolor="red", figsize=(10, 4))

# Chart title and axis labels; the y-axis was previously unlabelled
ax.set_title("Total Measurement for each Treatment")
ax.set_xlabel("Drug Regimen")
ax.set_ylabel("Number of Measurements")

# Save the image
ax.get_figure().savefig("PandasBar.png")

Pie Chart: Pyplot Method

In [None]:

# First, group by 'Sex' and count the rows in each group
count = animal_study.groupby('Sex').count()

# Second, pull the group labels and their 'Mouse ID' counts out as lists
sex = list(count.index)
sex_count = count.loc[:, 'Mouse ID'].tolist()

# Styling values for the pie chart
colors = ['purple', 'cyan']
explode = (0.1, 0)
sex, sex_count

In [None]:
# Pie chart of the counts per sex (pyplot interface)
plt.pie(sex_count, labels=sex, colors=colors,
        autopct="%1.1f%%", shadow=True, startangle=140)
plt.axis("equal")

# plt.title must be called. Assigning to it (plt.title = "...") replaces the
# pyplot function with a string: the chart gets no title and every later
# plt.title(...) call in the session fails.
plt.title("Mice Gender Distribution")

# Save the image
plt.savefig("PyPlotPie.png")

Pie Chart: Pandas Method

In [None]:
# Split the data by 'Sex' and count the 'Mouse ID' entries in each group
rows_by_sex = animal_study.groupby('Sex')
sex_p = rows_by_sex['Mouse ID'].count()
sex_p

In [None]:
# Pie chart of the per-sex counts computed above (pandas interface)
colors = ['yellow', 'red']

gender_distribution = sex_p.plot.pie(
    y='Mouse ID',
    title="Mice Gender Distribution",
    autopct='%1.1f%%',
    colors=colors,
)
# Blank out the y-label that pandas would otherwise print beside the pie
gender_distribution.set_ylabel("")
plt.axis("equal")

# Save the image
plt.savefig("PandasPie.png")

# Quartiles, Outliers and Boxplots

In [None]:
# Final tumor volume is studied for four treatment regimens only:
# Capomulin, Ramicane, Infubinol, and Ceftamin
druglist = ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']

# Keep only the rows whose regimen is one of the four listed drugs
is_listed_drug = animal_study['Drug Regimen'].isin(druglist)
CRIC_drug = animal_study[is_listed_drug]
CRIC_drug

In [None]:
# Last (greatest) timepoint recorded for each mouse, as a two-column frame
# with 'Mouse ID' and 'Timepoint'
last_time = CRIC_drug.groupby('Mouse ID')[['Timepoint']].max().reset_index()
last_time

In [None]:
# Attach the full study data to each mouse's greatest timepoint.
# Joining on "Mouse ID" alone yields one row per measurement of the mouse,
# with the greatest timepoint in Timepoint_x and the measurement's own
# timepoint in Timepoint_y (default merge suffixes).
final_tumor = pd.merge(last_time, animal_study, how="left", on="Mouse ID")

# Keep only the measurement taken at the greatest timepoint, so the tumor
# volume on each row is the mouse's final one. Both timepoint columns are kept.
at_last_timepoint = final_tumor["Timepoint_x"] == final_tumor["Timepoint_y"]
final_tumor = final_tumor.loc[at_last_timepoint].reset_index(drop=True)
final_tumor

In [None]:
# Give Timepoint_x (the per-mouse maximum from last_time) a descriptive name
column_renames = {"Timepoint_x": "Greatest Timepoint"}
final_tumor = final_tumor.rename(columns=column_renames)
final_tumor

In [None]:
# Narrow the frame to the columns needed for the final-volume analysis
final_tumor = final_tumor[['Mouse ID', 'Tumor Volume (mm3)', "Greatest Timepoint"]]

# Series.last() is a time-series method that needs an offset argument and a
# DatetimeIndex, so calling it bare raises. Take the last row of each mouse
# instead, giving one row per mouse.
# NOTE(review): assumes each mouse's rows are in ascending timepoint order,
# so the last row is the final measurement - confirm against the data.
last_tumor = final_tumor.groupby("Mouse ID").last().reset_index()
last_tumor

# Line and Scatter Plots

In [None]:
# Line plot of tumor volume vs. time point for a mouse treated with Capomulin:
# start by selecting the Capomulin rows
is_capomulin = animal_study["Drug Regimen"].eq("Capomulin")
capomulin_df = animal_study[is_capomulin]
capomulin_df

In [None]:
# Rows for the single mouse with ID "r157"
is_r157 = capomulin_df["Mouse ID"].eq("r157")
r157_df = capomulin_df[is_r157]
r157_df

In [None]:
# Keep only the two columns that the line plot needs
r157_df = r157_df.loc[:, ['Timepoint', 'Tumor Volume (mm3)']]

# Separate Series for the plot
timepoint, tumor_volume = r157_df['Timepoint'], r157_df['Tumor Volume (mm3)']

r157_df

In [None]:
# Time series of tumor volume for mouse r157: time goes on the x-axis and
# the measured volume on the y-axis (the arguments were previously swapped).
fig, ax = plt.subplots()
ax.plot(timepoint, tumor_volume, marker='o', color='magenta')

# Title and labels are set through the Axes object, so they do not rely on
# the module-level pyplot helpers.
ax.set_title("Time Series of Tumor Volume for Mouse r157 treated with Capomulin")
ax.set_xlabel("Timepoint")
ax.set_ylabel("Tumor Volume (mm3)")

# Save the line graph
fig.savefig("Time Series of Tumor Volume.png")

In [None]:
# Scatter plot of average tumor volume vs. mouse weight for the Capomulin
# regimen: start with the three columns involved
weight_columns = ["Mouse ID", "Weight (g)", "Tumor Volume (mm3)"]
capomulin_weight = capomulin_df.loc[:, weight_columns]
capomulin_weight

In [None]:
# Mean tumor volume for each (mouse, weight) pair; the resulting Series is
# named "Average Volume" before it is flattened back into columns
volume_mean = (
    capomulin_weight
    .groupby(['Mouse ID', 'Weight (g)'])['Tumor Volume (mm3)']
    .mean()
    .rename("Average Volume")
    .reset_index()
)
volume_mean

In [None]:
# Set 'Mouse ID' as the index. The guard makes the cell safe to re-run:
# once 'Mouse ID' is the index it is no longer a column, and an unconditional
# set_index('Mouse ID') would raise KeyError on the second run.
if 'Mouse ID' in volume_mean.columns:
    volume_mean = volume_mean.set_index('Mouse ID')

# Scatter plot of mouse weight versus average tumor volume for the Capomulin regimen
volume_mean.plot(
    kind="scatter",
    x="Weight (g)",
    y="Average Volume",
    grid=True,
    figsize=(4, 4),
    color='green',
    title="Weight Vs. Average Tumor Volume",
)

# Save the image
plt.savefig("Weight vs Average Tumor Volume.png")

# Correlation and Regression

In [None]:
# Correlation coefficient between mouse weight and average tumor volume
# for the Capomulin regimen
weight, vol_mean = volume_mean['Weight (g)'], volume_mean['Average Volume']

# pearsonr returns the coefficient first; only that element is reported
correlation = st.pearsonr(weight, vol_mean)
print(f"The correlation between mouse weight and average tumor volume for the Capomulin regimen is {round(correlation[0],2)}")

In [None]:
from scipy.stats import linregress

In [None]:
# Linear regression of average tumor volume on mouse weight (Capomulin).
x_values = weight
y_values = vol_mean

# linregress is reached through the scipy.stats alias imported at the top of
# the notebook, so this cell does not need a separate import cell.
slope, intercept, rvalue, pvalue, stderr = st.linregress(x_values, y_values)
regress_values = x_values * slope + intercept
line_eq = f"y = {round(slope, 2)}x + {round(intercept, 2)}"

fig, ax = plt.subplots()
ax.scatter(x_values, y_values)
ax.plot(x_values, regress_values, "r-")

# Anchor the equation in axes-fraction coordinates (upper-left corner). A
# fixed data-space position such as (6, 10) is only drawn when that point
# lies inside the plotted data range.
ax.annotate(line_eq, xy=(0.05, 0.9), xycoords="axes fraction", fontsize=15, color="red")
ax.set_xlabel('Weight')
ax.set_ylabel('Average Volume')

# Save the image
fig.savefig("Linear Regression.png")