## Observations and Insights

## Dependencies and starter code

In [None]:
%matplotlib inline

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
import os

# Study data files (kept in their own names so the paths are not overwritten
# by the DataFrames loaded from them).
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset, matching rows on "Mouse ID"
combined_results_data = pd.merge(mouse_metadata, study_results, on="Mouse ID")

# Rename columns for ease of use
combined_results_data = combined_results_data.rename(
    columns={
        "Drug Regimen": "Drug",
        "Age_months": "Age (months)",
        "Tumor Volume (mm3)": "Tumor Volume",
    }
)

# Group by Drug and Timepoint and select "Tumor Volume" BEFORE aggregating.
# Aggregating the whole frame first (e.g. .groupby(...).mean()["Tumor Volume"])
# also tries to average every other column; recent pandas versions raise on
# text columns instead of silently dropping them, and the extra work is wasted
# either way.
tumor_volume_groups = combined_results_data.groupby(["Drug", "Timepoint"])["Tumor Volume"]

# Each of these is a Series indexed by (Drug, Timepoint); the "_df" names are
# kept so later cells that reference them keep working.
meanTumorVol_df = tumor_volume_groups.mean()
medianTumorVol_df = tumor_volume_groups.median()
variTumorVol_df = tumor_volume_groups.var()
stdTumorVol_df = tumor_volume_groups.std()
semTumorVol_df = tumor_volume_groups.sem()

# Assemble the summary statistics table: one row per (Drug, Timepoint)
overall_sum_df = pd.DataFrame(
    {
        "Mean": meanTumorVol_df,
        "Median": medianTumorVol_df,
        "Variance": variTumorVol_df,
        "Standard Deviation": stdTumorVol_df,
        "Standard Error": semTumorVol_df,
    }
)

overall_sum_df

In [None]:
# Count the distinct mice and the distinct drug regimens in the merged data.
# Both results are plain counts (a single number each), not lists of values.
uniqueMice, uniqueDrug = (
    combined_results_data[column].nunique() for column in ("Mouse ID", "Drug")
)

In [None]:
# Bucket the merged rows by the value of the "Sex" column
genderStats = combined_results_data.groupby("Sex")

# Number of distinct drug regimens that appear within each sex group
totalCountGender = genderStats["Drug"].nunique()
totalCountGender.head()

In [None]:
# Count, for every drug regimen, how many rows have a non-null "Sex" value,
# and keep the result as a one-column DataFrame indexed by Drug.
sex_df = combined_results_data.groupby("Drug")["Sex"].count().to_frame()

# Preview DataFrame
sex_df

## Bar plots

In [None]:
# Generate a bar plot showing number of data points for each treatment regimen using pyplot
# This cell only aliases the mean "Tumor Volume" per (Drug, Timepoint) computed
# earlier and displays it; no chart is drawn here.
# NOTE(review): the values are mean tumor volumes, not data-point counts —
# confirm which quantity the bar plot is meant to show.
drug_and_volume = meanTumorVol_df

drug_and_volume

In [None]:
# Keep only the Mean and Median columns of the summary table for charting
summary_bar = overall_sum_df.loc[:, ["Mean", "Median"]]

# Draw grouped bars (one group per row of the summary table) with pandas,
# then label the axes object it returns.
summary_axes = summary_bar.plot.bar(figsize=(20, 5))
summary_axes.set_title("Tumor Volume by Drug")
summary_axes.set_xlabel("Drug at time point")
summary_axes.set_ylabel("Tumor Volume")

plt.tight_layout()
plt.show()

In [None]:
# Bar chart (pyplot) of the mean tumor volume for every row of overall_sum_df.

# One bar position per row of the summary table
x_axis = np.arange(len(overall_sum_df))
tick_locations = list(x_axis)

# One label per bar, built from the table's (Drug, Timepoint) index.
# (uniqueDrug is a plain count, so it cannot supply tick labels.)
tick_labels = [f"{drug} ({timepoint})" for drug, timepoint in overall_sum_df.index]

# Wide, short figure so the many bars have room
plt.figure(figsize=(20, 3))
plt.bar(x_axis, overall_sum_df["Mean"], color="g", alpha=0.5, align="center")
plt.xticks(tick_locations, tick_labels, rotation="vertical")

# Leave a little padding left of the first bar and above the tallest bar
plt.xlim(-0.75, len(x_axis))
plt.ylim(0, overall_sum_df["Mean"].max() + 10)

# Further formatting
plt.title("Mean Tumor Volume per Drug")
plt.xlabel("Drug per timepoint")
plt.ylabel("Mean tumor volume")
plt.tight_layout()

# Save before showing so the written file contains the finished chart.
# The path uses the same "../Images/" directory the pie-chart cells save to;
# the directory must already exist.
plt.savefig("../Images/pyBarTumorVol.png")
plt.show()

In [None]:
# Build the frame used by the pandas bar plot: keep the Timepoint and
# Tumor Volume columns and move Drug into the index so it becomes the label.
drug_and_volume = combined_results_data.loc[
    :, ["Timepoint", "Drug", "Tumor Volume"]
].set_index("Drug")
drug_and_volume.head()


In [None]:
# Use DataFrame.plot() to create the bar chart.
# NOTE(review): if drug_and_volume still holds one row per measurement, this
# draws a bar group for every row rather than a per-drug mean — confirm that
# matches the title below.
drug_and_volume.plot(kind="bar", figsize=(20, 3))

# Set the context for the chart
plt.title("Mean Tumor Volume per Drug")
plt.xlabel("Drug per timepoint")
plt.ylabel("Mean tumor volume")

# tight_layout() has to run before show(): once the figure has been shown,
# adjusting the layout no longer affects the rendered chart.
plt.tight_layout()
plt.show()

In [None]:
# Same pandas bar chart, keeping the returned axes so the tick labels can be
# replaced with the values of the frame's first column, rotated 45 degrees.
multi_plot = drug_and_volume.plot(kind="bar", figsize=(20, 3))
multi_plot.set_xticklabels(drug_and_volume.iloc[:, 0], rotation=45)

# Adjust the layout before showing; doing it after show() has no effect on
# the chart that was already rendered.
plt.tight_layout()
plt.show()

## Pie plots

In [None]:
# Labels for the sections of our pie chart.
# They are read from the index of the grouped counts so every label is
# guaranteed to line up with its own wedge; a hand-written list can end up in
# a different order than the (sorted) group keys and swap the labels.
labels = list(totalCountGender.index)

# The colors of each section of the pie chart, in the same order as labels
colors = ["lightcoral", "lightskyblue"]

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot

# Count distinct mice (not drug regimens or rows) in each sex group
mice_per_sex = combined_results_data.groupby("Sex")["Mouse ID"].nunique()

plt.title("Mice gender")
# Labels come from the same Series as the values, so they cannot be swapped
plt.pie(mice_per_sex, labels=mice_per_sex.index, colors=colors,
        autopct="%1.1f%%", shadow=True, startangle=90)
plt.axis("equal")

# Save only after all formatting is applied so the file matches what is shown
plt.savefig("../Images/PandaMice.png")
plt.show()

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas

# One row per sex holding the number of distinct mice in that group.
# The column is named explicitly so the y= argument below refers to a column
# that really exists in the frame.
df = combined_results_data.groupby("Sex")["Mouse ID"].nunique().to_frame(name="Mice")
plot = df.plot.pie(y="Mice", figsize=(5, 5))
plt.savefig("../Images/PyMice.png")

## Quartiles, outliers and boxplots

In [None]:
# Goal of this section: the final tumor volume of each mouse across four of
# the most promising treatment regimens, plus IQR-based outlier detection.
# TODO: that analysis is not implemented yet. What is computed below is the
# largest "Tumor Volume" recorded at each Timepoint across all rows, kept as a
# one-column DataFrame indexed by Timepoint.
maxTumorVol_df = combined_results_data.groupby("Timepoint")["Tumor Volume"].max().to_frame()

# Preview DataFrame
maxTumorVol_df

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and scatter plots

In [None]:
# Data for the Capomulin line and scatter plots: keep every row of the merged
# data whose Drug value is exactly "Capomulin".
is_capomulin = combined_results_data["Drug"] == "Capomulin"
timeTumor = combined_results_data[is_capomulin]
timeTumor

In [None]:
# Keep only the columns at positions 5 and 6 of the Capomulin rows.
# NOTE(review): presumably these are Timepoint and Tumor Volume — the
# selection is positional, so confirm against the column order.
timeTumorCap = timeTumor.iloc[:, [5, 6]]
timeTumorCap

In [None]:
# Axes for the line plot of timepoint versus tumor volume for ONE mouse
# treated with Capomulin.
# The x values must be the recorded timepoints (not row positions), and the
# rows must belong to a single mouse; otherwise the line jumps back and forth
# between animals.

# Use the first mouse that appears in the Capomulin rows
# (raises IndexError if there are no Capomulin rows at all).
capomulin_mouse_id = timeTumor["Mouse ID"].iloc[0]
single_mouse = timeTumor[timeTumor["Mouse ID"] == capomulin_mouse_id].sort_values("Timepoint")

x_axis = single_mouse["Timepoint"]
y_axis = single_mouse["Tumor Volume"]

In [None]:
# Draw the line from the prepared x/y values, then label both axes and
# render the figure.
plt.plot(x_axis, y_axis)
plt.xlabel("Timepoint")
plt.ylabel("Tumor Volume")
plt.show()

In [None]:
# Input for the scatter plot of mouse weight versus average tumor volume for
# the Capomulin regimen: keep only the columns at positions 4 and 6.
# NOTE(review): presumably these are the mouse weight and Tumor Volume — the
# selection is positional, so confirm against the column order.
weightTumorCap = timeTumor.iloc[:, [4, 6]]
weightTumorCap

In [None]:
# Scatter plot of mouse weight versus AVERAGE tumor volume (one point per mouse).

# weightTumorCap shares its row index with timeTumor, so the Mouse ID column
# of timeTumor can be used as the grouping key to average each mouse's rows.
per_mouse_avg = weightTumorCap.groupby(timeTumor["Mouse ID"]).mean()

# First column -> x, second column -> y
# (assumes column 0 is the mouse weight and column 1 the tumor volume —
# TODO confirm against the positional selection that built weightTumorCap)
x_values = per_mouse_avg.iloc[:, 0]
y_values = per_mouse_avg.iloc[:, 1]

plt.scatter(x_values, y_values)
# Axis labels are taken from the actual column names so they cannot drift
plt.xlabel(per_mouse_avg.columns[0])
plt.ylabel(f"Average {per_mouse_avg.columns[1]}")
plt.show()

In [None]:
# Calculate the correlation coefficient and linear regression model for mouse
# weight and average tumor volume for the Capomulin regimen.

# Average each mouse's rows; weightTumorCap shares its row index with
# timeTumor, so timeTumor's Mouse ID column serves as the grouping key.
per_mouse_avg = weightTumorCap.groupby(timeTumor["Mouse ID"]).mean()

# assumes column 0 is the mouse weight and column 1 the tumor volume —
# TODO confirm against the positional selection that built weightTumorCap
weight = per_mouse_avg.iloc[:, 0]
tumor_volume = per_mouse_avg.iloc[:, 1]

# pearsonr returns (coefficient, p-value); only the coefficient is reported
correlation = st.pearsonr(weight, tumor_volume)[0]
print(f"The correlation coefficient between mouse weight and average tumor volume is {round(correlation, 2)}")

# Least-squares line: tumor volume as a linear function of weight
regression = st.linregress(weight, tumor_volume)
print(f"Linear regression model: tumor volume = {regression.slope:.2f} * weight + {regression.intercept:.2f}")