## Observations and Insights 

# Resources
Lambda - https://www.programiz.com/python-programming/anonymous-function#:~:text=Use%20of%20Lambda%20Function%20in%20python%20We%20use,built-in%20functions%20like%20filter%20%28%29%2C%20map%20%28%29%20etc.
DataFrame Merge - https://www.geeksforgeeks.org/joining-two-pandas-dataframes-using-merge/
AGG - https://pandas.pydata.org/docs/getting_started/intro_tutorials/06_calculate_statistics.html
BoxPlots - https://www.askpython.com/python/examples/boxplots
         - (https://matplotlib.org/gallery/pyplots/boxplot_demo_pyplot.html#sphx-glr-gallery-pyplots-boxplot-demo-pyplot-py)  
Correlation - https://realpython.com/numpy-scipy-pandas-correlation-python/#linear-correlation

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
from scipy.stats import linregress
from scipy.stats import sem


In [None]:
# Locations of the two raw study files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load each CSV into its own DataFrame
mouse_metadata, study_results = (
    pd.read_csv(csv_path)
    for csv_path in (mouse_metadata_path, study_results_path)
)

# Join the two tables on their shared 'Mouse ID' key; the outer join keeps
# IDs that appear in either file
mouse_df = mouse_metadata.merge(study_results, how='outer', on='Mouse ID')

# Preview the first rows of the combined table
mouse_df.head()

In [None]:
# How many distinct mouse IDs appear in the combined data?
all_mouse_ids = mouse_df['Mouse ID']
all_mouse_ids.nunique()

In [None]:
# Flag every row whose (Mouse ID, Timepoint) pair occurs more than once;
# keep=False marks all copies of a repeated pair, not only the later ones.
key_columns = ['Mouse ID', 'Timepoint']
dup_mouse = mouse_df.duplicated(subset=key_columns, keep=False)

# Pull the key columns of the flagged rows, then list the affected mouse IDs
dup_mouse_data = mouse_df.loc[dup_mouse.values, key_columns]
dup_mouse_ID = dup_mouse_data['Mouse ID'].unique()

print(dup_mouse_ID)

In [None]:
# Optional: Get all the data for the duplicate mouse ID.
# Show every column of every row that belongs to a mouse with a repeated
# (Mouse ID, Timepoint) pair -- not just the two key columns of the repeated
# rows -- so the conflicting measurements can be inspected in full.
dup_ids = mouse_df.loc[dup_mouse.values, 'Mouse ID'].unique()
dup_mouse_data = mouse_df.loc[mouse_df['Mouse ID'].isin(dup_ids)]
dup_mouse_data

In [None]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
# The IDs to remove come from the duplicate check (dup_mouse_ID) rather than a
# hard-coded literal, so the cleaning step follows whatever the data contains.
# All rows of a flagged mouse are removed, not only the repeated timepoints.
mouse_df_cln = mouse_df.loc[~mouse_df['Mouse ID'].isin(dup_mouse_ID)]
mouse_df_cln

In [None]:
# How many distinct mouse IDs remain after cleaning?
clean_mouse_ids = mouse_df_cln['Mouse ID']
clean_mouse_ids.nunique()

## Summary Statistics

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
# The tumor-volume column is selected BEFORE aggregating: reducing the whole
# grouped frame also tries to aggregate text columns such as 'Mouse ID',
# which raises TypeError on pandas >= 2.0 (and wastes work on older versions).
tumor_by_regimen = mouse_df_cln.groupby('Drug Regimen')['Tumor Volume (mm3)']
t_mean = tumor_by_regimen.mean()
t_median = tumor_by_regimen.median()
t_variance = tumor_by_regimen.var()
t_std_dev = tumor_by_regimen.std()
t_sem = tumor_by_regimen.sem()

# Single-column frame of the means (not used by the summary table below)
mean_df = pd.DataFrame(t_mean)

# Summary dataframe: one row per regimen, one column per statistic.
# Variance is included so the table matches the statistics listed above.
sum_grpd_df = pd.DataFrame({
    'Mean': t_mean,
    'Median': t_median,
    'Variance': t_variance,
    'Std_Dev': t_std_dev,
    'SEM': t_sem,
})

# Display summary dataframe
sum_grpd_df

In [None]:
# Same per-regimen summary of tumor volume (mean, median, variance, standard
# deviation, SEM), produced with one named-aggregation call.
tumor_col = 'Tumor Volume (mm3)'
stat_by_label = {
    'Mean': 'mean',
    'Median': 'median',
    'Variance': 'var',
    'STD_Dev': 'std',
    'SEM': 'sem',
}
# Each keyword becomes an output column: label=(source column, reducer)
tblsum_df = mouse_df_cln.groupby('Drug Regimen').agg(
    **{label: (tumor_col, stat) for label, stat in stat_by_label.items()}
)
# Show the resulting table
tblsum_df

## Bar and Pie Charts

In [None]:
# Bar chart, drawn through the pandas plotting API, of how many tumor-volume
# measurements were recorded for each drug regimen.
bchrt_data = mouse_df_cln.groupby('Drug Regimen')['Tumor Volume (mm3)'].count()
bchrt_plt_data = bchrt_data.plot(
    kind='bar',
    rot='vertical',
    title='Number of Measurements for Each Treatment Regimen',
)
# Axis labels are set on the Axes object returned by pandas
bchrt_plt_data.set_xlabel("Drug Regimen")
bchrt_plt_data.set_ylabel("Number of Measurements")


In [None]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pyplot.
# Measurements per regimen (count of tumor-volume rows)
bchrt_data = mouse_df_cln.groupby('Drug Regimen')['Tumor Volume (mm3)'].count()

# Draw the bars with pyplot itself: regimen names on the x-axis, counts as heights
plt.bar(bchrt_data.index, bchrt_data.values)
# Rotate the regimen names so they do not overlap
plt.xticks(rotation='vertical')
plt.xlabel('Drug Regimen')
plt.ylabel('Number of Measurements')
plt.title('Number of Measurements for Each Treatment Regimen')
plt.show()


In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas
# Count DISTINCT mice per sex. Counting rows would count measurements instead,
# weighting each mouse by how many timepoints it was observed at.
p_gndr = mouse_df_cln.groupby('Sex')['Mouse ID'].nunique()
pie_plot = p_gndr.plot.pie(
    ylabel='Sex',
    title='Distribution of Female vs Male Mice',
    startangle=120,
    autopct='%1.2f%%',
    fontsize=16,
)


In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
# Count DISTINCT mice per sex. Counting rows would count measurements instead,
# weighting each mouse by how many timepoints it was observed at.
p_gndr = mouse_df_cln.groupby('Sex')['Mouse ID'].nunique()
plt.pie(p_gndr, labels=p_gndr.index, autopct="%1.2f%%", shadow=True, startangle=120)
plt.title('Distribution of Female vs Male Mice')
plt.ylabel('Sex')
plt.show()

## Quartiles, Outliers and Boxplots

In [None]:
# Groundwork for the final tumor volume of each mouse under Capomulin,
# Ramicane, Infubinol, and Ceftamin.

# Latest (largest) timepoint recorded per mouse, sorted ascending, then turned
# into a two-column frame: 'Mouse ID' and 'max_timepoint'
last_timepoint = mouse_df_cln.groupby('Mouse ID')['Timepoint'].max().sort_values()
maxtp_df = last_timepoint.rename('max_timepoint').reset_index()

# Attach each mouse's last timepoint to every one of its rows
mrg_mouse_df_cln = mouse_df_cln.merge(maxtp_df, on='Mouse ID')
mrg_mouse_df_cln.head()

In [None]:
# Put treatments into a list for the loop (and later for plot labels)
drug_lst = ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']

# All rows for the four regimens of interest
drugs_df = mrg_mouse_df_cln[mrg_mouse_df_cln['Drug Regimen'].isin(drug_lst)]

# Keep only the row recorded at each mouse's last timepoint. Matching on the
# merged 'max_timepoint' column makes the result independent of row order,
# instead of assuming the last row of each group is the latest measurement.
final_rows = drugs_df[drugs_df['Timepoint'] == drugs_df['max_timepoint']]

# Final tumor volume per (regimen, mouse), rounded to 3 decimals
lst_tumor_df = (
    final_rows.groupby(['Drug Regimen', 'Mouse ID'])
    .agg(lst_tumor_size=('Tumor Volume (mm3)', 'last'))
    .round(3)
)

# View the groupby dataframe
lst_tumor_df

In [None]:
# Reshape dataframe
# stack(level=0) moves the column labels into the row index, and
# unstack(level=0) then pivots the outermost row-index level into columns.
# NOTE(review): assuming lst_tumor_df is indexed by (Drug Regimen, Mouse ID),
# this yields one column per drug regimen, with NaN where a mouse was not on
# that regimen -- confirm against the displayed output.
lst_tumor_trans_df = lst_tumor_df.stack(level=0).unstack(level=0)
# View to confirm transpose
lst_tumor_trans_df.head()

In [None]:
# Create empty list to fill with tumor vol data (for plotting)
boxplot_list = []

# Calculate the IQR and quantitatively determine if there are any potential outliers.
# Quartiles and bounds are computed at full precision and rounded only when
# printed, so rounding error cannot shift the outlier cut-offs.
print()  # blank line separating the report from any earlier output
for drug in drug_lst:
    # Final tumor volumes for this regimen (NaN rows are mice on other regimens)
    final_volumes = lst_tumor_trans_df[drug].dropna()

    quartiles = final_volumes.quantile([.25, .5, .75])
    lowerq = quartiles[0.25]
    upperq = quartiles[0.75]
    iqr = upperq - lowerq
    # Conventional 1.5 * IQR fences
    lower_bound = lowerq - (1.5 * iqr)
    upper_bound = upperq + (1.5 * iqr)

    # Determine outliers using upper and lower bounds
    is_outlier = (final_volumes < lower_bound) | (final_volumes > upper_bound)
    outliers = final_volumes[is_outlier]

    print(f"{drug} IQR data is:")
    print(f"Lower quartile of {drug} is: {round(lowerq, 2)}")
    print(f"Upper quartile of {drug} is: {round(upperq, 2)}")
    print(f"Interquartile range of {drug} is: {round(iqr, 2)}")
    print(f"Median of {drug} is: {round(quartiles[0.5], 2)} ")
    print(f"Values below {round(lower_bound, 2)} for {drug} could be outliers.")
    print(f"Values above {round(upper_bound, 2)} for {drug} could be outliers.")
    print(f"Potential outliers for {drug}: {outliers.round(2).tolist() or 'none'}")
    print()

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest
# Build the per-drug value lists from scratch on every run. Appending to an
# existing list would double the data each time this cell is re-executed.
boxplot_list = [list(lst_tumor_trans_df[drug].dropna()) for drug in drug_lst]

# Plot the list of lists using a boxplot utilizing the same y-axis for all plots
fig1, ax = plt.subplots(figsize=(9, 7))
ax.set_title('Final measured Tumor volume by Drug Regimen')
ax.set_xlabel('Drug Regimen')
ax.set_ylabel('Tumor Vol (mm3)')
# sym='gD' draws outlier points as green diamonds
ax.boxplot(boxplot_list, notch=False, sym='gD')
# Boxes sit at positions 1..N; derive the ticks from the drug list so the
# labels stay aligned if the list changes
ax.set_xticks(list(range(1, len(drug_lst) + 1)))
ax.set_xticklabels(drug_lst)

plt.show()

## Line and Scatter Plots

In [None]:
# Select every measurement recorded for mouse m957 (a Capomulin-treated mouse)
is_m957 = mouse_df_cln['Mouse ID'] == 'm957'
cap_mouse = mouse_df_cln.loc[is_m957]

# Line chart of tumor volume (y) against timepoint (x); the circle marker
# makes each individual measurement visible
timepoints = cap_mouse['Timepoint']
tumor_volume = cap_mouse['Tumor Volume (mm3)']
plt.plot(timepoints, tumor_volume, marker='o')

# Title and axis labels
plt.title("Capomulin Treatment of Mouse m957")
plt.xlabel("Time (days)")
plt.ylabel("Tumor Volume (mm3)")

# Render the figure
plt.show()

## Correlation and Regression

In [None]:
# Calculate the correlation coefficient
# for mouse weight and average tumor volume for the Capomulin regimen
# The cleaned frame has no 'Mouse_weight' / 'Tumor_vol_mean' columns, so the
# per-mouse values are derived here: one row per Capomulin mouse holding its
# weight and its mean tumor volume across all timepoints.
capomulin_rows = mouse_df_cln.loc[mouse_df_cln['Drug Regimen'] == 'Capomulin']
per_mouse = capomulin_rows.groupby('Mouse ID')[['Weight (g)', 'Tumor Volume (mm3)']].mean()

# pearsonr returns (coefficient, p-value); only the coefficient is reported
correlation = st.pearsonr(per_mouse['Weight (g)'], per_mouse['Tumor Volume (mm3)'])
print(f"The correlation between both factors is {round(correlation[0],2)}")

In [None]:
# Correlation and linear regression of mouse weight against average tumor
# volume for the Capomulin regimen, plotted as a scatter with the fitted line.

# All measurements for Capomulin-treated mice
cap_df = mouse_df_cln.loc[mouse_df_cln['Drug Regimen'] == 'Capomulin']

# Mean tumor volume per mouse, attached back onto the Capomulin rows
avg_vol_df = pd.DataFrame(cap_df.groupby('Mouse ID')['Tumor Volume (mm3)'].mean().sort_values()).reset_index().rename(columns={'Tumor Volume (mm3)': 'avg_tumor_vol'})
avg_vol_df = pd.merge(cap_df, avg_vol_df, on='Mouse ID')

# One row per mouse. De-duplicating on 'Mouse ID' (rather than on the value
# pair) guarantees two mice with identical numbers are both kept.
final_avg_vol_df = avg_vol_df.drop_duplicates(subset='Mouse ID')[['Weight (g)', 'avg_tumor_vol']]
x = final_avg_vol_df['Weight (g)']
y = final_avg_vol_df['avg_tumor_vol']

# Pearson correlation; element 0 of the result is the coefficient
correlation = st.pearsonr(x,y)
print(f'''The correlation between weight and average tumor volume
on the Capomulin regimen is {round(correlation[0],4)}.''')

# Least-squares fit and the fitted y-value for every observed weight
(slope, intercept, rvalue, pvalue, stderr) = linregress(x, y)
reg_vals = x * slope + intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))

# Plot linear regression on top of scatter plot
plt.scatter(x,y)
plt.plot(x,reg_vals,"r-")

# Annotate linear regression
plt.annotate(line_eq,(20,36),fontsize=15,color="red")

# Add labels and title to plot
plt.xlabel("Weight (g)")
plt.ylabel("Average Tumor Volume (mm3)")
plt.title('Average Tumor Volume by Weight')
plt.show()