In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as st
import random

In [None]:
def clean_headers(df):
    """Replace spaces in the column labels of ``df`` with underscores.

    The labels are reassigned on the frame that was passed in (no copy is
    made), and that same frame is returned so the call can also be used as
    an expression.
    """
    underscored_labels = df.columns.str.replace(" ", "_")
    df.columns = underscored_labels
    return df

def select_random_mouse(results_cleaned, drug_regimen):
    """Pick one Mouse_ID at random from the mice treated with ``drug_regimen``.

    Parameters
    ----------
    results_cleaned : DataFrame with 'Drug_Regimen' and 'Mouse_ID' columns.
    drug_regimen : value compared for equality against the 'Drug_Regimen' column.

    Returns
    -------
    One of the distinct 'Mouse_ID' values found for that regimen.

    Raises
    ------
    IndexError
        If no row matches ``drug_regimen``.
    """
    drug_data = results_cleaned[results_cleaned['Drug_Regimen'] == drug_regimen]

    # .unique() returns a NumPy array. random.choice is handed a plain list
    # instead, because its emptiness check is version-dependent for arrays
    # (Python 3.11 evaluates the array's truthiness, which raises ValueError
    # as soon as the array holds more than one element).
    mouse_ids = drug_data['Mouse_ID'].unique().tolist()
    if not mouse_ids:
        raise IndexError(f"no mice found for drug regimen {drug_regimen!r}")

    return random.choice(mouse_ids)

# Prepare the data

In [None]:
# Read both study CSVs into DataFrames
metadata_df, results_df = (
    pd.read_csv(csv_path)
    for csv_path in ('data/Mouse_metadata.csv', 'data/Study_results.csv')
)

# clean_headers relabels the columns of the frame it is given (spaces become
# underscores). The second call is the cell's last expression, so results_df
# is what the cell displays.
clean_headers(metadata_df)
clean_headers(results_df)

In [None]:
# Combine mouse metadata with the study results, matching rows on Mouse_ID.
# An outer join keeps IDs that appear in only one of the two files.
results_merged = metadata_df.merge(results_df, on='Mouse_ID', how='outer')

print('Merged results:')
results_merged

In [None]:
# Count the distinct mouse IDs present in the merged data
unique_id_count = results_merged.loc[:, 'Mouse_ID'].nunique()

print("There are {} unique Mouse ID's".format(unique_id_count))

In [None]:
# Check for mouse IDs that have duplicated time points,
# then display the rows associated with those IDs.

# keep=False flags every row of a repeated (Mouse_ID, Timepoint) pair,
# not only the second and later occurrences.
filt = results_merged.duplicated(subset=['Mouse_ID', 'Timepoint'], keep=False)

duplicate_data = results_merged[filt]
num_duplicates = duplicate_data['Mouse_ID'].nunique()

if num_duplicates == 1:
    print('There was 1 unique ID detected with duplicated timepoints.')
elif num_duplicates > 1:
    # The plural message applies to two or more IDs; zero falls through to the else branch.
    print(f"There were {num_duplicates} unique ID's detected with duplicated timepoints.")
else:
    print("There were no ID's detected with duplicated timepoints.")
print('Duplicated data:')

duplicate_data

In [None]:
# Create results_cleaned with the duplicated mice removed; results_cleaned is
# used for all remaining steps.
# The IDs to drop are taken from duplicate_data (the rows flagged as repeated
# Mouse_ID/Timepoint pairs) rather than hard-coding a single ID, so every
# record of every affected mouse is removed.
duplicate_mouse_ids = duplicate_data['Mouse_ID'].unique()
results_cleaned = results_merged[~results_merged['Mouse_ID'].isin(duplicate_mouse_ids)].copy()


In [None]:
# Recount the distinct mouse IDs, this time on the cleaned data
unique_id_count = results_cleaned.loc[:, 'Mouse_ID'].nunique()

print("There are now {} unique Mouse ID's".format(unique_id_count))

# Generate Summary Statistics

In [None]:
# Summary statistics of tumour volume, one row per drug regimen.

drug_group = results_cleaned.groupby(by='Drug_Regimen')
tumour_by_drug = drug_group['Tumour_Volume_(mm3)']

# Each reduction returns one value per regimen, indexed by regimen name.
# Assembling the columns in a single constructor call gives numeric columns
# directly, so .round(2) below always applies; it also avoids growing an
# empty frame one row at a time.
summary_stats = pd.DataFrame({
    'Mean': tumour_by_drug.mean(),
    'Median': tumour_by_drug.median(),
    'Variance': tumour_by_drug.var(),
    'Stdev': tumour_by_drug.std(),
    'SEM': tumour_by_drug.sem(),
})

print('Tumor volume statistics by drug:')
summary_stats.round(2)

In [None]:
# Using the aggregation method, produce the same summary statistics in a single line.
# The aggregation runs on the raw tumour volumes grouped by regimen. Aggregating
# the already-summarised table would instead yield the mean of the means, the
# variance of the variances, and so on.

agg_summary = (
    results_cleaned
    .groupby('Drug_Regimen')['Tumour_Volume_(mm3)']
    .agg(Mean='mean', Median='median', Variance='var', Stdev='std', SEM='sem')
)

print("Aggregated summary of statistics for all drugs")
agg_summary

# Create Bar Charts and Pie Charts

In [None]:
# Data for the bar charts: number of recorded time points per drug regimen

timepoints_by_regimen = results_cleaned.groupby("Drug_Regimen")["Timepoint"]
time_points_drug = timepoints_by_regimen.count()

time_points_drug

In [None]:
# Bar chart 1: total number of time points for all mice tested for each drug
# throughout the study, created with the pandas plotting API.

fig_size = (8, 5)
chart_1 = time_points_drug.plot(
    kind="bar",
    title="Total Time Points per Drug Regimen",
    figsize=fig_size,
    color='green',
)

# Use the same axis labels as the matplotlib version of this chart so the two
# figures can be compared directly.
chart_1.set_xlabel("Drug Regimen")
chart_1.set_ylabel("Timepoint Count")

plt.show()

In [None]:
# Bar chart 2: the same chart, drawn with matplotlib directly
fig, chart_2 = plt.subplots(figsize=fig_size)

chart_2.bar(
    x=time_points_drug.index,
    height=time_points_drug.values,
    color='green',
)

chart_2.set(
    title="Total Time Points per Drug Regimen",
    xlabel="Drug Regimen",
    ylabel="Timepoint Count",
)

# Regimen names are long; turn the tick labels vertical so they do not overlap
chart_2.tick_params(axis='x', labelrotation=90)

plt.show()

In [None]:
# Number of recorded time points per sex, kept as a one-column DataFrame.
# This counts rows of results_cleaned, not distinct mice.
sex_timepoint_grp = results_cleaned.groupby('Sex')[['Timepoint']].count()

sex_timepoint_grp

In [None]:
# Pie chart of the per-sex counts, drawn through the pandas plotting API
sex_timepoint_grp.plot(
    kind='pie',
    y='Timepoint',
    labels=sex_timepoint_grp.index,
    autopct='%1.1f%%',
)

In [None]:
# The same pie chart, drawn with matplotlib.pyplot
sex_counts = sex_timepoint_grp['Timepoint']
plt.pie(sex_counts, labels=sex_counts.index, autopct='%1.1f%%')

plt.show()

# Calculate Quartiles, Find Outliers, and Create a Box Plot

In [None]:
# Last recorded row for every (Mouse_ID, Drug_Regimen) pair.
# tail(1) only returns the final timepoint if each group's rows are in
# ascending Timepoint order, so sort explicitly instead of relying on the order
# left behind by the merge. The stable sort keeps tied rows in their original
# relative order, and sort_index() restores the original row order for display.
mouseID_timepoint_max = (
    results_cleaned
    .sort_values('Timepoint', kind='stable')
    .groupby(["Mouse_ID", "Drug_Regimen"])
    .tail(1)
    .sort_index()
)

print("Data containing the final timepoint for each unique Mouse_ID")
mouseID_timepoint_max

In [None]:
# Keep only the final-timepoint rows belonging to the four regimens of interest
treatments = ['Ramicane', 'Capomulin', 'Infubinol', 'Ceftamin']
is_selected_regimen = mouseID_timepoint_max['Drug_Regimen'].isin(treatments)
boxplot_groups = mouseID_timepoint_max.loc[is_selected_regimen]

boxplot_groups

In [None]:
# For each selected regimen, collect the final tumour volumes, the IQR, and any
# values lying more than 1.5 * IQR outside the quartiles.
treatments = ['Ramicane', 'Capomulin', 'Infubinol', 'Ceftamin']
tumor_vol = []   # one list of final tumour volumes per regimen (used by the box plot)
outliers = []    # one list of outlying volumes per regimen (possibly empty)
iqr_list = []    # one IQR per regimen

for treatment in treatments:
    regimen = boxplot_groups[boxplot_groups['Drug_Regimen'] == treatment]['Tumour_Volume_(mm3)']
    tumor_vol.append(regimen.tolist())

    quartiles = regimen.quantile([.25, .5, .75])
    lowerq = quartiles[0.25]
    upperq = quartiles[0.75]
    iqr = upperq - lowerq
    iqr_list.append(iqr)
    lower_bound = lowerq - (1.5 * iqr)
    upper_bound = upperq + (1.5 * iqr)
    outlier = regimen[(regimen < lower_bound) | (regimen > upper_bound)].tolist()
    outliers.append(outlier)

# Build each frame from a dict so that 'Outliers' is always exactly one column
# holding the per-regimen list. pd.DataFrame(outliers) would spread the lists
# across as many columns as the longest list, and naming that single column
# would then fail whenever a regimen has zero or several outliers.
outliers_df = pd.DataFrame({'Outliers': outliers})
iqr_df = pd.DataFrame({'IQR': iqr_list})
result_df = pd.concat([iqr_df, outliers_df], axis=1)
result_df.index = treatments

print("List of IQR and Outliers Identified")
# DataFrame.round skips the list-valued column, so round its contents separately for display.
result_df.assign(
    Outliers=result_df['Outliers'].map(lambda values: [round(value, 2) for value in values])
).round(2)

In [None]:
# Rebuild the IQR / outlier tables from the lists filled in by the loop above.
# Building from a dict keeps 'Outliers' as one column holding each regimen's
# list, whatever the number of outliers; pd.DataFrame(outliers) yields one
# column per list position and breaks the single-column rename.
outliers_df = pd.DataFrame({'Outliers': outliers})
iqr_df = pd.DataFrame({'IQR': iqr_list})
result_df = pd.concat([iqr_df, outliers_df], axis=1)
result_df.index = treatments

# Round the values inside each list for display (DataFrame.round ignores list cells).
outliers_df['Outliers'].map(lambda values: [round(value, 2) for value in values]).to_frame()

In [None]:
# Generate a box plot that shows the distribution of the final tumour volume for
# all the mice in each treatment group.
# Highlight any potential outliers in the plot by changing their color and style.

# Marker style applied to points drawn beyond the whiskers
outlier_style = dict(marker='o', markerfacecolor='red', markeredgecolor='black', markersize=8)

fig1, ax1 = plt.subplots()
ax1.set_title('Final Tumour Volume Across Selected Regimens')
ax1.set_ylabel('Tumour Volume (mm3)')
ax1.boxplot(tumor_vol, flierprops=outlier_style)
# Label the boxes through the axis rather than the boxplot `labels` keyword,
# which newer matplotlib releases have renamed to `tick_labels`.
ax1.set_xticklabels(treatments)

plt.show()


# Create Line and Scatter Plots

In [None]:
# Select a mouse that was treated with Capomulin, and generate a line plot of
# tumour volume versus time point for that mouse.

drug_regimen = 'Capomulin'
mouse_id = 'x401'

# Filter on the drug_regimen variable (not a repeated literal) so the selection
# follows the setting above; sort by Timepoint so the line is drawn in time order.
mouse_data = results_cleaned[
    (results_cleaned['Mouse_ID'] == mouse_id)
    & (results_cleaned['Drug_Regimen'] == drug_regimen)
].sort_values('Timepoint')

plt.plot(mouse_data['Timepoint'], mouse_data['Tumour_Volume_(mm3)'], '-o')
plt.xlabel('Timepoint')
plt.ylabel('Tumour Volume (mm3)')
plt.title(f'Tumour Volume vs Timepoint for Mouse {mouse_id}')

plt.show()

Try with a random mouse

In [None]:
# Same line plot, for a mouse drawn at random from the chosen regimen.
drug_regimen = 'Capomulin'

random_mouse_id = select_random_mouse(results_cleaned, drug_regimen)

# The mouse was drawn from drug_regimen, so the rows must be filtered on the
# same variable; a hard-coded regimen here would yield an empty plot as soon as
# drug_regimen is changed. Sort by Timepoint so the line is drawn in time order.
mouse_data = results_cleaned[
    (results_cleaned['Mouse_ID'] == random_mouse_id)
    & (results_cleaned['Drug_Regimen'] == drug_regimen)
].sort_values('Timepoint')

plt.plot(mouse_data['Timepoint'], mouse_data['Tumour_Volume_(mm3)'], '-o')
plt.xlabel('Timepoint')
plt.ylabel('Tumour Volume (mm3)')
plt.title(f'Tumour Volume vs Timepoint for Mouse {random_mouse_id}')

print(f'Drug of choice: {drug_regimen}.')
plt.show()

In [None]:
# Filter the data for the Capomulin regimen

capomulin_data = results_cleaned[results_cleaned['Drug_Regimen'] == 'Capomulin']

# Average every numeric column per mouse. numeric_only=True is required because
# the frame also holds string columns (e.g. Drug_Regimen), which make a plain
# .mean() raise TypeError on pandas >= 2.0.
grouped_data = capomulin_data.groupby('Mouse_ID').mean(numeric_only=True)

plt.scatter(grouped_data['Weight_(g)'], grouped_data['Tumour_Volume_(mm3)'])
plt.xlabel('Mouse Weight (g)')
plt.ylabel('Average Tumour Volume (mm3)')
plt.title('Average Tumour Volume vs Mouse Weight for Capomulin Regimen')

plt.show()

# Calculate Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model between mouse
# weight and average tumour volume for the Capomulin treatment.

# numeric_only=True skips the string columns, which would otherwise make
# .mean() raise TypeError on pandas >= 2.0.
grouped_data = capomulin_data.groupby('Mouse_ID').mean(numeric_only=True)

x_values = grouped_data['Weight_(g)']
y_values = grouped_data['Tumour_Volume_(mm3)']

(slope, intercept, rvalue, pvalue, stderr) = st.linregress(x_values, y_values)
correlation = st.pearsonr(x_values, y_values)

# Fitted tumour volume at each observed weight, used to draw the regression line
regress_values = x_values * slope + intercept
line_eq = "y = " + str(round(slope, 2)) + "x + " + str(round(intercept, 2))

plt.scatter(x_values, y_values)
plt.plot(x_values, regress_values, "r-", linewidth=1)
# (15, 45) is the data-coordinate position at which the equation text is placed
plt.annotate(str(line_eq), (15, 45), fontsize=10, color="red")
plt.xlabel('Mouse Weight (g)')
plt.ylabel('Average Tumour Volume (mm3)')
plt.title('Average Tumour Volume vs Mouse Weight for Capomulin Regimen')

print(f"The correlation coefficient is: {round(correlation[0], 2)}")
plt.show()