## Observations and Insights 

In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import csv
import os
import numpy as np

# Study data files: both CSVs are expected in the local "data" directory.
mouse_metadata_path = os.path.join("data", "Mouse_metadata.csv")
study_results_path = os.path.join("data", "Study_results.csv")

# Read the mouse metadata first, then the study results, in a single pass.
mouse_metadata, study_results = (
    pd.read_csv(csv_path)
    for csv_path in (mouse_metadata_path, study_results_path)
)

# Show the metadata table as the cell output.
mouse_metadata

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g)
0,k403,Ramicane,Male,21,16
1,s185,Capomulin,Female,3,17
2,x401,Capomulin,Female,16,15
3,m601,Capomulin,Male,22,17
4,g791,Ramicane,Male,11,16
...,...,...,...,...,...
244,z314,Stelasyn,Female,21,28
245,z435,Propriva,Female,12,26
246,z581,Infubinol,Female,24,25
247,z795,Naftisol,Female,13,29


In [2]:
# # Test: use np.mean to calculate the average
# mean_numpy2 = np.mean(mouse_metadata)
# print(f"The average regimen is {mean_numpy2}")

In [3]:
# Tally how many metadata rows exist for each mouse ID.
meta_value_counts = mouse_metadata.loc[:, 'Mouse ID'].value_counts()
meta_value_counts

x581    1
i477    1
j246    1
t573    1
k382    1
       ..
m133    1
c264    1
v766    1
x822    1
r554    1
Name: Mouse ID, Length: 249, dtype: int64

In [4]:
# Tally how many study-result rows were recorded for each mouse ID.
study_counts = study_results.loc[:, 'Mouse ID'].value_counts()
study_counts

g989    13
l471    10
g497    10
w575    10
e584    10
        ..
h428     1
l872     1
d133     1
v199     1
x226     1
Name: Mouse ID, Length: 249, dtype: int64

In [5]:
# Outer-join the mouse metadata with the study results on "Mouse ID" so that
# mice present in either table are retained, then save a CSV copy of the result.
merged_df = mouse_metadata.merge(study_results, how='outer', on='Mouse ID')
merged_df.to_csv('merged_df.csv')

In [6]:
# Preview the first 300 rows of the combined dataset.
merged_df.iloc[:300]

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
295,b742,Capomulin,Male,7,21,25,38.617684,0
296,b742,Capomulin,Male,7,21,30,39.167949,0
297,b742,Capomulin,Male,7,21,35,39.847576,0
298,b742,Capomulin,Male,7,21,40,38.255627,0


In [7]:
# Preview the first five rows of the combined dataset.
merged_df.iloc[:5]

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [8]:
# Order the combined rows by mouse ID ahead of the drop_duplicates() step,
# then report how many non-null mouse IDs the frame holds.
merged_df = merged_df.sort_values(by="Mouse ID")
merged_df["Mouse ID"].count()


1893

In [9]:
#merged_df.loc["g989"]

In [19]:
# Keep only the first row encountered for each distinct "Timepoint" value, then
# list the mouse IDs that remain.
# NOTE(review): de-duplicating on "Timepoint" alone ignores "Mouse ID", so at most
# one row per timepoint survives for the whole study (the recorded output shows
# only mouse 'a203' left). This looks like an exploratory step; the following
# cell de-duplicates on ("Mouse ID", "Timepoint") instead - confirm whether this
# cell is still needed.
merged_drop_timepoint_df = merged_df.drop_duplicates("Timepoint")
merged_drop_timepoint_df['Mouse ID'].unique()

array(['a203'], dtype=object)

In [14]:
# Remove repeated ("Mouse ID", "Timepoint") records, keeping the first occurrence
# of each pair, save the result to "dropped.csv", and display it.
merged_drop_df = merged_df.drop_duplicates(
    subset=["Mouse ID", "Timepoint"],
    keep="first",
)
merged_drop_df.to_csv("dropped.csv")
merged_drop_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
460,a203,Infubinol,Female,20,23,30,59.523197,1
461,a203,Infubinol,Female,20,23,35,61.931650,2
459,a203,Infubinol,Female,20,23,25,56.793208,1
458,a203,Infubinol,Female,20,23,20,55.173336,1
457,a203,Infubinol,Female,20,23,15,52.777870,1
...,...,...,...,...,...,...,...,...
1887,z969,Naftisol,Male,9,30,20,57.898778,2
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4


In [15]:
# Number of distinct mouse IDs left after de-duplication
# (dropna=False so a missing ID would still be counted, as unique() does).
merged_drop_df["Mouse ID"].nunique(dropna=False)

249

In [None]:
#merged_df.loc[merged_df.duplicated(subset=["Mouse ID", "Timepoint"]), "Mouse ID"]


In [None]:
#merged_df.to_csv('merged_df_dropped_duplicated.csv')

In [None]:
#test_df = merged_df.loc[merged_df.duplicated(subset=["Mouse ID", "Timepoint"]), "Mouse ID"].unique()
#test_df

In [None]:
#len(merged_df['Mouse ID'].unique())

In [None]:
# Checking the number of mice in the DataFrame.
#mice_series = merged_df['Mouse ID']
#print(mice_series)


In [None]:
#mouse = merged_df['Mouse ID'][0]
#print(mouse)

In [None]:
#len(mice_series)

In [None]:
# How to get access to the index in the mice_series?

In [None]:
# mice_list = []
# for mouse in range(len(mice_series)):
#     #print(mouse)
#     mice_list.append(mice_series[mouse])
# print(mice_list)

In [None]:
# duplicate_mice_by_id = []
# for mouse in range(mouse_ID_len):
#     duplicate_mice_by_id.append([mouse])
# duplicate_mice_by_id

In [None]:
# Optional: Get all the data for the duplicate mouse ID. 


In [None]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.


In [None]:
# Checking the number of mice in the clean DataFrame.
#merged_df['Mouse ID'].count()

In [None]:
#len(merged_df['Mouse ID'].value_counts())

In [None]:
#merged_df['Mouse ID'].unique()

## Summary Statistics

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# This method is the most straightforward, creating multiple series and putting them all together at the end.

In [None]:
# regimen_df = merged_df.set_index("Drug Regimen")
# regimen_df

In [None]:
# #regimen_summary = pd.DataFrame({"Mean": [regimen_df.mean()]})
# regimen_df = merged_df.groupby(['Drug Regimen'])
# regimen_df.mean()

In [None]:
# tumor_df = regimen_df['Tumor Volume (mm3)']
# volume_average = regimen_df['Tumor Volume (mm3)'].mean()
# volume_average.head()

In [None]:
# Narrow the merged data to the regimen label and the tumor volume measurement.
# NOTE(review): this reads merged_df rather than the de-duplicated merged_drop_df -
# confirm that is intended.
regimen_df = merged_df.loc[:, ["Drug Regimen", "Tumor Volume (mm3)"]]
# average_volume = regimen_df["Tumor Volume (mm3)"].mean()
# median_volume = regimen_df["Tumor Volume (mm3)"].median()
# var_volume = regimen_df["Tumor Volume (mm3)"].var()
# std_volume = regimen_df["Tumor Volume (mm3)"].std()
# sem = regimen_df["Tumor Volume (mm3)"].sem()
# summary_df = pd.DataFrame({'Average Volume (mm3)': [average_volume],
#                            'Median Volume (mm3)': [median_volume],
#                            'Variance (mm3)': [var_volume],
#                            'Standard Deviation (mm3)': [std_volume],
#                            'Standard Error': [sem]})
# summary_df

In [None]:
# Descriptive statistics of tumor volume across all rows (not split by regimen).
regimen_df.loc[:, "Tumor Volume (mm3)"].describe()

In [None]:
# Group the merged rows by drug regimen and build the describe() summary of
# tumor volume for each group.
grouped_regimen_df = merged_df.groupby(by=['Drug Regimen'])
describe_df = (
    grouped_regimen_df["Tumor Volume (mm3)"]
    .describe()
)

In [None]:
# Part 2 - Generate a summary statistics table of mean, median, variance,
# standard deviation, and SEM of the tumor volume for each regimen.
#
# describe_df already holds the per-regimen count, mean, std and quartiles; add
# the statistics it lacks. Each one is computed from the same groupby that built
# describe_df, so the resulting Series are indexed by drug regimen and align
# row-for-row with the table. Computing them on the ungrouped column would give
# a single study-wide scalar that pandas broadcasts to every regimen.
describe_df['Median'] = grouped_regimen_df["Tumor Volume (mm3)"].median()
describe_df['Standard Error'] = grouped_regimen_df["Tumor Volume (mm3)"].sem()
describe_df['Variance'] = grouped_regimen_df["Tumor Volume (mm3)"].var()

describe_df

In [None]:
# Mean tumor volume for each drug regimen.
# Taking np.mean of the whole merged frame averages every column at once, which
# is not a per-regimen figure and fails on the non-numeric columns in recent
# pandas, so the mean is computed on the tumor volume column grouped by regimen.
mean_numpy = merged_df.groupby("Drug Regimen")["Tumor Volume (mm3)"].mean()
print(f"The regimen mean is {mean_numpy}")

In [None]:
# Part 2 - Generate a summary statistics table of mean, median, variance, standard deviation,
# and SEM of the tumor volume for each regimen.
# This method produces everything in a single groupby function.

## Bar Plots

In [None]:
# Generate a bar plot showing the number of mice per time point for each treatment throughout the course of the study using pandas.

In [None]:
# Keep just the mouse identifier and timepoint columns for the bar plots,
# and display the resulting frame.
bar_df = merged_df.loc[:, ["Mouse ID", "Timepoint"]]
bar_df

In [None]:
# Bar chart of how many distinct mice were observed at each timepoint.
# Plotting one bar per row with the raw timepoint as its height does not show a
# mouse count, so the rows are first aggregated to one count per timepoint.
# NOTE(review): bar_df has no "Drug Regimen" column, so this chart is not split
# by treatment - extend bar_df if a per-regimen breakdown is required.
mice_per_timepoint = bar_df.groupby("Timepoint")["Mouse ID"].nunique()

# Evenly spaced bar positions, labelled with the actual timepoint values.
x_axis = np.arange(len(mice_per_timepoint))
plt.bar(x_axis, mice_per_timepoint.values, color='r', alpha=0.75, align='center')
plt.xticks(x_axis, mice_per_timepoint.index)
plt.title("Mice Count vs. Time Points")
plt.xlabel("Time Point")
plt.ylabel("Mouse Count")
plt.show()

In [None]:
#bar_df2 = merged_df.groupby[["Mouse ID", "Timepoint"]]


In [None]:
# Generate a bar plot showing the number of mice per time point for each treatment throughout the course of the study using pandas.

In [None]:
# Preview the first five rows of the plotting frame.
bar_df.iloc[:5]

In [None]:
# Histogram of the recorded timepoints, i.e. how many observations fall in each
# timepoint bin. Only the numeric "Timepoint" column is passed to hist(); the
# string "Mouse ID" column cannot be binned. The title and axis labels describe
# what the histogram shows: timepoint on x, number of observations on y.
plt.hist(bar_df["Timepoint"])
plt.title("Observations per Time Point")
plt.xlabel("Time Point")
plt.ylabel("Number of Observations")
plt.show()

In [None]:
# Generate a bar plot showing the number of mice per time point for each treatment throughout the course of the study using pyplot.

## Pie Plots

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot

## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the most promising treatment regimens. Calculate the IQR and quantitatively determine if there are any potential outliers. 


In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest

## Line and Scatter Plots

In [None]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin


In [None]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen



## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
