## Observations and Insights 

In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset

mouse_df = pd.DataFrame(mouse_metadata)
mouse_df

results_df = pd.DataFrame(study_results)
results_df

merge_df = pd.merge(mouse_df, results_df, on="Mouse ID")
merge_df.head(100)
merge_df


FileNotFoundError: [Errno 2] File data/Mouse_metadata.csv does not exist: 'data/Mouse_metadata.csv'

In [None]:
# Checking the number of mice in the DataFrame.
len(merge_df["Mouse ID"].unique())


In [2]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
merge_df.duplicated(subset = ["Mouse ID","Timepoint"])
merge_df[merge_df.duplicated(subset=["Mouse ID", "Timepoint"])]                              
                               

NameError: name 'merge_df' is not defined

In [None]:
new_merge_df = merge_df[merge_df["Mouse ID"] != "g989"]
new_merge_df

In [None]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
new_merge_df.drop_duplicates(subset = ["Mouse ID"], keep = 'last', inplace = False)   

clean_df = new_merge_df.drop_duplicates(subset = ["Mouse ID"], keep = 'last', inplace = False)
clean_df

In [None]:
# Checking the number of mice in the clean DataFrame.
len(clean_df)

## Summary Statistics

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

std = clean_df.groupby('Drug Regimen').std()['Tumor Volume (mm3)'] 

var = clean_df.groupby('Drug Regimen').median()['Tumor Volume (mm3)']

mean= clean_df.groupby('Drug Regimen').mean()['Tumor Volume (mm3)']

median = clean_df.groupby('Drug Regimen').median()['Tumor Volume (mm3)']

sem = clean_df.groupby('Drug Regimen').sem()['Tumor Volume (mm3)']

stat_table_df = pd.DataFrame({"Mean":mean, "Median": median, "Variance": var, "STD": std, "SEM": sem})
stat_table_df

# This method is the most straightforward, creating multiple series and putting them all together at the end.

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# This method produces everything in a single groupby function.

function = 'mean', 'median', 'var', 'std', 'sem'

agg_df = clean_df.groupby('Drug Regimen').agg(function)['Tumor Volume (mm3)']
agg_df

## Bar Plots

In [None]:
# Generate a bar plot showing the number of mice per time point for each treatment throughout the course of the study 
#using pandas.

barplot = new_merge_df[["Mouse ID", "Timepoint"]]
barplot1 = barplot.groupby("Timepoint")
barplot2 = barplot1.count()



data = {"Timepoint": [0,5,10,15,20,25,30,35,40,45],

        "Number of Mice" :[248,236,221,205,193,182,170,153,142,130]

        };

# Dictionary loaded into a DataFrame       
dataFrame = pd.DataFrame(data = data)
# Draw a vertical bar chart
dataFrame.plot.bar(x="Timepoint", y="Number of Mice", title="Number of Mice per Treatment");

In [3]:
# Generate a bar plot showing the number of mice per time point for each treatment throughout the course of the study 
#using pyplot.

barplot = new_merge_df[["Mouse ID", "Timepoint"]]
barplot1 = barplot.groupby("Timepoint")
barplot2 = barplot1.count()

barplot2.plot(kind="bar")

plt.title = ("Number of Mice Per Treatment")
plt.xlabel = ("Timepoint")
plt.ylabel = ("Number of Mice")


NameError: name 'new_merge_df' is not defined

## Pie Plots

In [4]:
# Generate a pie plot showing the distribution of female versus male mice using pandas

gender = clean_df["Sex"]
gender = gender.value_counts()
male = gender["Male"]
female = gender["Female"]

data = {'Gender': [male, female]}
df = pd.DataFrame(data,columns=['Gender'])

my_labels = 'Male','Female'

fig, ax = plt.subplots()
ax.pie(df,labels=my_labels,autopct='%1.1f%%')
# plt.title('Distribution of Female versus Male Mice')
# plt.axis('equal')
ax.set(aspect="equal", title='Distribution of Female versus Male Mice')
plt.show()

NameError: name 'clean_df' is not defined

In [5]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot


my_data = [male, female]
my_labels = 'Male','Female'
plt.pie(my_data,labels=my_labels,autopct='%1.1f%%')
plt.title('Distribution of Female versus Male Mice')
plt.axis('equal')
plt.show()

NameError: name 'male' is not defined

## Quartiles, Outliers and Boxplots

In [6]:
# Calculate the final tumor volume of each mouse across four of the most promising treatment regimens. Calculate the IQR and quantitatively determine if there are any potential outliers. 
ramicane = clean_df.loc[clean_df["Drug Regimen"] == "Ramicane", :]
capomulin1 = clean_df.loc[clean_df["Drug Regimen"] == "Capomulin", :]
infubinol = clean_df.loc[clean_df["Drug Regimen"] == "Infubinol", :]
ceftamin = clean_df.loc[clean_df["Drug Regimen"] == "Ceftamin", :]

ramicane = ramicane["Tumor Volume (mm3)"]
capomulin= capomulin1["Tumor Volume (mm3)"]
infubinol= infubinol["Tumor Volume (mm3)"]
ceftamin= ceftamin["Tumor Volume (mm3)"]

ram_df= pd.DataFrame(ramicane)
cap_df= pd.DataFrame(capomulin)
inf_df= pd.DataFrame(infubinol)
cef_df= pd.DataFrame(ceftamin)


NameError: name 'clean_df' is not defined

In [7]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest

drug_box = [ramicane, capomulin, infubinol, ceftamin]
fig1, ax1 = plt.subplots()
ax1.set_title('Final Tumor Volume of Mice for \nTop Treatment Regimens')
ax1.set_xlabel('Drug Regimen')
ax1.set_ylabel('Tumor Volume (mm3)')
ax1.boxplot(drug_box)
plt.show()

NameError: name 'ramicane' is not defined

## Line and Scatter Plots

In [8]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin
s185 = new_merge_df.loc[new_merge_df["Mouse ID"] == "s185",:]

x_axis = s185["Timepoint"]
y_axis = s185["Tumor Volume (mm3)"]

plt.plot(x_axis, y_axis)

plt.xlabel("Timepoint")
plt.ylabel("Tumor Volume (mm3)")

NameError: name 'new_merge_df' is not defined

In [1]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen

weight = s185.iloc[:,"Weight"]
violent_crime_rate = crime_data.iloc[:,3]
plt.scatter(year,violent_crime_rate)
plt.xticks(year, rotation=90)
plt.xlabel('Year')
plt.ylabel('Violent Crime Rate')
plt.show()

NameError: name 's185' is not defined

## Correlation and Regression

In [18]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen


In [19]:
''

''