# Pymaceuticals Inc.
---

### Analysis

- Add your analysis here.
 

In [6]:
# Dependencies and Setup
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st


In [7]:
# Study data files
mouse_metadata_path = "Data/Mouse_metadata.csv"
study_results_path = "Data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset
pyma_df= pd.merge(mouse_metadata, study_results)
# Display the data table for preview
pyma_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [8]:
pyma_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1893 entries, 0 to 1892
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Mouse ID            1893 non-null   object 
 1   Drug Regimen        1893 non-null   object 
 2   Sex                 1893 non-null   object 
 3   Age_months          1893 non-null   int64  
 4   Weight (g)          1893 non-null   int64  
 5   Timepoint           1893 non-null   int64  
 6   Tumor Volume (mm3)  1893 non-null   float64
 7   Metastatic Sites    1893 non-null   int64  
dtypes: float64(1), int64(4), object(3)
memory usage: 133.1+ KB


In [9]:
# #dropping
# pyma_df_dropped_na = pyma_df.dropna()
# pyma_df_dropped_na







# Checking the number of mice.



In [10]:
#check number of mice
number_of_mice = len(mouse_metadata)
number_of_mice


249

In [11]:
#Find the duplicates from each data set

#Mouse
duplicates = mouse_metadata[mouse_metadata.duplicated()]
print(duplicates)

#Study
duplicates = study_results[study_results.duplicated()]
print(duplicates)

Empty DataFrame
Columns: [Mouse ID, Drug Regimen, Sex, Age_months, Weight (g)]
Index: []
    Mouse ID  Timepoint  Tumor Volume (mm3)  Metastatic Sites
137     g989          0                45.0                 0


### Dropped data from study_results 

In [14]:
#Drop duplicate from mouse data
clean = study_results.drop(study_results.index[137])
clean
 
#Drop na if any left
clean.dropna()


#Number Of Mice after duplicate dropped
number_of_mice_clean = len(clean)
number_of_mice_clean

1892

In [15]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint.
duplicate_id_number = pyma_df[pyma_df.duplicated(["Mouse ID", "Timepoint"])]
duplicate_id_number

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
909,g989,Propriva,Female,21,26,0,45.0,0
911,g989,Propriva,Female,21,26,5,47.570392,0
913,g989,Propriva,Female,21,26,10,49.880528,0
915,g989,Propriva,Female,21,26,15,53.44202,0
917,g989,Propriva,Female,21,26,20,54.65765,1


In [16]:
# Optional: Get all the data for the duplicate mouse ID. 
print(duplicate_id_number)


    Mouse ID Drug Regimen     Sex  Age_months  Weight (g)  Timepoint  \
909     g989     Propriva  Female          21          26          0   
911     g989     Propriva  Female          21          26          5   
913     g989     Propriva  Female          21          26         10   
915     g989     Propriva  Female          21          26         15   
917     g989     Propriva  Female          21          26         20   

     Tumor Volume (mm3)  Metastatic Sites  
909           45.000000                 0  
911           47.570392                 0  
913           49.880528                 0  
915           53.442020                 0  
917           54.657650                 1  


In [17]:
# Create a clean DataFrame 
pyma_df_clean = pd.merge(clean, mouse_metadata)
pyma_df_clean.dropna()
pyma_df_clean


Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,b128,0,45.000000,0,Capomulin,Female,9,22
1,b128,5,45.651331,0,Capomulin,Female,9,22
2,b128,10,43.270852,0,Capomulin,Female,9,22
3,b128,15,43.784893,0,Capomulin,Female,9,22
4,b128,20,42.731552,0,Capomulin,Female,9,22
...,...,...,...,...,...,...,...,...
1887,m601,25,33.118756,1,Capomulin,Male,22,17
1888,m601,30,31.758275,1,Capomulin,Male,22,17
1889,m601,35,30.834357,1,Capomulin,Male,22,17
1890,m601,40,31.378045,1,Capomulin,Male,22,17


In [18]:
pyma_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1892 entries, 0 to 1891
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Mouse ID            1892 non-null   object 
 1   Timepoint           1892 non-null   int64  
 2   Tumor Volume (mm3)  1892 non-null   float64
 3   Metastatic Sites    1892 non-null   int64  
 4   Drug Regimen        1892 non-null   object 
 5   Sex                 1892 non-null   object 
 6   Age_months          1892 non-null   int64  
 7   Weight (g)          1892 non-null   int64  
dtypes: float64(1), int64(4), object(3)
memory usage: 133.0+ KB


## Summary Statistics

In [19]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
summary_statistics = pyma_df_clean.describe()
summary_statistics


Unnamed: 0,Timepoint,Tumor Volume (mm3),Metastatic Sites,Age_months,Weight (g)
count,1892.0,1892.0,1892.0,1892.0,1892.0
mean,19.582452,50.45126,1.022199,12.810254,25.662262
std,14.075984,8.896191,1.138032,7.189027,3.922652
min,0.0,22.050126,0.0,1.0,15.0
25%,5.0,45.0,0.0,7.0,25.0
50%,20.0,48.954697,1.0,13.0,27.0
75%,30.0,56.298917,2.0,20.0,29.0
max,45.0,78.567014,4.0,24.0,30.0


In [20]:
# Use groupby and summary statistical methods to calculate the following properties of each drug regimen:
#get the names of each drug regimen
drug_regimen = pyma_df_clean.groupby("Drug Regimen")
print(drug_regimen)

drug_regimen.describe()



<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fa24bca4090>


Unnamed: 0_level_0,Timepoint,Timepoint,Timepoint,Timepoint,Timepoint,Timepoint,Timepoint,Timepoint,Tumor Volume (mm3),Tumor Volume (mm3),...,Age_months,Age_months,Weight (g),Weight (g),Weight (g),Weight (g),Weight (g),Weight (g),Weight (g),Weight (g)
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Drug Regimen,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Capomulin,230.0,21.565217,14.384994,0.0,10.0,20.0,35.0,45.0,230.0,40.675741,...,20.0,24.0,230.0,19.965217,2.732404,15.0,17.0,20.5,22.0,25.0
Ceftamin,178.0,19.747191,14.283969,0.0,5.0,20.0,30.0,45.0,178.0,52.591172,...,20.0,24.0,178.0,27.398876,1.58146,25.0,26.0,28.0,28.0,30.0
Infubinol,178.0,18.174157,13.473473,0.0,5.0,15.0,30.0,45.0,178.0,52.884795,...,23.0,24.0,178.0,27.196629,2.18381,23.0,25.0,27.0,29.0,30.0
Ketapril,188.0,19.707447,14.029935,0.0,5.0,20.0,30.0,45.0,188.0,55.235638,...,19.0,24.0,188.0,27.861702,1.841884,25.0,26.0,28.0,30.0,30.0
Naftisol,186.0,19.623656,14.184814,0.0,5.0,20.0,30.0,45.0,186.0,54.331565,...,19.0,23.0,186.0,27.166667,1.499249,25.0,26.0,27.0,28.0,30.0
Placebo,181.0,18.674033,13.890798,0.0,5.0,15.0,30.0,45.0,181.0,54.033581,...,17.0,21.0,181.0,27.928177,1.837973,25.0,27.0,28.0,30.0,30.0
Propriva,160.0,16.96875,13.447977,0.0,5.0,15.0,25.0,45.0,160.0,52.368318,...,21.0,24.0,160.0,27.05,1.674072,25.0,26.0,26.0,29.0,30.0
Ramicane,228.0,21.425439,14.27572,0.0,10.0,20.0,35.0,45.0,228.0,40.216745,...,18.0,23.0,228.0,19.679825,3.235014,16.0,17.0,19.0,22.0,25.0
Stelasyn,181.0,19.226519,13.84271,0.0,5.0,20.0,30.0,45.0,181.0,54.233149,...,21.0,23.0,181.0,27.856354,1.643616,25.0,27.0,28.0,29.0,30.0
Zoniferol,182.0,19.368132,14.384679,0.0,5.0,15.0,30.0,45.0,182.0,53.236507,...,16.0,24.0,182.0,27.692308,1.419612,25.0,27.0,28.0,29.0,30.0


In [21]:
# Use groupby and summary statistical methods to calculate the following properties of each drug regimen:

# Group the data by the "Drug Regimen" column
grouped_data = pyma_df_clean.groupby("Drug Regimen")

# Calculate the mean, median, min, max, and standard deviation for each drug regimen
mean = grouped_data["Tumor Volume (mm3)"].mean()
median = grouped_data["Tumor Volume (mm3)"].median()
min = grouped_data["Tumor Volume (mm3)"].min()
max = grouped_data["Tumor Volume (mm3)"].max()
std = grouped_data["Tumor Volume (mm3)"].std()

# Combine the results into a single DataFrame
results = pd.concat([mean, median, min, max, std], axis=1)

# Rename the columns for clarity
results.columns = ["Mean", "Median", "Min", "Max", "Std Dev"]

# Display the results
print(results)


                   Mean     Median        Min        Max   Std Dev
Drug Regimen                                                      
Capomulin     40.675741  41.557809  23.343598  48.158209  4.994774
Ceftamin      52.591172  51.776157  45.000000  68.923185  6.268188
Infubinol     52.884795  51.820584  36.321346  72.226731  6.567243
Ketapril      55.235638  53.698743  45.000000  78.567014  8.279709
Naftisol      54.331565  52.509285  45.000000  76.668817  8.134708
Placebo       54.033581  52.288934  45.000000  73.212939  7.821003
Propriva      52.368318  50.909965  45.000000  72.455421  6.502160
Ramicane      40.216745  40.673236  22.050126  47.622816  4.846308
Stelasyn      54.233149  52.431737  45.000000  75.123690  7.710419
Zoniferol     53.236507  51.818479  45.000000  73.324432  6.966589


In [22]:
# mean, median, variance, standard deviation, and SEM of the tumor volume.
grouped = pyma_df_clean.groupby("Drug Regimen")
tumor_vol = grouped["T_)"]
summary_stats = tumor_vol.describe()
summary_stats


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Capomulin,230.0,40.675741,4.994774,23.343598,37.685933,41.557809,45.0,48.158209
Ceftamin,178.0,52.591172,6.268188,45.0,47.208427,51.776157,56.801438,68.923185
Infubinol,178.0,52.884795,6.567243,36.321346,47.312353,51.820584,57.314444,72.226731
Ketapril,188.0,55.235638,8.279709,45.0,48.232987,53.698743,60.870951,78.567014
Naftisol,186.0,54.331565,8.134708,45.0,47.285874,52.509285,59.963034,76.668817
Placebo,181.0,54.033581,7.821003,45.0,47.459053,52.288934,59.916934,73.212939
Propriva,160.0,52.368318,6.50216,45.0,47.107256,50.909965,56.259803,72.455421
Ramicane,228.0,40.216745,4.846308,22.050126,36.674635,40.673236,45.0,47.622816
Stelasyn,181.0,54.233149,7.710419,45.0,48.047139,52.431737,58.719297,75.12369
Zoniferol,182.0,53.236507,6.966589,45.0,47.337876,51.818479,57.954259,73.324432


In [24]:
summary_stats = pd.DataFrame({
    "count" : tumor_vol.count(),
    "mean": tumor_vol.mean(),
    "std deviation": tumor_vol.std(),
    "median": tumor_vol.median(),
    "variance": tumor_vol.var(),
    "SEM": tumor_vol.sem()
})
summary_stats

Unnamed: 0_level_0,count,mean,median,variance,std deviation,SEM
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Capomulin,230,40.675741,41.557809,24.947764,4.994774,0.329346
Ceftamin,178,52.591172,51.776157,39.290177,6.268188,0.469821
Infubinol,178,52.884795,51.820584,43.128684,6.567243,0.492236
Ketapril,188,55.235638,53.698743,68.553577,8.279709,0.60386
Naftisol,186,54.331565,52.509285,66.173479,8.134708,0.596466
Placebo,181,54.033581,52.288934,61.168083,7.821003,0.581331
Propriva,160,52.368318,50.909965,42.27809,6.50216,0.514041
Ramicane,228,40.216745,40.673236,23.486704,4.846308,0.320955
Stelasyn,181,54.233149,52.431737,59.450562,7.710419,0.573111
Zoniferol,182,53.236507,51.818479,48.533355,6.966589,0.516398


In [None]:
# Using the aggregation method, produce the same summary statistics in a single line

pyma_df_clean["Tumor Volume (mm3)"].agg(["mean", "median", "var", "std", "sem"])


## Bar and Pie Charts

In [None]:
# Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using Pandas.


total_timepoints = pyma_df_clean.groupby("Drug Regimen").count()["Timepoint"].plot(kind='bar')
plt.xlabel("Drug Regimen")
plt.ylabel("Number of Timepoints")
plt.title("Total Number of Timepoints for All Mice Tested for Each Drug Regimen")
plt.show()


In [None]:
# Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using pyplot.

# Group the data by "Drug Regimen" and count the number of timepoints
drug_regimen = pyma_df_clean.groupby("Drug Regimen").count()["Timepoint"]

# Get the x-axis labels (drug regimens)
x_labels = np.array(drug_regimen.index)

# Get the y-axis values (number of timepoints)
y_values = np.array(drug_regimen.values)

# Create the bar plot
plt.bar(x_labels, y_values)
plt.xlabel("Drug Regimen")
plt.ylabel("Number of Timepoints")
plt.title("Total Number of Timepoints for All Mice Tested for Each Drug Regimen")
plt.xticks(rotation=45)
plt.show()


In [None]:
# Generate a pie plot showing the distribution of female versus male mice using Pandas

# Group the data by "Sex" and count the number of mice
sex_group = pyma_df_clean.groupby("Sex").count()["Mouse ID"]

# Plot the pie chart
sex_group.plot(kind='pie', autopct='%1.1f%%')
plt.axis('equal')  # Make the plot an equal circle
plt.legend(title="Sex")
plt.title("Distribution of Female vs Male Mice")
plt.show()



In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot

# Group the data by "Sex" and count the number of mice
sex_group = pyma_df_clean.groupby("Sex").count()["Mouse ID"]

# Get the labels and values for the pie chart
labels = np.array(sex_group.index)
values = np.array(sex_group.values)

# Plot the pie chart
plt.pie(values, labels=labels, autopct='%1.1f%%')
plt.axis('equal')  # Make the plot an equal circle
plt.legend(title="Sex")
plt.title("Distribution of Female vs Male Mice")
plt.show()





## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens: 

# Start by getting the last (greatest) timepoint for each mouse
max_timepoint = pyma_df_clean.groupby("Mouse ID").max()["Timepoint"]
max_timepoint.rename('Timepoint'== 'Max Timepoint', inplace=True)
max_timepoint


# # # Merge the max_timepoint DataFrame with the original DataFrame to get the tumor volume at the last timepoint
# pyma_df_clean['Max Timepoint'] = pyma_df_clean.groupby("Mouse ID")['Timepoint'].transform('max')
# pyma_df_clean



#pd.merge(max_timepoint)(pyma_df_clean)

#  Filter the merged DataFrame to include only the desired regimens

# regimens = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]
# final_tumor_df = merged_df[merged_df["Drug Regimen"].isin(regimens)]

# # Merge this group df with the original DataFrame to get the tumor volume at the last timepoint

# final_tumor_df

# # Get the final tumor volume for each mouse
# # final_tumor_volume = final_tumor_df.groupby("["Timepoint"]
# # final_tumor_volume


In [None]:
# Put treatments into a list for for loop (and later for plot labels)
regimens = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

# Create empty list to fill with tumor vol data (for plotting)
tumor_vol_data = []

# Calculate the IQR and quantitatively determine if there are any potential outliers.
tumor_vol_data = pyma_df_clean["Tumor Volume (mm3)"]
quartiles = tumor_vol_data.describe()
lower_q = quartiles["25%"]
upper_q = quartiles["75%"]
iqr = upper_q - lower_q


print(f"The lower quartile occupancy is: {lower_q}")
print(f"The upper quartile occupancy is: {upper_q}")
print(f"The interquartile range of occupancy is: {iqr}")
#print(f"The median of occupancy is {quartiles[0.5]}")


lower_bound = q1 - (1.5 * iqr)
upper_bound = q3 + (1.5 * iqr)

print(f"Values below {lower_bound} could be outliers.")
print(f"Values below {upper_bound} could be outliers.")


for regimen in regimens:
    regimen_df = pyma_df[pyma_df["Drug Regimen"] == regimen]
    regimen_tumor_volumes = regimen_df["Tumor Volume (mm3)"]
    
# do something with the tumor volumes




 # Determine outliers using upper and lower bounds\
outliers = pyma_df[(pyma_df["Tumor Volume (mm3)"] < lower_bound) | 
                   (pyma_df["Tumor Volume (mm3)"] > upper_bound)]


In [None]:
# Generate a box plot that shows the distrubution of the tumor volume for each treatment group.

fig1, ax1 = plt.subplots()
ax1.set_title('Tumor Volume by Treatment Regimen')
ax1.set_ylabel('Tumor Volume (mm3)')

data = [pyma_df_clean[pyma_df_clean
                      
                      ['Drug Regimen'] == regimen]['Tumor Volume (mm3)'] for regimen in regimens]
ax1.boxplot(data, labels=regimens)
plt.show()


## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin

# choose the first mouse treated with Capomulin
mouse = pyma_df_clean[pyma_df_clean['Drug Regimen'] == 'Capomulin'].iloc[0,0] 
# get data for the chosen mouse
mouse_df = pyma_df_clean[pyma_df_clean['Mouse ID'] == mouse] 

plt.plot(mouse_df['Timepoint'], mouse_df['Tumor Volume (mm3)'], '-o')
plt.title('Tumor Volume over Time for Mouse ' + mouse)
plt.xlabel('Timepoint (days)')
plt.ylabel('Tumor Volume (mm3)')
plt.xticks(np.arange(0, mouse_df['Timepoint'].max()+5, 5))
plt.show()


In [None]:

# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen


capomulin_df = pyma_df_clean[pyma_df_clean['Drug Regimen'] == 'Capomulin']
plt.scatter(capomulin_df['Tumor Volume (mm3)'], capomulin_df['Weight (g)'])
plt.title("Tumor Volume vs Weight of Mouse on Capomulin Regimen")
plt.xlabel('Avg Tumor Volume (mm3)')
plt.ylabel('Mouse Weight (g)')
plt.show()



In [None]:
capomulin_df

## Correlation and Regression

In [None]:
# Filter the data to only include the Capomulin regimen and find the mean
mean_tumor_volume = capomulin_df.groupby(['Mouse ID'])["Tumor Volume (mm3)"].mean().reset_index()

# Add the average tumor volume to the dataframe
capomulin_final = pd.merge(capomulin_df, mean_tumor_volume, on='Mouse ID', how='left')

# Compute the correlation between mouse weight and average tumor volume
correlation = capomulin_final["Weight (g)"].corr(capomulin_final["Tumor Volume (mm3)_y"])

print("The correlation coefficient between mouse weight and average tumor volume is:", correlation)


In [None]:
capomulin_final = capomulin_df.loc['Mean Tumor Volume (mm3)'] = capomulin_df.groupby(['Mouse ID'])["Tumor Volume (mm3)"].transform('mean')
result = capomulin_df.dropna()
#Final DataFrame
result


In [None]:
pyma_df.dropna()

In [None]:
# Plot the relationship between mouse weight and average tumor volume

# Plot the relationship between mouse weight and average tumor volume
plt.plot(capomulin_final['Weight (g)'], capomulin_final['Mean Tumor Volume (mm3)'], 'o')
plt.xlabel('Weight (g)')
plt.ylabel('Mean Tumor Volume (mm3)')
plt.title('Mouse weight vs. average tumor volume for the Capomulin regimen')
plt.show()

# Fit a linear regression model to the data
slope, intercept = np.polyfit(capomulin_final['Weight (g)'], capomulin_final['Mean Tumor Volume (mm3)'], 1)









# plt.plot(capomulin['Weight (g)'], capomulin['Mean Tumor Volume (mm3)'], '0')
# plt.xlabel('Weight (g)')
# plt.ylabel('Mean Tumor Volume (mm3)')
# plt.title('Mouse weight vs. average tumor volume for the Capomulin regimen')
# plt.show()

# # Fit a linear regression model to the data
# slope, intercept = np.polyfit(capomulin['Weight (g)'], capomulin['Mean Tumor Volume (mm3)'], 1)

# Create a function for the line of best fit
def line_of_best_fit(x):
  return slope * x + intercept

# Plot the relationship between mouse weight and average tumor volume with the line of best fit
plt.plot(capomulin_final['Weight (g)'], capomulin_final['Mean Tumor Volume (mm3)'],'o')
x = np.linspace(capomulin_final['Weight (g)'].min(), capomulin_final['Weight (g)'].max(), 100)
y = line_of_best_fit(x)
plt.plot(x, y, color='red')
plt.xlabel('Weight (g)')
plt.ylabel('Mean Tumor Volume (mm3)')
plt.title('Mouse weight vs. average tumor volume for the Capomulin regimen with line of best fit')
plt.show()


In [None]:
#linear regression
slope, intercept = np.polyfit(capomulin['Weight (g)'], capomulin['Mean Tumor Volume (mm3)'], 1)

#Create function for line of best fit
def line_of_best_fit(x):
    return slope * x + intercept


#Plot the relationship between mouse weight and average tumor volume with the line of best fit


plt.scatter(x=capomulin['Weight (g)'], y=capomulin['Mean Tumor Volume (mm3)'])
x = np.linspace(capomulin['Weight (g)'].min(), capomulin['Weight (g)'].max(), 100)
y = line_of_best_fit(x)
plt.plot(x, y, color='red')
plt.xlabel('Weight (g)')
plt.ylabel('Mean Tumor Volume (mm3)')
plt.title('Mouse weight vs. average tumor volume for the Capomulin regimen with line of best fit')
plt.show()