In [None]:
# Importing Data into Colab (Approach 1) -- Local Drive
# Upload a CSV file from your local hard drive through the browser file picker.

import io

import pandas as pd
from google.colab import files

# files.upload() opens the file picker and returns a dict that maps each
# uploaded file name to that file's raw bytes
uploaded = files.upload()

# A cancelled dialog yields an empty dict; stop here with a clear message
# instead of an opaque IndexError further down
if not uploaded:
    raise ValueError('No file was uploaded; re-run this cell and choose a CSV file.')

# Reading in the CSV file: the first uploaded file is used.
# To read a specific file instead, index by name: uploaded['<file name>']
first_file_name = next(iter(uploaded))
data = pd.read_csv(io.BytesIO(uploaded[first_file_name]))

# For large data sets, preview a few rows rather than printing the whole frame:
print(data.head())    # prints the first 5 rows (head() defaults to n=5)
print(data.head(10))  # prints the first n rows (here n=10)
print(data.tail(10))  # prints the last n rows (here n=10)

# Mean of column 'radius_mean' within each biopsy category
# (malignant 'M' and benign 'B'), selected with one boolean row mask each
is_malignant = data['diagnosis'] == 'M'
is_benign = data['diagnosis'] == 'B'
print(data.loc[is_malignant, 'radius_mean'].mean())
print(data.loc[is_benign, 'radius_mean'].mean())

# The same comparison in one step: group the rows by diagnosis class,
# then average 'radius_mean' inside every group
d_groups = data.groupby('diagnosis')
radius_by_d = d_groups['radius_mean']
mean_radius_by_d = radius_by_d.mean()

print(mean_radius_by_d)

In [None]:
# Importing Data into Colab (Approach 2) -- Google Drive
# You can mount your Google Drive manually using the 'Files' icon on the
# left sidebar and clicking the 'Mount Drive' icon

# Or, you can mount your Google Drive using the following commands:
from google.colab import drive
drive.mount('/content/drive')

# Read in your CSV data file from your Google Drive folder.
# The path is absolute (it starts at the mount point used above), so the read
# does not depend on the notebook's current working directory.
import pandas as pd
data = pd.read_csv('/content/drive/MyDrive/BMEn5910/Lecture01/breast-cancer.csv')

# Render every row and column of the table as one string and print it.
# Avoid this for large data sets: the output grows with the size of the table.
full_table = data.to_string()
print(full_table)

# Average 'radius_mean' within each diagnosis class:
# group the rows by 'diagnosis', select the column, then take the group means
diagnosis_groups = data.groupby('diagnosis')
radius_by_diagnosis = diagnosis_groups['radius_mean']
mean_radius_by_diagnosis = radius_by_diagnosis.mean()

print(mean_radius_by_diagnosis)

In [None]:
# Importing Data into Colab (Approach 3) -- GitHub Repository
# You can clone data from a GitHub repository

# Clone a GitHub Repository
!git clone https://github.com/BMEn-datascience/Lecture01.git

# Note that the repository is shown under the files icon in the left sidebar
# You can also view the folders using the command: !ls
!ls

# Load one of the .csv files from the cloned 'Lecture01' folder
import pandas as pd
csv_path = 'Lecture01/breast-cancer.csv'
data = pd.read_csv(csv_path)

# Violin plot of 'radius_mean', drawn separately for each value of
# 'diagnosis' (the 'M' and 'B' categories)
import seaborn as sns
sns.set_theme()  # apply seaborn's default theme before drawing
sns.violinplot(data=data, x='diagnosis', y='radius_mean')

# t-test on 'radius_mean' between the two categories:
# benign ('B') rows as the first sample, malignant ('M') rows as the second
from scipy import stats
benign_radius = data.loc[data['diagnosis'] == 'B', 'radius_mean']
malignant_radius = data.loc[data['diagnosis'] == 'M', 'radius_mean']
# Bare expression (not printed): the notebook shows the result as the cell
# output when this is the cell's last statement
stats.ttest_ind(benign_radius, malignant_radius)