Some typical NumPy 'random' functions:

In [None]:
import numpy as np

# Uniform random floats in the half-open interval [0, 1): 0 can occur, 1 cannot
print(np.random.rand(3, 2))  # 3 x 2 matrix of uniform random floats

# Random floats drawn from the standard normal distribution (mean = 0, std = 1)
print(np.random.randn(5))  # 5 values

# Random integers within a range: the lower bound is inclusive, the upper bound is exclusive
print(np.random.randint(1, 100, 10))  # 10 integers, each from 1 to 99

# Randomly pick elements from a given list (sampling is with replacement by default, so repeats are possible)
print(np.random.choice([1, 2, 3, 4, 5, 6, 7, 8, 9], size=3))  # 3 picks from the list

# Random sample from a normal distribution with a chosen mean (loc) and standard deviation (scale)
print(np.random.normal(loc=0, scale=1, size=10))  # 10 values with mean 0 and std 1

# Seed the global random number generator so that the numbers generated AFTER this call
# are the same on every run. The seed value 42 is arbitrary; any integer works.
# seed() returns None, so it is called on its own instead of being wrapped in print().
np.random.seed(42)

AIM #1: Generate a very large dataset
1. Generate a dataset of 1 million random data items between 1 and 100 using only pandas
2. Generate a dataset of 1 million random data items between 1 and 100 using only NumPy
3. Calculate the time it takes for both the above operations. 
    3.1. Import the 'time' module, and use the time() function to calculate current time
    3.2. Which one is faster and why?

In [None]:
import pandas as pd
import numpy as np
import time

# Start time for Pandas
start_pandas = time.time()

# Generate random data using the pandas API only: draw 1,000,000 values, with
# replacement, from a Series holding the integers 1..100. Going through pandas
# (rather than wrapping a NumPy array) is what makes the timing comparison with
# the NumPy version meaningful.
pandas_data = (
    pd.Series(range(1, 101))
    .sample(n=1_000_000, replace=True)
    .reset_index(drop=True)  # sample() keeps the source labels; restore a 0..n-1 index
)

# End time for Pandas
end_pandas = time.time()

# Time taken for Pandas (seconds)
time_pandas = end_pandas - start_pandas
print("Time taken using Pandas:", time_pandas)

In [None]:
# NumPy benchmark: time one vectorised draw of a million integers.
start_numpy = time.time()
numpy_data = np.random.randint(low=1, high=101, size=1_000_000)  # high is exclusive, so values are 1..100
end_numpy = time.time()

# Elapsed wall-clock time for the NumPy draw, in seconds
time_numpy = end_numpy - start_numpy
print("Time taken using NumPy:", time_numpy)

# NumPy is usually faster than Pandas in numerical operations because it works directly on low-level, homogeneous arrays, while Pandas provides additional functionality and flexibility (indexes, labels, mixed column types), which adds some overhead.

AIM #2: Basic statistics
For the given dataset on sleep health and lifestyle, do the following
1. Using only pandas, load the dataset, calculate mean 'Sleep Duration', 'Systolic Blood Pressure', 'Diastolic Blood Pressure', 'Heart Rate' and 'Daily Steps'.
2. Do the same as in Step 1 using only NumPy
3. Using only pandas, first calculate correlation (across only the numerical variables), and then separate correlation between...
    Sleep duration and Age
    Sleep duration and Heart rate
    Sleep duration and Daily steps
4. Using only NumPy, do the same as Step 3
5. Using pandas only, calculate standard deviation for 'Sleep Duration'. 
6. Using NumPy only, calculate standard deviation for 'Sleep Duration'. 
7. Calculate the time difference between using pandas and NumPy, right from the step of loading the dataset to the final standard deviation step. 
    7.1. Which one is faster and why?

In [None]:
import numpy as np

# Load the CSV file as a 2-D float array in NumPy. The header row is skipped;
# cells that cannot be parsed as numbers (e.g. text columns) are read as NaN.
data = np.genfromtxt('sleep_health.csv', delimiter=',', skip_header=1, dtype=float, encoding='utf-8')

# Create separate arrays for the individual columns
# NOTE(review): the indices assume the column order of sleep_health.csv — confirm against the file header.
sleep_duration = data[:, 4]
systolic_bp = data[:, 9]
diastolic_bp = data[:, 10]
heart_rate = data[:, 11]
daily_steps = data[:, 12]
age = data[:, 2]

# Step 2: mean of each requested column using np.mean()
print("NumPy Means:")
print("Sleep Duration:", np.mean(sleep_duration))
print("Systolic Blood Pressure:", np.mean(systolic_bp))
print("Diastolic Blood Pressure:", np.mean(diastolic_bp))
print("Heart Rate:", np.mean(heart_rate))
print("Daily Steps:", np.mean(daily_steps))

# Step 4: np.corrcoef() returns a 2 x 2 correlation matrix for two inputs;
# the off-diagonal entry [0, 1] is the correlation between them.
corr_sleep_age = np.corrcoef(sleep_duration, age)[0, 1]
corr_sleep_heart_rate = np.corrcoef(sleep_duration, heart_rate)[0, 1]
corr_sleep_daily_steps = np.corrcoef(sleep_duration, daily_steps)[0, 1]

print("NumPy Correlations:")
print("Sleep Duration and Age:", corr_sleep_age)
print("Sleep Duration and Heart Rate:", corr_sleep_heart_rate)
print("Sleep Duration and Daily Steps:", corr_sleep_daily_steps)

# Step 6: standard deviation with np.std().
# np.std() defaults to the population formula (ddof=0), whereas pandas'
# Series.std() defaults to the sample formula (ddof=1), so the two results
# differ slightly unless ddof is set explicitly on one side.
std_sleep_duration = np.std(sleep_duration)
print("NumPy Standard Deviation of Sleep Duration:", std_sleep_duration)

AIM #3: Use suitable plots to visualize the data

1. Using only pandas (and matplotlib/seaborn if necessary) plot the distribution for
    1.1. Age
    1.2. Sleep Duration
    1.3. Quality of Sleep
    1.4. Physical Activity Level
    1.5. Stress Level
    1.6. Heart Rate
2. Using only NumPy, do the same as Step 1. You will need matplotlib for this
3. Using only pandas, use the appropriate plot to
    3.1. See the distribution of 'Sleep Duration' based on 'Quality of Sleep'
    3.2. See the distribution of 'Sleep Duration' based on 'Stress Level'
    3.3. See the distribution of 'Sleep Duration' based on 'Physical Activity Level'
    3.4. See the distribution of 'Sleep Duration' based on 'Occupation'
    3.5. See the distribution of 'Sleep Duration' based on 'BMI'
4. Using only NumPy, do the same as Step 3. You will need matplotlib for this
5. Using only pandas, use a suitable plot to see the relation between
    5.1. Age and Sleep Duration
    5.2. Sleep Duration and Heart Rate
    5.3. Heart Rate and Daily Steps
    5.4. Sleep Duration and Daily Steps
6. Using only NumPy, do the same as Step 5. You will need matplotlib for this 
7. Find the time difference between plotting using only pandas, and plotting using NumPy

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time

# Read the sleep-health CSV into a DataFrame
df = pd.read_csv('sleep_health.csv')

# Steps 1.1-1.6: one histogram (30 bins, with a KDE curve) per column, each in its own figure
for column in (
    'Age',
    'Sleep Duration',
    'Quality of Sleep',
    'Physical Activity Level',
    'Stress Level',
    'Heart Rate',
):
    plt.figure(figsize=(10, 6))
    sns.histplot(df[column], bins=30, kde=True)
    plt.title(f'{column} Distribution')
    plt.show()

In [None]:
import numpy as np

# Read the CSV (header row skipped) into a float array
data = np.genfromtxt('sleep_health.csv', delimiter=',', skip_header=1)

# Display names of the columns to plot, and the array column index of each one
variables = ['Age', 'Sleep Duration', 'Quality of Sleep', 'Physical Activity Level', 'Stress Level', 'Heart Rate']
indices = [2, 4, 5, 6, 7, 11]

# One 30-bin histogram per column, each in its own figure
for col_idx, col_name in zip(indices, variables):
    plt.figure(figsize=(10, 6))
    column_values = data[:, col_idx]
    plt.hist(column_values, bins=30, alpha=0.7, label=col_name)
    plt.title(f'{col_name} Distribution')
    plt.xlabel(col_name)
    plt.ylabel('Frequency')
    plt.legend()
    plt.show()

In [None]:
# Steps 3.1-3.5: box plots of 'Sleep Duration' split by a grouping column.
# Each entry is (grouping column, title suffix, figure size, x-tick rotation or None).
box_plot_specs = [
    ('Quality of Sleep', 'Quality of Sleep', (10, 6), None),
    ('Stress Level', 'Stress Level', (10, 6), None),
    ('Physical Activity Level', 'Physical Activity Level', (10, 6), None),
    ('Occupation', 'Occupation', (14, 6), 45),  # wider figure and slanted tick labels
    ('BMI Category', 'BMI', (10, 6), None),
]

for group_column, title_suffix, fig_size, tick_rotation in box_plot_specs:
    plt.figure(figsize=fig_size)
    sns.boxplot(x=group_column, y='Sleep Duration', data=df)
    if tick_rotation is not None:
        plt.xticks(rotation=tick_rotation)
    plt.title(f'Sleep Duration based on {title_suffix}')
    plt.show()

In [None]:
# Group data for box plots based on categories
def plot_box(variable_idx, group_idx, group_labels, title):
    """Show a box plot of one column of ``data``, split by the values of another column.

    Reads the module-level array ``data``. The values in column ``variable_idx``
    are partitioned by each distinct value found in column ``group_idx`` (in the
    sorted order returned by ``np.unique``), and one box is drawn per partition.

    Parameters
    ----------
    variable_idx : int
        Column index of the values to summarise.
    group_idx : int
        Column index of the grouping values.
    group_labels : sequence
        Tick labels, one per distinct group value, in the same sorted order.
    title : str
        Figure title.
    """
    variable = data[:, variable_idx]
    group = data[:, group_idx]
    variable_list = [variable[group == label] for label in np.unique(group)]
    plt.figure(figsize=(10, 6))
    plt.boxplot(variable_list)
    # Boxes are drawn at x = 1..N by default. Setting the tick labels here,
    # instead of through boxplot's `labels=` keyword (deprecated since
    # Matplotlib 3.9), works on old and new Matplotlib versions alike.
    plt.xticks(np.arange(1, len(variable_list) + 1), group_labels)
    plt.title(title)
    plt.show()

# Steps 4.1-4.3: box plots of Sleep Duration (column 4) grouped by another column.
# Each entry is (grouping column index, grouping column name).
for group_col_idx, group_col_name in (
    (5, 'Quality of Sleep'),
    (7, 'Stress Level'),
    (6, 'Physical Activity Level'),
):
    distinct_values = np.unique(data[:, group_col_idx])  # used as the tick labels
    plot_box(4, group_col_idx, distinct_values, f'Sleep Duration based on {group_col_name}')

In [None]:
# Steps 5.1-5.4: scatter plots for each (x column, y column) pair, one figure per pair
scatter_pairs = [
    ('Age', 'Sleep Duration'),
    ('Sleep Duration', 'Heart Rate'),
    ('Heart Rate', 'Daily Steps'),
    ('Sleep Duration', 'Daily Steps'),
]

for x_column, y_column in scatter_pairs:
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x=x_column, y=y_column, data=df)
    plt.title(f'{x_column} vs {y_column}')
    plt.show()

In [None]:
# Steps 6.1-6.4: scatter plots straight from the NumPy array.
# Each entry is (x column index, x axis label, y column index, y axis label).
numpy_scatter_specs = [
    (2, 'Age', 4, 'Sleep Duration'),
    (4, 'Sleep Duration', 11, 'Heart Rate'),
    (11, 'Heart Rate', 12, 'Daily Steps'),
    (4, 'Sleep Duration', 12, 'Daily Steps'),
]

for x_idx, x_label, y_idx, y_label in numpy_scatter_specs:
    plt.figure(figsize=(10, 6))
    plt.scatter(data[:, x_idx], data[:, y_idx], alpha=0.5)
    plt.title(f'{x_label} vs {y_label}')
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.show()

In [None]:
# Columns drawn by both approaches, as (pandas column name, NumPy column index).
# The indices mirror the ones used for the NumPy histograms earlier in this
# notebook. Both loops draw the same histograms, so the only difference being
# timed is the plotting route (pandas' .plot API vs. matplotlib on raw arrays).
timed_columns = [
    ('Age', 2),
    ('Sleep Duration', 4),
    ('Quality of Sleep', 5),
    ('Physical Activity Level', 6),
    ('Stress Level', 7),
    ('Heart Rate', 11),
]

# Timing using Pandas
start_pandas = time.time()
for column_name, _ in timed_columns:
    fig, ax = plt.subplots(figsize=(10, 6))
    df[column_name].plot(kind='hist', bins=30, ax=ax, title=f'{column_name} Distribution')
    fig.canvas.draw()  # force rendering so it is part of the measurement
    plt.close(fig)     # these figures exist only to be timed; release them
end_pandas = time.time()
time_pandas = end_pandas - start_pandas

# Timing using NumPy
start_numpy = time.time()
for column_name, column_idx in timed_columns:
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.hist(data[:, column_idx], bins=30)
    ax.set_title(f'{column_name} Distribution')
    fig.canvas.draw()  # force rendering so it is part of the measurement
    plt.close(fig)
end_numpy = time.time()
time_numpy = end_numpy - start_numpy

print("Time taken using Pandas:", time_pandas)
print("Time taken using NumPy:", time_numpy)

# Pandas offers more functionality and convenience for data manipulation, which can add overhead compared to NumPy's lower-level operations.

AIM #4: Other possible plotting

1. Think of other possible plots to show some interesting distribution and relations. Do this using both pandas and NumPy



In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset
df = pd.read_csv('sleep_health.csv')

# Pairplot to visualize pairwise relationships between four of the numeric columns
sns.pairplot(df, vars=['Sleep Duration', 'Quality of Sleep', 'Physical Activity Level', 'Stress Level'])
plt.suptitle('Pairwise Relationships', y=1.02)  # y > 1 lifts the title above the grid of subplots
plt.show()

# Heatmap for correlation matrix
# Keep only the numeric columns before correlating: since pandas 2.0,
# DataFrame.corr() raises a ValueError when the frame still contains
# non-numeric columns (this notebook uses e.g. 'Occupation' as a category).
correlation_matrix = df.select_dtypes(include='number').corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Heatmap')
plt.show()

# Violin plot for Sleep Duration across BMI categories
plt.figure(figsize=(10, 6))
sns.violinplot(x='BMI Category', y='Sleep Duration', data=df)
plt.title('Sleep Duration across BMI Categories')
plt.show()

# Line plot for trends in Heart Rate by Age
plt.figure(figsize=(10, 6))
sns.lineplot(x='Age', y='Heart Rate', data=df)
plt.title('Trend of Heart Rate by Age')
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Load data using NumPy (header row skipped)
data = np.genfromtxt('sleep_health.csv', delimiter=',', skip_header=1)

# 2D Histogram for Sleep Duration and Physical Activity Level
plt.figure(figsize=(10, 6))
plt.hist2d(data[:, 4], data[:, 6], bins=[30, 30], cmap='Blues')
plt.colorbar(label='Frequency')
plt.xlabel('Sleep Duration')
plt.ylabel('Physical Activity Level')
plt.title('2D Histogram of Sleep Duration vs Physical Activity Level')
plt.show()

# Box plot for Heart Rate by Age group
# np.digitize maps each age to a bin number: 1 for [20, 30), 2 for [30, 40),
# ..., 5 for 60 and above. Ages below 20 get bin 0 and are not plotted.
age_group_labels = ['20-30', '30-40', '40-50', '50-60', '60+']
age_groups = np.digitize(data[:, 2], bins=[20, 30, 40, 50, 60])
hr_by_age_group = [data[age_groups == i, 11] for i in range(1, 6)]
plt.figure(figsize=(10, 6))
plt.boxplot(hr_by_age_group)
# Boxes sit at x = 1..N by default. Labelling the ticks here, instead of
# through boxplot's `labels=` keyword (deprecated since Matplotlib 3.9),
# works on old and new Matplotlib versions alike.
plt.xticks(np.arange(1, len(hr_by_age_group) + 1), age_group_labels)
plt.title('Heart Rate by Age Group')
plt.xlabel('Age Group')
plt.ylabel('Heart Rate')
plt.show()

# Scatter plot for Sleep Duration and Stress Level
plt.figure(figsize=(10, 6))
plt.scatter(data[:, 4], data[:, 7], alpha=0.5)
plt.title('Sleep Duration vs Stress Level')
plt.xlabel('Sleep Duration')
plt.ylabel('Stress Level')
plt.show()