Some typical NumPy 'random' functions:

In [None]:
import numpy as np

# Generate random float values drawn uniformly from [0, 1) (1 is excluded)
print(np.random.rand(3, 2))  # 3 x 2 matrix of uniform random floats

# Generate random float values from the standard normal distribution (mean = 0, std = 1)
print(np.random.randn(5))  # 5 random floats

# Generate random integer values within a range
print(np.random.randint(1, 100, 10))  # 10 random integers from 1 (inclusive) up to 100 (exclusive)

# Randomly select elements from a given list
print(np.random.choice([1, 2, 3, 4, 5, 6, 7, 8, 9], size=3))  # Three values picked from the list (with replacement by default)

# Generate a random sample from a normal distribution with a specified mean and standard deviation
print(np.random.normal(loc=0, scale=1, size=10))  # 10 values; loc is the mean (0) and scale is the standard deviation (1)

# Set a seed so that the random numbers generated AFTER this call are the same on every run.
# seed() returns None, so it is called for its side effect only and not printed.
# The seed value 42 is arbitrary; any integer works.
np.random.seed(42)

AIM #1: Generate a very large dataset
1. Generate a dataset of 1 million random data items between 1 and 100 using only pandas
2. Generate a dataset of 1 million random data items between 1 and 100 using only NumPy
3. Calculate the time it takes for both the above operations. 
    3.1. Import the 'time' module, and use the time() function to calculate current time
    3.2. Which one is faster and why?

In [None]:
#code for AIM #1 .1
import pandas as pd
import numpy as np
import time

# Time the creation of a single-column DataFrame holding one million
# random integers in the range 1..100 (the upper bound 101 is exclusive).
start_time = time.time()
random_column = np.random.randint(1, 101, size=(1000000, 1))
df = pd.DataFrame(random_column, columns=['RandomNumbers'])
pandas_time = time.time() - start_time

print(f"Pandas time: {pandas_time} seconds")



In [None]:
#code for AIM #1 .2
# Time the creation of a flat NumPy array holding one million
# random integers in the range 1..100 (the upper bound 101 is exclusive).
n_items = 1000000
start_time = time.time()
np_array = np.random.randint(1, 101, size=n_items)
numpy_time = time.time() - start_time

print(f"NumPy time: {numpy_time} seconds")


In [None]:
#AIM #1 .3
# Generally, NumPy is faster than pandas here. Both cells draw the numbers with
# np.random.randint (implemented in C), but the pandas cell additionally wraps the
# array in a DataFrame, and building the index and column structures adds overhead.

AIM #2: Basic statistics
For the given dataset on sleep health and lifestyle, do the following
1. Using only pandas, load the dataset, calculate mean 'Sleep Duration', 'Systolic Blood Pressure', 'Diastolic Blood Pressure', 'Heart Rate' and 'Daily Steps'.
2. Do the same as in Step 1 using only NumPy
3. Using only pandas, first calculate correlation (across only the numerical variables), and then separate correlation between...
    Sleep duration and Age
    Sleep duration and Heart rate
    Sleep duration and Daily steps
4. Using only NumPy, do the same as Step 3
5. Using pandas only, calculate standard deviation for 'Sleep Duration'. 
6. Using NumPy only, calculate standard deviation for 'Sleep Duration'. 
7. Calculate the time difference between using pandas and NumPy, right from the step of loading the dataset to the final standard deviation step. 
    7.1. Which one is faster and why?

In [None]:
#code for AIM #2 .1
import pandas as pd
import time

# Load the dataset; the timer covers both the CSV read and the mean calculation
start_time = time.time()
df = pd.read_csv('sleep_health.csv')

# Column-wise means of the five requested measurements
mean_columns = [
    'Sleep Duration',
    'Systolic Blood Pressure',
    'Diastolic Blood Pressure',
    'Heart Rate',
    'Daily Steps',
]
mean_values = df[mean_columns].mean()
pandas_mean_time = time.time() - start_time

print("Pandas Mean:\n", mean_values)
print(f"Pandas Mean calculation time: {pandas_mean_time} seconds")



In [None]:
#code for AIM #2 .2
import numpy as np

# Load dataset into a structured array; the timer covers both the read and the mean calculation.
# names=True takes the field names from the header row, replacing spaces with underscores
# ('Sleep Duration' -> 'Sleep_Duration'). skip_header must NOT be set as well: it would drop
# the header line first, and the first data row would then be used as the field names.
start_time = time.time()
data = np.genfromtxt('sleep_health.csv', delimiter=',', dtype=None, encoding='utf-8', names=True)

# Calculate means: stack the five columns as rows and average along each row (axis=1)
mean_values_np = np.mean([data['Sleep_Duration'], data['Systolic_Blood_Pressure'], data['Diastolic_Blood_Pressure'], data['Heart_Rate'], data['Daily_Steps']], axis=1)
numpy_mean_time = time.time() - start_time
print("NumPy Mean:\n", mean_values_np)
print(f"NumPy Mean calculation time: {numpy_mean_time} seconds")


In [None]:
#code for AIM #2 .3
# Correlation
# Restrict the matrix to numeric columns: calling corr() on a frame that still contains
# text columns raises in recent pandas versions, and the task asks for the correlation
# across the numerical variables only.
correlation_matrix = df.select_dtypes(include='number').corr()

# Pairwise correlations of 'Sleep Duration' with the three requested variables
sleep_age = df['Sleep Duration'].corr(df['Age'])
sleep_heart_rate = df['Sleep Duration'].corr(df['Heart Rate'])
sleep_daily_steps = df['Sleep Duration'].corr(df['Daily Steps'])

print("Correlation Matrix:\n", correlation_matrix)
print(f"Sleep Duration and Age correlation: {sleep_age}")
print(f"Sleep Duration and Heart Rate correlation: {sleep_heart_rate}")
print(f"Sleep Duration and Daily Steps correlation: {sleep_daily_steps}")


In [None]:
#code for AIM #2 .4
# Pairwise correlations with NumPy: corrcoef returns a 2 x 2 matrix for two inputs,
# and the off-diagonal entry [0, 1] is the coefficient between them.
sleep_duration_np = data['Sleep_Duration']

corr_np = np.corrcoef(sleep_duration_np, data['Age'])[0, 1]
corr_np_heart_rate = np.corrcoef(sleep_duration_np, data['Heart_Rate'])[0, 1]
corr_np_daily_steps = np.corrcoef(sleep_duration_np, data['Daily_Steps'])[0, 1]

print(f"NumPy Sleep Duration and Age correlation: {corr_np}")
print(f"NumPy Sleep Duration and Heart Rate correlation: {corr_np_heart_rate}")
print(f"NumPy Sleep Duration and Daily Steps correlation: {corr_np_daily_steps}")


In [None]:
#code for AIM #2 .5
# Standard deviation of 'Sleep Duration' with pandas
# (Series.std() uses ddof=1 by default, i.e. the sample standard deviation)
sleep_duration_series = df['Sleep Duration']
std_dev_pandas = sleep_duration_series.std()
print(f"Standard Deviation (Pandas): {std_dev_pandas}")


In [None]:
#code for AIM #2 .6
# Standard deviation of 'Sleep Duration' with NumPy
# ddof=1 requests the sample standard deviation
sleep_duration_values = data['Sleep_Duration']
std_dev_numpy = np.std(sleep_duration_values, ddof=1)
print(f"Standard Deviation (NumPy): {std_dev_numpy}")


In [None]:
#code for AIM #2 .7
# Overall timing for both libraries.
# Each timed section runs the same pipeline (load -> means -> correlations -> standard
# deviation) so that the two totals are actually comparable.
mean_columns = ['Sleep Duration', 'Systolic Blood Pressure', 'Diastolic Blood Pressure', 'Heart Rate', 'Daily Steps']
corr_columns = ['Age', 'Heart Rate', 'Daily Steps']

start_time_pandas = time.time()
df = pd.read_csv('sleep_health.csv')
df[mean_columns].mean()
for column in corr_columns:
    df['Sleep Duration'].corr(df[column])
df['Sleep Duration'].std()
total_pandas_time = time.time() - start_time_pandas

start_time_numpy = time.time()
# names=True reads the field names from the header row (spaces become underscores);
# skip_header is deliberately not set, otherwise the header row would be discarded first.
data = np.genfromtxt('sleep_health.csv', delimiter=',', dtype=None, encoding='utf-8', names=True)
np.mean([data[column.replace(' ', '_')] for column in mean_columns], axis=1)
for column in corr_columns:
    np.corrcoef(data['Sleep_Duration'], data[column.replace(' ', '_')])[0, 1]
np.std(data['Sleep_Duration'], ddof=1)
total_numpy_time = time.time() - start_time_numpy

print(f"Total Pandas time: {total_pandas_time} seconds")
print(f"Total NumPy time: {total_numpy_time} seconds")


AIM #3: Use suitable plots to visualize the data

1. Using only pandas (and matplotlib/seaborn if necessary), plot the distribution for 1.1. Age 1.2. Sleep Duration 1.3. Quality of Sleep 1.4. Physical Activity Level 1.5. Stress Level 1.6. Heart Rate
2. Using only NumPy, do the same as Step 1. You will need matplotlib for this
3. Using only pandas, use the appropriate plot to 3.1. See the distribution of 'Sleep Duration' based on 'Quality of Sleep' 3.2. See the distribution of 'Sleep Duration' based on 'Stress Level' 3.3. See the distribution of 'Sleep Duration' based on 'Physical Activity Level' 3.4. See the distribution of 'Sleep Duration' based on 'Occupation' 3.5. See the distribution of 'Sleep Duration' based on 'BMI'
4. Using only NumPy, do the same as Step 3. You will need matplotlib for this
5. Using only pandas, use a suitable plot to see the relation between 5.1. Age and Sleep Duration 5.2. Sleep Duration and Heart Rate 5.3. Heart Rate and Daily Steps 5.4. Sleep Duration and Daily Steps
6. Using only NumPy, do the same as Step 5. You will need matplotlib for this
7. Find the time difference between plotting using only pandas, and plotting using NumPy


In [None]:
#code for AIM #3 .1
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the dataset into a DataFrame
df = pd.read_csv('sleep_health.csv')

# One histogram (with a KDE curve overlaid) per feature, each on its own figure
features = [
    'Age',
    'Sleep Duration',
    'Quality of Sleep',
    'Physical Activity Level',
    'Stress Level',
    'Heart Rate',
]
for feature in features:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(df[feature], kde=True, ax=ax)
    ax.set_title(f'Distribution of {feature}')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')
    plt.show()


In [None]:
#code for AIM #3 .2
import numpy as np

# Load dataset into a structured array.
# names=True takes the field names from the header row, replacing spaces with underscores.
# skip_header must NOT be set as well: it would drop the header line first, and the
# first data row would then be used as the field names.
data = np.genfromtxt('sleep_health.csv', delimiter=',', dtype=None, encoding='utf-8', names=True)

# Distribution plots: one 30-bin histogram per field, each on its own figure
features_np = ['Age', 'Sleep_Duration', 'Quality_of_Sleep', 'Physical_Activity_Level', 'Stress_Level', 'Heart_Rate']
for feature in features_np:
    plt.figure(figsize=(10, 6))
    plt.hist(data[feature], bins=30, alpha=0.7, color='blue')
    plt.title(f'Distribution of {feature}')
    plt.xlabel(feature)
    plt.ylabel('Frequency')
    plt.show()


In [None]:
#code for AIM #3 .3
# Box plots of 'Sleep Duration' split by each grouping column, one figure per column
grouped_features = [
    'Quality of Sleep',
    'Stress Level',
    'Physical Activity Level',
    'Occupation',
    'BMI',
]
for feature in grouped_features:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(x=df[feature], y=df['Sleep Duration'], ax=ax)
    ax.set_title(f'Sleep Duration by {feature}')
    ax.set_xlabel(feature)
    ax.set_ylabel('Sleep Duration')
    plt.show()


In [None]:
#code for AIM #3 .4
# Box plots of 'Sleep_Duration' split by each grouping column, built from the structured array
for feature in grouped_features:
    # grouped_features holds the CSV column titles (with spaces); np.genfromtxt(names=True)
    # stores the fields with underscores instead, so translate before indexing `data`.
    field = feature.replace(' ', '_')
    unique_values = np.unique(data[field])
    # One array of sleep durations per distinct value of the grouping field
    sleep_duration_list = [data['Sleep_Duration'][data[field] == val] for val in unique_values]

    plt.figure(figsize=(10, 6))
    plt.boxplot(sleep_duration_list)
    # Boxes are drawn at positions 1..N; label them explicitly rather than through the
    # boxplot `labels` keyword, which newer matplotlib releases deprecate.
    plt.xticks(range(1, len(unique_values) + 1), unique_values)
    plt.title(f'Sleep Duration by {feature}')
    plt.xlabel(feature)
    plt.ylabel('Sleep Duration')
    plt.show()


In [None]:
#code for AIM #3 .5
# (x, y) column pairs whose relationship is shown as a scatter plot
relationship_features = [
    ('Age', 'Sleep Duration'),
    ('Sleep Duration', 'Heart Rate'),
    ('Heart Rate', 'Daily Steps'),
    ('Sleep Duration', 'Daily Steps'),
]

for x_feature, y_feature in relationship_features:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.scatterplot(data=df, x=x_feature, y=y_feature, ax=ax)
    ax.set_title(f'Relationship between {x_feature} and {y_feature}')
    ax.set_xlabel(x_feature)
    ax.set_ylabel(y_feature)
    plt.show()


In [None]:
#code for AIM #3 .6
# Scatter plots for each (x, y) pair, built from the structured array
for x_feature, y_feature in relationship_features:
    # relationship_features holds the CSV column titles (with spaces); np.genfromtxt(names=True)
    # stores the fields with underscores instead, so translate before indexing `data`.
    x_values = data[x_feature.replace(' ', '_')]
    y_values = data[y_feature.replace(' ', '_')]

    plt.figure(figsize=(10, 6))
    plt.scatter(x_values, y_values, alpha=0.5)
    plt.title(f'Relationship between {x_feature} and {y_feature}')
    plt.xlabel(x_feature)
    plt.ylabel(y_feature)
    plt.show()


In [None]:
#code for AIM #3 .7
# Time the complete set of AIM #3 plots once with pandas/seaborn and once with NumPy/matplotlib.
# Relies on `df`, `data`, `features`, `grouped_features` and `relationship_features`
# created by the earlier AIM #3 cells.
import time


def _np_field(column):
    """Translate a CSV column title into the field name np.genfromtxt(names=True) produces."""
    return column.replace(' ', '_')


def _finish_plot(title, xlabel, ylabel):
    """Apply title and axis labels to the current figure, then render it."""
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.show()


def _plot_all_with_pandas():
    """Draw the histograms, box plots and scatter plots from the DataFrame `df`."""
    for feature in features:
        plt.figure(figsize=(10, 6))
        sns.histplot(df[feature], kde=True)
        _finish_plot(f'Distribution of {feature}', feature, 'Frequency')
    for feature in grouped_features:
        plt.figure(figsize=(10, 6))
        sns.boxplot(x=df[feature], y=df['Sleep Duration'])
        _finish_plot(f'Sleep Duration by {feature}', feature, 'Sleep Duration')
    for x_feature, y_feature in relationship_features:
        plt.figure(figsize=(10, 6))
        sns.scatterplot(data=df, x=x_feature, y=y_feature)
        _finish_plot(f'Relationship between {x_feature} and {y_feature}', x_feature, y_feature)


def _plot_all_with_numpy():
    """Draw the same histograms, box plots and scatter plots from the structured array `data`."""
    for feature in features:
        plt.figure(figsize=(10, 6))
        plt.hist(data[_np_field(feature)], bins=30, alpha=0.7, color='blue')
        _finish_plot(f'Distribution of {feature}', feature, 'Frequency')
    for feature in grouped_features:
        groups = data[_np_field(feature)]
        unique_values = np.unique(groups)
        plt.figure(figsize=(10, 6))
        plt.boxplot([data['Sleep_Duration'][groups == val] for val in unique_values])
        # Boxes sit at positions 1..N; label them with the group values
        plt.xticks(range(1, len(unique_values) + 1), unique_values)
        _finish_plot(f'Sleep Duration by {feature}', feature, 'Sleep Duration')
    for x_feature, y_feature in relationship_features:
        plt.figure(figsize=(10, 6))
        plt.scatter(data[_np_field(x_feature)], data[_np_field(y_feature)], alpha=0.5)
        _finish_plot(f'Relationship between {x_feature} and {y_feature}', x_feature, y_feature)


# Timing for Pandas
start_time_pandas = time.time()
_plot_all_with_pandas()
total_pandas_time = time.time() - start_time_pandas

# Timing for NumPy
start_time_numpy = time.time()
_plot_all_with_numpy()
total_numpy_time = time.time() - start_time_numpy

print(f"Total Pandas plotting time: {total_pandas_time} seconds")
print(f"Total NumPy plotting time: {total_numpy_time} seconds")


AIM #4: Other possible plotting

1. Think of other possible plots to show some interesting distribution and relations. Do this using both pandas and NumPy



In [None]:
# Pair plots: use sns.pairplot() on the pandas DataFrame to visualize pairwise relationships among multiple variables.
# Heatmaps: use sns.heatmap() to visualize a correlation matrix.
# Violin plots: use sns.violinplot() to show how a distribution varies across categories.