<a href="https://colab.research.google.com/github/NandiniBasdwar7/concise/blob/main/day7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
 #Simulate a basic ETL pipeline for IoT data using NumPy
import numpy as np
df=np.genfromtxt('/content/iotData.csv',delimiter=',',skip_header=1)
print(df)

[[ 26.  40. 450.]
 [ 32.  45. 390.]
 [ 31.  41. 428.]
 [ 28.  36. 471.]
 [ 36.  43. 398.]
 [ 37.  44. 400.]
 [ 29.  39. 415.]
 [ 15.  48. 444.]]


In [None]:
a1 = np.array([[31,np.nan, np.nan]])

In [None]:
np.concatenate((df,a1))

array([[ 26.,  40., 450.],
       [ 32.,  45., 390.],
       [ 31.,  41., 428.],
       [ 28.,  36., 471.],
       [ 36.,  43., 398.],
       [ 37.,  44., 400.],
       [ 29.,  39., 415.],
       [ 15.,  48., 444.],
       [ 31.,  nan,  nan]])

In [None]:
print(df.shape)

(8, 3)


In [None]:
print(np.isnan(df))

[[False False False]
 [False False False]
 [False False False]
 [False False False]
 [False False False]
 [False False False]
 [False False False]
 [False False False]]


In [None]:
print(np.sum(np.isnan(df)))

0


In [None]:
median_value = np.nanmedian(df)
print(median_value)

42.0


In [None]:
df[np.isnan(df)] = median_value
print("\nArray after replacing NaNs with median:")
print(df)


Array after replacing NaNs with median:
[[ 26.  40. 450.]
 [ 32.  45. 390.]
 [ 31.  41. 428.]
 [ 28.  36. 471.]
 [ 36.  43. 398.]
 [ 37.  44. 400.]
 [ 29.  39. 415.]
 [ 15.  48. 444.]]


In [None]:
max_per_column = np.max(df, axis=0)
min_per_column = np.min(df, axis=0)

print("--- Max and Min Per Column ---")
print(f"Maximum values per column: {max_per_column}")
print(f"Minimum values per column: {min_per_column}")
print("\n")

# 3. Find max and min for each row (axis=1) - less common for this type of data, but useful to know
max_per_row = np.max(df, axis=1)
min_per_row = np.min(df, axis=1)

print("--- Max and Min Per Row ---")
print(f"Maximum values per row: {max_per_row}")
print(f"Minimum values per row: {min_per_row}")

--- Max and Min Per Column ---
Maximum values per column: [ 37.  48. 471.]
Minimum values per column: [ 15.  36. 390.]


--- Max and Min Per Row ---
Maximum values per row: [450. 390. 428. 471. 398. 400. 415. 444.]
Minimum values per row: [26. 32. 31. 28. 36. 37. 29. 15.]


In [None]:
import numpy as np

# --- 1. Simulate IoT Sensor Data Generation ---

# Set a starting timestamp
start_timestamp = np.datetime64('2025-07-15T18:00:00')

# Define the data type for the structured array
# 'datetime64[s]' for timestamps in seconds
# 'f8' for float64 for temperature and humidity
# 'bool' for motion detected (True/False)
sensor_data_dtype = [
    ('timestamp', 'datetime64[s]'),
    ('temperature_celsius', 'f8'),
    ('humidity_percent', 'f8'),
    ('motion_detected', 'bool')
]

# Create an empty structured array for 1000 entries
num_entries = 1000
iot_data = np.empty(num_entries, dtype=sensor_data_dtype)

# Populate the structured array with simulated data
for i in range(num_entries):
    # Generate sequential timestamps, 1 minute apart
    iot_data['timestamp'][i] = start_timestamp + np.timedelta64(i, 'm')

    # Simulate temperature (e.g., between 20°C and 40°C)
    iot_data['temperature_celsius'][i] =round( np.random.uniform(20.0, 40.0),2)

    # Simulate humidity (e.g., between 30% and 80%)
    iot_data['humidity_percent'][i] = round(np.random.uniform(30.0, 80.0),2)

    # Simulate motion detection (e.g., 20% chance of motion)
    iot_data['motion_detected'][i] = np.random.rand() < 0.2


print("--- Simulated IoT Sensor Data (First 5 Entries) ---")
print(iot_data[:5])
print(f"\nTotal simulated entries: {len(iot_data)}\n")

# --- 2. Filter Entries Based on Conditions ---

# Filter entries where temperature > 35 and motion is detected
# Create a boolean mask based on the conditions
# Note the use of parentheses for correct logical operation precedence
condition_mask = (iot_data['temperature_celsius'] > 35.0) & (iot_data['motion_detected'] == True)

# Apply the mask to get the filtered data
filtered_data = iot_data[condition_mask]

print("--- Filtered Data (Temperature > 35°C AND Motion Detected) ---")
if len(filtered_data) > 0:
    print(filtered_data)
else:
    print("No entries matched the filtering conditions.")
print(f"\nTotal filtered entries: {len(filtered_data)}\n")


# --- 3. Save Filtered Data to CSV ---

# Define the filename for the CSV
output_filename = "filtered_iot_sensor_data.csv"

# Prepare data for saving. np.savetxt works well with 2D numerical arrays.
# For structured arrays with mixed types, converting to a list of tuples or using fmt='%s'
# for all fields is a good approach for savetxt, {Link: says a Stack Overflow answer https://stackoverflow.com/questions/6081008/dump-a-numpy-array-into-a-csv-file}.
# However, to maintain timestamp and boolean in readable format, it's often easiest to
# manually create the output lines.

# Create header string, {Link: according to DataCamp https://www.datacamp.com/doc/numpy/converting-arrays-into-csvs}.
header = "timestamp,temperature_celsius,humidity_percent,motion_detected"

# Save the filtered data to CSV
# Using a loop for cleaner output with mixed data types, especially for timestamps.
# You could also convert to a pandas DataFrame and then use .to_csv(), but this sticks to NumPy.
with open(output_filename, 'w') as f:
    f.write(header + '\n')  # Write the header
    for entry in filtered_data:
        # Format each entry into a comma-separated string
        line = f"{entry['timestamp']},{entry['temperature_celsius']},{entry['humidity_percent']},{entry['motion_detected']}\n"
        f.write(line)


print(f"Filtered data saved to {output_filename}")

--- Simulated IoT Sensor Data (First 5 Entries) ---
[('2025-07-15T18:00:00', 29.46, 75.84, False)
 ('2025-07-15T18:01:00', 30.06, 40.51, False)
 ('2025-07-15T18:02:00', 38.81, 56.74, False)
 ('2025-07-15T18:03:00', 33.37, 58.55, False)
 ('2025-07-15T18:04:00', 35.69, 38.72, False)]

Total simulated entries: 1000

--- Filtered Data (Temperature > 35°C AND Motion Detected) ---
[('2025-07-15T18:06:00', 39.93, 35.17,  True)
 ('2025-07-15T18:14:00', 36.12, 71.57,  True)
 ('2025-07-15T18:44:00', 38.88, 66.45,  True)
 ('2025-07-15T18:46:00', 38.38, 73.67,  True)
 ('2025-07-15T19:09:00', 38.34, 67.05,  True)
 ('2025-07-15T19:17:00', 37.22, 50.49,  True)
 ('2025-07-15T19:38:00', 38.64, 75.24,  True)
 ('2025-07-15T19:42:00', 36.34, 31.67,  True)
 ('2025-07-15T19:47:00', 35.3 , 39.79,  True)
 ('2025-07-15T19:50:00', 36.37, 34.63,  True)
 ('2025-07-15T20:05:00', 35.63, 77.92,  True)
 ('2025-07-15T20:19:00', 35.89, 34.57,  True)
 ('2025-07-15T20:29:00', 36.73, 52.47,  True)
 ('2025-07-15T20:40:00',