In [None]:
!python3 -m pip install kaggle

In [None]:
import kagglehub
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Download the latest version of the dataset; the returned value is used below
# as the local directory that holds the dataset files.
path = kagglehub.dataset_download("divyansh22/intel-berkeley-research-lab-sensor-data")
print("Path to dataset files:", path)

# os.listdir() returns entries in arbitrary order and may include directories,
# so keep regular files only and sort them to make the selection reproducible.
data_files = sorted(
    entry for entry in os.listdir(path)
    if os.path.isfile(os.path.join(path, entry))
)
if not data_files:
    raise FileNotFoundError(f"No data files found in dataset directory: {path}")

file_path = os.path.join(path, data_files[0])
print(file_path)

In [None]:
def load_data(file_path):
    """Load the space-separated sensor log at ``file_path`` into a cleaned DataFrame.

    The file is read without a header row as eight columns: date, time, epoch,
    moteid, temperature, humidity, light, voltage.

    Processing steps:
      1. Join ``date`` and ``time`` and parse them into a ``date_time`` column.
      2. Drop rows whose timestamp cannot be parsed, then drop the raw
         ``date`` and ``time`` columns.
      3. Sort by ``date_time`` (stable sort; original index labels are kept).
      4. Drop rows missing moteid, temperature, humidity, voltage or light.
      5. Cast ``moteid`` to int and add ``timestamp``, which is ``date_time``
         floored to 30 seconds.

    Parameters
    ----------
    file_path : str or path-like
        Location of the text file; passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: epoch, moteid, temperature, humidity, light,
        voltage, date_time, timestamp.
    """
    column_names = ['date', 'time', 'epoch', 'moteid', 'temperature', 'humidity', 'light', 'voltage']
    raw = pd.read_csv(file_path, sep=' ', header=None, names=column_names)

    # Parse the combined "<date> <time>" text; unparseable values become NaT.
    stamp_text = raw['date'] + ' ' + raw['time']
    date_time = pd.to_datetime(stamp_text, errors='coerce')

    # pandas >= 2.0 infers ONE format from the first value, and with
    # errors='coerce' every valid timestamp written differently (for example
    # without fractional seconds) silently becomes NaT and would be dropped.
    # Give the failures one more pass on their own so a second format variant
    # is recovered; values that are genuinely malformed stay NaT.
    unparsed = date_time.isna() & stamp_text.notna()
    if unparsed.any():
        retried = pd.to_datetime(stamp_text[unparsed], errors='coerce')
        date_time = date_time.fillna(retried)

    cleaned = (
        raw.assign(date_time=date_time)
        .dropna(subset=['date_time'])
        .drop(columns=['date', 'time'])
        # mergesort is stable, so rows with identical timestamps keep file order.
        .sort_values(by='date_time', kind='mergesort')
        .dropna(subset=['moteid', 'temperature', 'humidity', 'voltage', 'light'])
    )

    # assign() works on a copy, avoiding chained-assignment warnings; moteid
    # keeps its column position and timestamp is appended last.
    return cleaned.assign(
        moteid=cleaned['moteid'].astype(int),
        timestamp=cleaned['date_time'].dt.floor('30s'),
    )

In [None]:
# Parse the downloaded file into the working DataFrame.
df = load_data(file_path=file_path)