In [4]:
import pandas as pd
import numpy as np
import zipfile
import os

# Build the adjusted crowd dataset:
#   1. extract the uploaded archive,
#   2. stack the two raw recordings row-wise,
#   3. attach the labelled accelerometer/gyroscope columns by row position,
#   4. rescale Speed and binarise the label,
#   5. write the result to CSV.

# Unzip the uploaded file
zip_file_path = '5stn873wft-1.zip'
unzip_dir = 'infosys/data/unzipped_data/'

with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(unzip_dir)

# Load the individual files
file1_path = os.path.join(unzip_dir, '1_20210317_184512.csv')
file2_path = os.path.join(unzip_dir, '2_20210317_171452.csv')

df1 = pd.read_csv(file1_path)
df2 = pd.read_csv(file2_path)

# Stack the two recordings one after the other. This is a row-wise
# concatenation, not a key-based merge: no column is used as a join key.
df_merged = pd.concat([df1, df2], ignore_index=True)

# Load the accelerometer and gyroscope data with labels from 3_FinalDatasetCsv.csv
dataset_path = os.path.join(unzip_dir, '3_FinalDatasetCsv.csv')
df_acc_gyro = pd.read_csv(dataset_path)

# Check the first few rows to ensure the data structure
print("Merged Data Preview:")
print(df_merged.head())

print("Accelerometer and Gyroscope Data Preview:")
print(df_acc_gyro.head())

# The two sources share no join key, so they are combined by row position.
# That is only meaningful for rows present in BOTH frames: without the trim
# below, a shorter labelled file would leave NaN sensor values/labels on the
# extra rows, and those NaN labels would later be turned into class 1.
sensor_columns = ['Acc X', 'Acc Y', 'Acc Z', 'gyro_x', 'gyro_y', 'gyro_z', 'label']

n_merged = len(df_merged)
n_labelled = len(df_acc_gyro)
n_common = min(n_merged, n_labelled)
if n_merged != n_labelled:
    print(
        f"Warning: row count mismatch (merged={n_merged}, labelled={n_labelled}); "
        f"keeping the first {n_common} rows of each."
    )

# Both frames carry a default 0..n-1 index (ignore_index=True / read_csv),
# so index alignment here is equivalent to positional alignment.
df_final = df_merged.iloc[:n_common].copy()
df_final[sensor_columns] = df_acc_gyro[sensor_columns].iloc[:n_common]

# Adjust speed for human context
speed_scale_factor = 0.2  # Adjust this based on your assumptions
df_final['Speed'] = df_final['Speed'] * speed_scale_factor

# A missing label is "unknown", not "anomalous": drop such rows explicitly
# instead of letting the binarisation below map NaN to 1.
missing_label = df_final['label'].isna()
if missing_label.any():
    print(f"Warning: dropping {int(missing_label.sum())} row(s) with a missing label.")
    df_final = df_final.loc[~missing_label].reset_index(drop=True)

# Adjust labels (0: normal, 1: anomalous behavior) - every non-zero class becomes 1
df_final['label'] = df_final['label'].ne(0).astype(int)

# Save the final adjusted dataset
adjusted_dataset_path = 'infosys/data/final_adjusted_crowd_dataset.csv'
df_final.to_csv(adjusted_dataset_path, index=False)

print(f"Final adjusted dataset saved to {adjusted_dataset_path}")

# Summary of Adjustments
print("Adjustments made:")
print("- Merged data from two files.")
print("- Added accelerometer and gyroscope data with labels.")
# Report the factor actually applied rather than a hard-coded copy of it.
print(f"- Speed scaled down by factor of {speed_scale_factor} to simulate human walking/running speeds.")
print("- Labels adjusted (0: normal behavior, 1: anomalous behavior).")


Merged Data Preview:
   Longitude   Latitude  Speed  Distance      Time     Acc X     Acc Y  \
0  73.822661  18.501627    0.0       0.0  18-45-12 -0.271978  0.239697   
1  73.822661  18.501627    0.0       0.0  18-45-12 -0.203242  0.330358   
2  73.822661  18.501627    0.0       0.0  18-45-13 -0.052430  0.283010   
3  73.822661  18.501627    0.0       0.0  18-45-13  0.046597  0.181215   
4  73.822661  18.501627    0.0       0.0  18-45-13  0.038631  0.341300   

      Acc Z  Heading    gyro_x    gyro_y    gyro_z  
0 -0.870133    352.0  0.022826 -0.035573  0.012482  
1  0.822501    352.0 -0.024821  0.045672 -0.016229  
2 -0.348588    352.0 -0.071247  0.027346 -0.015618  
3 -1.068336    352.0  0.028324 -0.009306 -0.015007  
4  0.365102    352.0  0.022215 -0.015414  0.014315  
Accelerometer and Gyroscope Data Preview:
      Acc X     Acc Y     Acc Z    gyro_x    gyro_y    gyro_z  label
0  0.046402 -0.137178 -0.282934 -0.036306 -0.008226 -0.023416      0
1 -0.136978  0.365242  0.108889  0.0

In [None]:
pip install imbalanced-learn pandas

In [None]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from datetime import timedelta

# Balance the adjusted dataset with SMOTE and save the result.
# SMOTE needs purely numeric, NaN-free features, so the Time string is turned
# into seconds since midnight for resampling and back to text afterwards.

# Load the dataset from the same location the preparation cell writes to
file_path = 'infosys/data/final_adjusted_crowd_dataset.csv'
dataset = pd.read_csv(file_path)

# Ensure that the 'Time' column is properly formatted
print('Converting Time column to datetime format...')

# Times are stored as HH-MM-SS; swap the hyphens for colons.
# regex=False treats '-' as a literal on every pandas version.
dataset['Time'] = dataset['Time'].str.replace('-', ':', regex=False)

# Now convert the column to datetime, assuming the corrected format is HH:MM:SS
dataset['Time'] = pd.to_datetime(dataset['Time'], format='%H:%M:%S', errors='coerce')

# errors='coerce' turns unparseable values into NaT, which would become NaN in
# Time_in_seconds and make SMOTE fail later with an unrelated-looking error.
# Drop those rows here and say so.
unparsed_time = dataset['Time'].isna()
if unparsed_time.any():
    print(f"Warning: dropping {int(unparsed_time.sum())} row(s) whose Time could not be parsed.")
    dataset = dataset.loc[~unparsed_time].reset_index(drop=True)

# Convert 'Time' to seconds since the start of the day
dataset['Time_in_seconds'] = (
    dataset['Time'].dt.hour * 3600
    + dataset['Time'].dt.minute * 60
    + dataset['Time'].dt.second
)

# Drop the original 'Time' column and use 'Time_in_seconds' instead
X = dataset.drop(columns=['label', 'Time'])  # Use 'Time_in_seconds' and other features
y = dataset['label']

# Applying SMOTE to oversample the minority class
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Convert 'Time_in_seconds' back to 'HH:MM:SS' format.
# str(timedelta) prints hours without a leading zero ('9:05:03'); zfill(8)
# pads it so every value has the same two-digit-hour layout.
X_resampled['Time'] = X_resampled['Time_in_seconds'].apply(
    lambda seconds: str(timedelta(seconds=int(seconds))).zfill(8)
)

# Drop the 'Time_in_seconds' column after conversion
X_resampled = X_resampled.drop(columns=['Time_in_seconds'])

# Combine the resampled data back into a dataframe
resampled_dataset = pd.concat([X_resampled, y_resampled], axis=1)

# Save the resampled dataset
resampled_dataset.to_csv('resampled_crowd_dataset_with_time_converted.csv', index=False)
print("Resampled dataset saved as 'resampled_crowd_dataset_with_time_converted.csv'")
