In [12]:
import os
import pandas as pd

# Path to the folder containing .txt files
folder_path = '/Users/miro/Documents/ARAS/mdm-master_CLEAN/examples/casas/'

# Get a list of all .txt files in the folder
txt_files = [file for file in os.listdir(folder_path) if file.endswith('.txt')]

# Initialize an empty list to store DataFrames
dfs = []

# Read each .txt file into a DataFrame and append it to the list
for file in txt_files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_csv(file_path, delimiter='\t', header=None)
    dfs.append(df)

# Concatenate all DataFrames into a single DataFrame
combined_df = pd.concat(dfs, ignore_index=True)

# Rename the columns
combined_df = combined_df.rename(columns={0: 'date', 1: 'time', 2: 'sensor', 3: 'activity'})

# Display the combined DataFrame
print(combined_df)



             date             time sensor       activity
0      2008-11-20   14:17:37.22788    M15         ON 1 1
1      2008-11-20  14:17:37.842819    M16         ON 1 1
2      2008-11-20    14:17:39.4507    D07       OPEN 1 1
3      2008-11-20  14:17:39.866019    M17         ON 1 1
4      2008-11-20  14:17:40.266839    M23         ON 2 2
...           ...              ...    ...            ...
17229  2008-12-08   08:54:05.97179    M19   ON 1 14 2 15
17230  2008-12-08  08:54:07.642649    M18   ON 1 14 2 15
17231  2008-12-08  08:54:08.216989    M17   ON 1 14 2 15
17232  2008-12-08  08:54:09.599459    M19  OFF 1 14 2 15
17233  2008-12-08   08:54:11.00337    M18  OFF 1 14 2 15

[17234 rows x 4 columns]


In [13]:
# Split the 'activity' column at the first space into two columns
combined_df[['value', 'activity']] = combined_df['activity'].str.split(n=1, expand=True)



In [14]:
# Reorder the columns to swap the location of 'value' and 'activity' columns
combined_df = combined_df[['date', 'time', 'sensor', 'value', 'activity']]

# Display the DataFrame
print(combined_df)


             date             time sensor value   activity
0      2008-11-20   14:17:37.22788    M15    ON        1 1
1      2008-11-20  14:17:37.842819    M16    ON        1 1
2      2008-11-20    14:17:39.4507    D07  OPEN        1 1
3      2008-11-20  14:17:39.866019    M17    ON        1 1
4      2008-11-20  14:17:40.266839    M23    ON        2 2
...           ...              ...    ...   ...        ...
17229  2008-12-08   08:54:05.97179    M19    ON  1 14 2 15
17230  2008-12-08  08:54:07.642649    M18    ON  1 14 2 15
17231  2008-12-08  08:54:08.216989    M17    ON  1 14 2 15
17232  2008-12-08  08:54:09.599459    M19   OFF  1 14 2 15
17233  2008-12-08   08:54:11.00337    M18   OFF  1 14 2 15

[17234 rows x 5 columns]


In [15]:
# Extract the second integer from the 'activity' column
combined_df['activity'] = combined_df['activity'].str.split().str[1]

# Display the DataFrame
print(combined_df)

             date             time sensor value activity
0      2008-11-20   14:17:37.22788    M15    ON        1
1      2008-11-20  14:17:37.842819    M16    ON        1
2      2008-11-20    14:17:39.4507    D07  OPEN        1
3      2008-11-20  14:17:39.866019    M17    ON        1
4      2008-11-20  14:17:40.266839    M23    ON        2
...           ...              ...    ...   ...      ...
17229  2008-12-08   08:54:05.97179    M19    ON       14
17230  2008-12-08  08:54:07.642649    M18    ON       14
17231  2008-12-08  08:54:08.216989    M17    ON       14
17232  2008-12-08  08:54:09.599459    M19   OFF       14
17233  2008-12-08   08:54:11.00337    M18   OFF       14

[17234 rows x 5 columns]


In [16]:
# Check for unique values in the 'sensor' column
unique_sensors = combined_df['sensor'].unique()

# Display the unique sensor values
print(unique_sensors)


['M15' 'M16' 'D07' 'M17' 'M23' 'D12' 'M02' 'M01' 'I04' 'I06' 'M07' 'M08'
 'M14' 'M06' 'M09' 'M04' 'M03' 'M22' 'M21' 'M19' 'M18' 'M51' 'D11' 'M11'
 'M12' 'M13' 'D14' 'M10' 'D13' 'D10' 'M05' 'D15' 'D09' 'M25' 'M24' 'M26'
 'M20']


In [17]:
# Define the mapping dictionary
mapping = {
    'M01': 'MotionSensorAA', 'M02': 'MotionSensorA', 'M03': 'MotionSensorB', 'M04': 'MotionSensorC',
    'M05': 'MotionSensorD', 'M06': 'MotionSensorE', 'M07': 'MotionSensorF', 'M08': 'MotionSensorG',
    'M09': 'MotionSensorH', 'M10': 'MotionSensorI', 'M11': 'MotionSensorJ', 'M12': 'MotionSensorK',
    'M13': 'MotionSensorL', 'M14': 'MotionSensorM', 'M15': 'MotionSensorN', 'M16': 'MotionSensorO',
    'M17': 'MotionSensorP', 'M18': 'MotionSensorQ', 'M19': 'MotionSensorR', 'M21': 'MotionSensorS',
    'M22': 'MotionSensorT', 'M23': 'MotionSensorU', 'M26': 'MotionSensorV', 'M51': 'MotionSensorW',
    'D01': 'DoorSensorA', 'D07': 'DoorSensorB', 'D10': 'DoorSensorC', 'D11': 'DoorSensorD',
    'D12': 'DoorSensorE', 'D13': 'DoorSensorF', 'D14': 'DoorSensorG', 'L45': 'LightControllerA',
    'L46': 'LightControllerB', 'L47': 'LightControllerC', 'L48': 'LightControllerD', 'L49': 'LightControllerE',
    'L50': 'LightControllerF', 'L51': 'LightControllerG', 'L52': 'LightControllerH', 'L53': 'LightControllerI',
    'L54': 'LightControllerJ', 'L55': 'LightControllerK', 'L56': 'LightControllerL', 'L57': 'LightControllerM',
    'L58': 'LightControllerN', 'L59': 'LightControllerO', 'L60': 'LightControllerP', 'L61': 'LightControllerQ',
    'L62': 'LightControllerR', 'L63': 'LightControllerS', 'L64': 'LightControllerT'
}

# Map the numeric values to strings without integers
combined_df['sensor'] = combined_df['sensor'].map(mapping)

# Display the DataFrame
print(combined_df)


             date             time         sensor value activity
0      2008-11-20   14:17:37.22788  MotionSensorN    ON        1
1      2008-11-20  14:17:37.842819  MotionSensorO    ON        1
2      2008-11-20    14:17:39.4507    DoorSensorB  OPEN        1
3      2008-11-20  14:17:39.866019  MotionSensorP    ON        1
4      2008-11-20  14:17:40.266839  MotionSensorU    ON        2
...           ...              ...            ...   ...      ...
17229  2008-12-08   08:54:05.97179  MotionSensorR    ON       14
17230  2008-12-08  08:54:07.642649  MotionSensorQ    ON       14
17231  2008-12-08  08:54:08.216989  MotionSensorP    ON       14
17232  2008-12-08  08:54:09.599459  MotionSensorR   OFF       14
17233  2008-12-08   08:54:11.00337  MotionSensorQ   OFF       14

[17234 rows x 5 columns]


In [18]:
# Define the mapping dictionary
mapping = {
    "1": "Fill_Medication",
    "2": "Hang_Clothes",
    "3": "Move_Furniture",
    "4": "Read_Magazine",
    "5": "Water_Plants",
    "6": "Sweep_Floor",
    "7": "Play_Checkers",
    "8": "Set_Dinner",
    "9": "Set_Table",
    "10": "Read_Magazine_2",
    "11": "Pay_Bill",
    "12": "Pack_Picnic",
    "13": "Retrieve_Dishes",
    "14": "Pack_Supplies",
    "15": "Bring_Basket"
}

# Map the activity numbers to their abbreviations
combined_df['activity'] = combined_df['activity'].astype(str).map(mapping)

# Display the DataFrame
print(combined_df)


             date             time         sensor value         activity
0      2008-11-20   14:17:37.22788  MotionSensorN    ON  Fill_Medication
1      2008-11-20  14:17:37.842819  MotionSensorO    ON  Fill_Medication
2      2008-11-20    14:17:39.4507    DoorSensorB  OPEN  Fill_Medication
3      2008-11-20  14:17:39.866019  MotionSensorP    ON  Fill_Medication
4      2008-11-20  14:17:40.266839  MotionSensorU    ON     Hang_Clothes
...           ...              ...            ...   ...              ...
17229  2008-12-08   08:54:05.97179  MotionSensorR    ON    Pack_Supplies
17230  2008-12-08  08:54:07.642649  MotionSensorQ    ON    Pack_Supplies
17231  2008-12-08  08:54:08.216989  MotionSensorP    ON    Pack_Supplies
17232  2008-12-08  08:54:09.599459  MotionSensorR   OFF    Pack_Supplies
17233  2008-12-08   08:54:11.00337  MotionSensorQ   OFF    Pack_Supplies

[17234 rows x 5 columns]


In [19]:
# Duplicate the 'sensor' column and insert it after the original 'sensor' column
combined_df.insert(3, 'sensorid', combined_df['sensor'])

# Display the DataFrame
print(combined_df)


             date             time         sensor       sensorid value  \
0      2008-11-20   14:17:37.22788  MotionSensorN  MotionSensorN    ON   
1      2008-11-20  14:17:37.842819  MotionSensorO  MotionSensorO    ON   
2      2008-11-20    14:17:39.4507    DoorSensorB    DoorSensorB  OPEN   
3      2008-11-20  14:17:39.866019  MotionSensorP  MotionSensorP    ON   
4      2008-11-20  14:17:40.266839  MotionSensorU  MotionSensorU    ON   
...           ...              ...            ...            ...   ...   
17229  2008-12-08   08:54:05.97179  MotionSensorR  MotionSensorR    ON   
17230  2008-12-08  08:54:07.642649  MotionSensorQ  MotionSensorQ    ON   
17231  2008-12-08  08:54:08.216989  MotionSensorP  MotionSensorP    ON   
17232  2008-12-08  08:54:09.599459  MotionSensorR  MotionSensorR   OFF   
17233  2008-12-08   08:54:11.00337  MotionSensorQ  MotionSensorQ   OFF   

              activity  
0      Fill_Medication  
1      Fill_Medication  
2      Fill_Medication  
3      Fill

In [20]:
print(combined_df)


             date             time         sensor       sensorid value  \
0      2008-11-20   14:17:37.22788  MotionSensorN  MotionSensorN    ON   
1      2008-11-20  14:17:37.842819  MotionSensorO  MotionSensorO    ON   
2      2008-11-20    14:17:39.4507    DoorSensorB    DoorSensorB  OPEN   
3      2008-11-20  14:17:39.866019  MotionSensorP  MotionSensorP    ON   
4      2008-11-20  14:17:40.266839  MotionSensorU  MotionSensorU    ON   
...           ...              ...            ...            ...   ...   
17229  2008-12-08   08:54:05.97179  MotionSensorR  MotionSensorR    ON   
17230  2008-12-08  08:54:07.642649  MotionSensorQ  MotionSensorQ    ON   
17231  2008-12-08  08:54:08.216989  MotionSensorP  MotionSensorP    ON   
17232  2008-12-08  08:54:09.599459  MotionSensorR  MotionSensorR   OFF   
17233  2008-12-08   08:54:11.00337  MotionSensorQ  MotionSensorQ   OFF   

              activity  
0      Fill_Medication  
1      Fill_Medication  
2      Fill_Medication  
3      Fill

In [21]:
# Remove rows with NaN values
combined_df_cleaned = combined_df.dropna()

# Display the cleaned DataFrame
print(combined_df_cleaned)

             date             time         sensor       sensorid value  \
0      2008-11-20   14:17:37.22788  MotionSensorN  MotionSensorN    ON   
1      2008-11-20  14:17:37.842819  MotionSensorO  MotionSensorO    ON   
2      2008-11-20    14:17:39.4507    DoorSensorB    DoorSensorB  OPEN   
3      2008-11-20  14:17:39.866019  MotionSensorP  MotionSensorP    ON   
4      2008-11-20  14:17:40.266839  MotionSensorU  MotionSensorU    ON   
...           ...              ...            ...            ...   ...   
17229  2008-12-08   08:54:05.97179  MotionSensorR  MotionSensorR    ON   
17230  2008-12-08  08:54:07.642649  MotionSensorQ  MotionSensorQ    ON   
17231  2008-12-08  08:54:08.216989  MotionSensorP  MotionSensorP    ON   
17232  2008-12-08  08:54:09.599459  MotionSensorR  MotionSensorR   OFF   
17233  2008-12-08   08:54:11.00337  MotionSensorQ  MotionSensorQ   OFF   

              activity  
0      Fill_Medication  
1      Fill_Medication  
2      Fill_Medication  
3      Fill

In [22]:
combined_df_cleaned.sensor.unique()

array(['MotionSensorN', 'MotionSensorO', 'DoorSensorB', 'MotionSensorP',
       'MotionSensorU', 'DoorSensorE', 'MotionSensorA', 'MotionSensorAA',
       'MotionSensorF', 'MotionSensorG', 'MotionSensorM', 'MotionSensorE',
       'MotionSensorH', 'MotionSensorC', 'MotionSensorB', 'MotionSensorT',
       'MotionSensorS', 'MotionSensorR', 'MotionSensorQ', 'MotionSensorW',
       'DoorSensorD', 'MotionSensorJ', 'MotionSensorK', 'MotionSensorL',
       'DoorSensorG', 'MotionSensorI', 'DoorSensorF', 'DoorSensorC',
       'MotionSensorD', 'MotionSensorV'], dtype=object)

In [23]:
# Replace 'output_file.csv' with the desired filename and path
output_file = 'casas_transformed.csv'

# Save the DataFrame to a CSV file without header, separating columns by spaces
combined_df_cleaned.to_csv(output_file, index=False, header=False, sep=' ')

print(f"DataFrame saved to {output_file} without header, columns separated by spaces")



DataFrame saved to casas_transformed.csv without header, columns separated by spaces


In [78]:
# Find rows with NaN values
rows_with_nan = df[df.isna().any(axis=1)]

# Display the rows with NaN values
print(rows_with_nan)


           date             time sensor sensorid    value         activity
16   2008-11-10  14:28:32.473779    NaN      NaN   ABSENT  Fill_Medication
21   2008-11-10   14:28:36.62255    NaN      NaN   ABSENT  Fill_Medication
164  2008-11-10   14:33:11.57974    NaN      NaN  PRESENT  Fill_Medication
170  2008-11-10    14:33:22.2011    NaN      NaN  PRESENT  Fill_Medication
171  2008-11-10  14:33:24.046519    NaN      NaN   ABSENT  Fill_Medication
172  2008-11-10   14:33:24.66789    NaN      NaN  PRESENT  Fill_Medication


In [41]:
import os
import pandas as pd
from gensim.models import Word2Vec

# Load the CSV file
csv_file_path = '/Users/miro/Documents/ARAS/activity_segmentation_copy_2_test/next_action_prediction/casas_transformed.csv'
df = pd.read_csv(csv_file_path, sep=' ')

# Assuming your sensor data is in the third column
sensor_data_column = df.iloc[:, 2]

# Convert the sensor data column to a list of lists (sequences)
data = sensor_data_column.apply(lambda x: x.split()).tolist()

# Define filenames based on the provided filenames
filenames = [
    "word2vec_retrofitted_activities_locations_from_data_embedding_size_50_iterations_100_word2vec_window_5",
    "word2vec_embedding_size_50_iterations_5_word2vec_window_5",
    "word2vec_embedding_size_50_iterations_100_word2vec_window_1",
    "word2vec_embedding_size_50_iterations_100_word2vec_window_5",
    "word2vec_retrofitted_activities_embedding_size_50_iterations_5_word2vec_window_5",
    "word2vec_retrofitted_activities_embedding_size_50_iterations_100_word2vec_window_5",
    "word2vec_retrofitted_activities_from_data_embedding_size_50_iterations_5_word2vec_window_5",
    "word2vec_retrofitted_activities_from_data_embedding_size_50_iterations_100_word2vec_window_5",
    "word2vec_retrofitted_activities_locations_embedding_size_50_iterations_5_word2vec_window_5",
    "word2vec_retrofitted_activities_locations_embedding_size_50_iterations_100_word2vec_window_5",
    "word2vec_retrofitted_activities_locations_from_data_embedding_size_50_iterations_5_word2vec_window_5",
    "word2vec_retrofitted_locations_embedding_size_50_iterations_5_word2vec_window_5",
    "word2vec_retrofitted_locations_embedding_size_50_iterations_100_word2vec_window_5",
    "word2vec_retrofitted_locations_from_data_embedding_size_50_iterations_5_word2vec_window_5",
    "word2vec_retrofitted_locations_from_data_embedding_size_50_iterations_100_word2vec_window_5"
]

# Define the directory to save the files
save_dir = '/Users/miro/Documents/ARAS/activity_segmentation_copy_2_test/next_action_prediction/word2vec_models/new'

# Define corresponding parameters for each filename
parameters = [
    {"size": 50, "iter": 100, "window": 5},
    {"size": 50, "iter": 5, "window": 5},
    {"size": 50, "iter": 100, "window": 1},
    {"size": 50, "iter": 100, "window": 5},
    {"size": 50, "iter": 5, "window": 5},
    {"size": 50, "iter": 100, "window": 5},
    {"size": 50, "iter": 5, "window": 5},
    {"size": 50, "iter": 100, "window": 5},
    {"size": 50, "iter": 5, "window": 5},
    {"size": 50, "iter": 100, "window": 5},
    {"size": 50, "iter": 5, "window": 5},
    {"size": 50, "iter": 5, "window": 5},
    {"size": 50, "iter": 100, "window": 5},
    {"size": 50, "iter": 5, "window": 5},
    {"size": 50, "iter": 100, "window": 5}
]

# Train and save Word2Vec models for each filename
for filename, params in zip(filenames, parameters):
    model = Word2Vec(sentences=data, vector_size=params["size"], window=params["window"], min_count=1, epochs=params["iter"])
    save_path = os.path.join(save_dir, filename)
    
    # Save the embeddings with sensor name and embeddings separated by comma and embeddings separated by space
    with open(save_path, 'w') as file:
        for word in model.wv.index_to_key:
            embedding = ' '.join(map(str, model.wv[word]))
            line = f"{word},{embedding}\n"
            file.write(line)
