In [None]:
import pandas as pd
import os # Using os.path, you can create and manipulate file and directory paths
import matplotlib.pyplot as plt

# Directory that is scanned for the input CSV files.
folder_path = r'c:\Users\magda\Documents\___DA_DATA\PROJEKT_Czechitas_Honeywell\csv_nove_soubory'
# Directory that receives the processed CSV files and the track plots.
outputfolder = r'C:\Users\magda\Documents\___DA_DATA\PROJEKT_Czechitas_Honeywell\csv_new_outputs'


def _spoofing_durations(df):
    """Return the length of every spoofing episode as ``end time - start time``.

    The end of an episode is the timestamp of the first unflagged row after it.
    An episode that is still running on the last row has no such row, so it is
    closed at the last timestamp instead of being dropped or mis-paired.
    """
    starts = df.loc[df['spoofing_start'], 'time'].tolist()
    ends = df.loc[df['spoofing_end'], 'time'].tolist()

    # Starts and ends strictly alternate and the first event is always a start
    # (prev_spoofing of row 0 is False), so the only possible mismatch is one
    # unterminated episode at the very end of the log.
    if len(starts) > len(ends):
        ends.append(df['time'].iloc[-1])

    return [end - start for start, end in zip(starts, ends)]


def _save_track_plot(df, output_plot_path):
    """Save the lon/lat track to ``output_plot_path`` with flagged rows in red."""
    flagged = df.loc[df['spoofing']]

    # An explicit figure (instead of the implicit "current" pyplot figure)
    # guarantees that a failure while drawing or saving cannot leave a
    # half-drawn figure behind for the next call to draw on.
    fig, ax = plt.subplots()
    try:
        ax.plot(df['g_lon'], df['g_lat'], label='drone path')
        ax.scatter(flagged['g_lon'], flagged['g_lat'], color='red', label='Spoofing Detected')
        ax.set_xlabel('Zeměpisná délka (rad)')
        ax.set_ylabel('Zeměpisná šířka (rad)')
        ax.legend()
        fig.savefig(output_plot_path, dpi=300)
    finally:
        plt.close(fig)


def dataprocessing(df: pd.DataFrame, output_plot_path,
                   lat_threshold: float = 0.0005, lon_threshold: float = 0.0005,
                   alt_threshold: float = 10, verbose: bool = False) -> pd.DataFrame:
    """Flag suspected GPS spoofing in one flight log and save a plot of the track.

    A row is flagged when the absolute change against the previous row exceeds
    the threshold in latitude, longitude OR altitude. The direction of the jump
    is ignored; only its magnitude matters.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``g_lat``, ``g_lon``, ``g_alt_msl`` and
        (when ``verbose`` is True) ``time``. The frame is modified IN PLACE:
        the columns ``delta_lat``, ``delta_lon``, ``delta_alt``, ``spoofing``,
        ``prev_spoofing``, ``spoofing_start`` and ``spoofing_end`` are added.
    output_plot_path : str or path-like
        Where the PNG of the track is written (300 dpi).
    lat_threshold, lon_threshold : float
        Maximum tolerated row-to-row change, in the unit of ``g_lat``/``g_lon``.
        NOTE(review): the axis labels used for the plot say "rad" and the values
        look like radians. If so, the default 0.0005 is roughly 3.2 km per
        sample, not the ~55 m that 0.0005 *degrees* would be; ~8.6e-6 rad
        corresponds to ~55 m. Confirm the unit before tightening the default.
    alt_threshold : float
        Maximum tolerated row-to-row change of ``g_alt_msl``
        (presumably metres - TODO confirm).
    verbose : bool
        When True, print the duration of every spoofing episode and the total.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``, with the columns listed above added.
    """
    # Row-to-row change of each coordinate. The first row has no predecessor,
    # so its deltas are NaN; NaN > threshold is False and it is never flagged.
    df['delta_lat'] = df['g_lat'].diff()
    df['delta_lon'] = df['g_lon'].diff()
    df['delta_alt'] = df['g_alt_msl'].diff()

    df['spoofing'] = (
        (df['delta_lat'].abs() > lat_threshold)
        | (df['delta_lon'].abs() > lon_threshold)
        | (df['delta_alt'].abs() > alt_threshold)
    )

    # Flag of the previous row; the first row is treated as "not spoofed before".
    df['prev_spoofing'] = df['spoofing'].shift(1, fill_value=False)

    # An episode starts on a False -> True transition and ends on True -> False.
    df['spoofing_start'] = df['spoofing'] & ~df['prev_spoofing']
    df['spoofing_end'] = ~df['spoofing'] & df['prev_spoofing']

    if verbose:
        durations = _spoofing_durations(df)
        for i, duration in enumerate(durations, start=1):
            print(f'Spoofing {i} lasted: {duration:.2f} seconds')
        print(f'The total duration of all spoofing events is: {sum(durations):.2f} seconds')

    _save_track_plot(df, output_plot_path)

    return df


In [None]:

# Create the destination folder up front; without it every to_csv()/savefig()
# call below would fail and every file would be reported as an error.
os.makedirs(outputfolder, exist_ok=True)

dataframes = []

# os.listdir() returns the names of all entries in the folder in arbitrary
# order; sorting makes the processing order - and therefore the row order of
# the concatenated dataset - reproducible.
for csv_file in sorted(os.listdir(folder_path)):

    # Only CSV files are processed; the check ignores case so '.CSV' also matches.
    if not csv_file.lower().endswith('.csv'):
        continue

    # Built outside the try block so the error message below can always use them.
    file_path = os.path.join(folder_path, csv_file)
    output_file_path = os.path.join(outputfolder, csv_file)
    output_plot_path = os.path.join(outputfolder, os.path.splitext(csv_file)[0] + '_plot.png')

    try:
        df_old = pd.read_csv(file_path)
        df_edited = dataprocessing(df_old, output_plot_path)
        df_edited['filename'] = csv_file
        # Save the processed DataFrame next to its plot.
        df_edited.to_csv(output_file_path, index=False)
        dataframes.append(df_edited)
    except Exception as e:
        # Best effort: one malformed file must not stop the whole batch.
        # The try block covers reading, processing, plotting and saving, so the
        # message says "processing" instead of only "loading".
        print(f"Chyba při zpracování souboru {file_path}: {e}")

Chyba při načítání souboru c:\Users\magda\Documents\___DA_DATA\PROJEKT_Czechitas_Honeywell\csv_nove_soubory\20240820_IDFtest_non_Test175.csv: Error tokenizing data. C error: Expected 16 fields in line 31, saw 18



In [None]:
# pd.concat() fails with a generic "No objects to concatenate" on an empty list;
# raise the same exception type with a message that points at the real cause.
if not dataframes:
    raise ValueError("No CSV file was processed successfully; there is nothing to concatenate.")

# A bare `len(dataframes)` in the middle of a cell is evaluated and thrown away
# (a notebook only displays the last expression), so the count is printed.
print(f"Number of processed files: {len(dataframes)}")

# Combine all per-file DataFrames into one. ignore_index=True discards the
# original per-file indices and numbers the rows 0..n-1 as one continuous table.
df_final = pd.concat(dataframes, ignore_index=True)

In [7]:
# Display the combined DataFrame; pandas abbreviates a long frame to its first and last rows.
df_final

Unnamed: 0,time,g_lat,g_lon,g_alt_msl,g_fix_type,g_itow,g_pos_h_acc,g_pos_v_acc,g_vel_n,g_vel_e,...,g_vel_d.1,g_avg_cn0,delta_lat,delta_lon,delta_alt,spoofing,prev_spoofing,spoofing_start,spoofing_end,filename
0,0.069061,0.55250,0.604066,63.308,3,203994.0,2.936,5.986,-0.005,-0.013,...,-0.061,0,,,,False,False,False,False,1116 shidrug1_060827_test319.csv
1,1.049095,0.55250,0.604066,63.203,3,203995.0,2.761,5.743,0.004,0.012,...,-0.045,40,-6.981317e-09,3.490659e-09,-0.105,False,False,False,False,1116 shidrug1_060827_test319.csv
2,2.049073,0.55250,0.604066,63.224,3,203996.0,2.626,5.545,0.058,-0.062,...,-0.205,40,1.745329e-09,-3.490659e-09,0.021,False,False,False,False,1116 shidrug1_060827_test319.csv
3,3.049224,0.55250,0.604066,63.131,3,203997.0,2.517,5.377,0.079,-0.003,...,-0.188,40,0.000000e+00,1.745329e-09,-0.093,False,False,False,False,1116 shidrug1_060827_test319.csv
4,4.049170,0.55250,0.604066,63.098,3,203998.0,2.424,5.235,0.041,-0.122,...,-0.247,40,-1.745329e-09,-3.490659e-09,-0.033,False,False,False,False,1116 shidrug1_060827_test319.csv
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68335,365.727766,0.59024,0.619431,180.544,3,229295.0,2.824,2.975,-0.139,-0.377,...,-0.362,38,-5.235988e-09,-3.141593e-08,0.228,False,False,False,False,20240820_IDFtest_Test188.csv
68336,366.727765,0.59024,0.619431,180.794,3,229296.0,3.414,3.229,0.020,0.126,...,-0.158,30,-1.221730e-08,-2.617994e-08,0.250,False,False,False,False,20240820_IDFtest_Test188.csv
68337,367.727790,0.59024,0.619431,180.727,3,229297.0,3.761,3.487,-0.235,-0.101,...,-0.279,26,-5.759587e-08,-8.726646e-09,-0.067,False,False,False,False,20240820_IDFtest_Test188.csv
68338,368.727777,0.59024,0.619431,180.410,3,229298.0,3.755,3.550,0.087,-0.042,...,-0.003,33,-2.268928e-08,-1.396263e-08,-0.317,False,False,False,False,20240820_IDFtest_Test188.csv


In [None]:
# Save the combined DataFrame `df_final` to the CSV file 'finaldataset.csv' without the DataFrame index (row numbers).
# If index=False were not used, the index would be written as an additional first column of the CSV file.
# The file name is relative, so the file is created in the current working directory.

df_final.to_csv('finaldataset.csv', index=False)