In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os


In [2]:
import warnings
warnings.filterwarnings('ignore')

In [13]:
base_folder_path = r'C:\Fault-Diagnosis-of-AUV\Dataset\Dataset' 

In [14]:
dataset_types = ['train', 'test']
fault_types = {
    'AddWeight': 'load_increase',
    'Normal': 'normal_state',
    'PressureGain_constant': 'depth_sensor_failure',
    'PropellerDamage_bad': 'severe_propeller_damage',
    'PropellerDamage_slight': 'slight_propeller_damage'
}

In [15]:
import os
# Load every CSV found under <base_folder_path>/<dataset_type>/<fault_folder>,
# tag each row with its fault label and the split it came from, and collect
# the resulting frames per split.
combined_data = {'train': [], 'test': []}

for dataset_type in dataset_types:
    dataset_path = os.path.join(base_folder_path, dataset_type)
    for fault_folder, fault_label in fault_types.items():
        fault_path = os.path.join(dataset_path, fault_folder)
        if not os.path.isdir(fault_path):
            # A missing folder used to be skipped silently, which made a wrong
            # base path indistinguishable from an empty dataset.
            print(f"Warning: folder not found, skipping: {fault_path}")
            continue
        # Sorted so the row order of the combined frames is reproducible.
        for file_name in sorted(os.listdir(fault_path)):
            if not file_name.endswith('.csv'):
                continue
            file_path = os.path.join(fault_path, file_name)
            try:
                df = pd.read_csv(file_path)
            except Exception as e:
                # Best effort: report the unreadable file and keep going.
                print(f"Error reading {file_path}: {e}")
                continue
            df['Fault_Type'] = fault_label
            df['Dataset_Type'] = dataset_type
            combined_data[dataset_type].append(df)

# Combine all DataFrames into a single DataFrame for each dataset type
train_df = pd.concat(combined_data['train'], ignore_index=True) if combined_data['train'] else pd.DataFrame()
test_df = pd.concat(combined_data['test'], ignore_index=True) if combined_data['test'] else pd.DataFrame()

if train_df.empty and test_df.empty:
    # Every later cell indexes columns of these frames, so make the cause visible here.
    print(f"Warning: no CSV data was loaded from {base_folder_path}; check the path and folder names.")

# Display the first few rows of the combined DataFrames
print("Training Data:")
print(train_df.head())
print("\nTest Data:")
print(test_df.head())


Training Data:
Empty DataFrame
Columns: []
Index: []

Test Data:
Empty DataFrame
Columns: []
Index: []


In [6]:
# combine train and test datasets
combined_df = pd.concat([train_df, test_df], ignore_index=True)


In [7]:
combined_df.shape

(0, 0)

In [8]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Empty DataFrame


In [9]:
combined_df.dtypes

Series([], dtype: object)

In [10]:
print(combined_df)


Empty DataFrame
Columns: []
Index: []


In [11]:
print(combined_df.columns)


RangeIndex(start=0, stop=0, step=1)


In [12]:
combined_df.describe().T

ValueError: Cannot describe a DataFrame without columns

In [None]:
# Select only the numerical columns for correlation calculation
# NOTE(review): these names are presumably the CSV headers of the dataset — confirm.
# Indexing raises KeyError if any of them is absent from combined_df, which is
# the case when no files were loaded and combined_df has no columns.
num_columns = ['time', 'pwm1', 'pwm2', 'pwm3', 'pwm4', 'depth', 'press', 'voltage',
                     'roll', 'pitch', 'yaw', 'a_x', 'a_y', 'a_z', 'w_row', 'w_pitch', 'w_yaw']
correlation_matrix = combined_df[num_columns].corr()

In [None]:
# Create the figure and its axes explicitly so the heatmap has a known target
fig, ax = plt.subplots(figsize=(16, 17))

# Annotated heatmap of the correlation matrix, drawn with square cells
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    square=True,
    linewidths=.5,
    ax=ax,
)

# Title the axes that hold the heatmap
ax.set_title('Correlation Matrix Heatmap')

# Render the figure
plt.show()


In [None]:
combined_df["Fault_Type"].unique()

In [None]:
combined_df["Fault_Type"].value_counts()

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
label_encoder = LabelEncoder()

In [None]:
print(label_encoder)

In [None]:
combined_df["Fault_Type"] = label_encoder.fit_transform(combined_df["Fault_Type"])

In [None]:
combined_df.head()

In [None]:
# Display statistical summary of the DataFrame
print(combined_df.describe())


In [None]:
# Check for missing values
print(combined_df.isnull().sum())


In [None]:
import matplotlib.pyplot as plt

# Plot histograms for each feature
combined_df.hist(figsize=(15, 15))
plt.show()


In [None]:
combined_df["Fault_Type"].unique()

In [None]:
combined_df["Fault_Type"].value_counts()

In [None]:
combined_df.head()

In [None]:
# Derive a 'label' column from the voltage reading: values strictly above 12.06
# become "fault"; everything else (including NaN, for which the comparison is
# False) becomes "normal".
# NOTE(review): label_data() further down in this notebook marks voltage < 12.0
# as 'fault' — the opposite direction and a different threshold. Confirm which
# rule is the intended one.
combined_df['label'] = combined_df['voltage'].apply(lambda x: "fault" if x > 12.06 else "normal")

# Display the first few rows with the new label
combined_df.head()

In [None]:
# Display the first few rows of the DataFrame
print(combined_df.head())



In [None]:
# Display the last few rows of the DataFrame
print(combined_df.tail())


In [None]:
# Display basic information about the DataFrame
print(combined_df.info())


In [None]:
from sklearn.preprocessing import StandardScaler

# Scale features
# Only numeric columns can be standardized. 'Dataset_Type' holds the strings
# 'train'/'test', so passing every non-label column to StandardScaler raises.
# NOTE(review): if 'Fault_Type' has been label-encoded it is numeric and is
# scaled here as before, although it looks like a class target — confirm.
scaler = StandardScaler()
feature_frame = combined_df.drop(columns=['label'])
numeric_columns = feature_frame.select_dtypes(include='number').columns
scaled_features = scaler.fit_transform(feature_frame[numeric_columns])

# Create a DataFrame with scaled features
# Start from a copy so the index, the column order, the non-numeric columns and
# 'label' are preserved, wherever 'label' happens to sit among the columns.
scaled_df = combined_df.copy()
scaled_df[numeric_columns] = scaled_features


In [None]:
import pandas as pd
import os

# Define the path to the folder containing the datasets
folder_path = r'E:\Fault-Diagnosis-of-AUV\Dataset\Dataset'   # Update this path to your folder

# Define the labeling function based on specific criteria
def label_data(row):
    """Classify a single record as 'fault' or 'normal' from its voltage.

    Parameters
    ----------
    row : mapping-like
        Record that can be indexed with the key 'voltage'.

    Returns
    -------
    str
        'fault' when the voltage is strictly below 12.0, otherwise 'normal'.
    """
    # Update this condition as needed
    return 'fault' if row['voltage'] < 12.0 else 'normal'

# Loop through each file in the folder
# Sorted so the processing order (and the printed log) is reproducible.
for filename in sorted(os.listdir(folder_path)):
    # Only raw CSV inputs are processed. The outputs of this loop are written
    # into the same folder as 'labeled_<name>', so they must be skipped;
    # otherwise every re-run would label them again and pile up
    # 'labeled_labeled_...' files.
    if not filename.endswith('.csv') or filename.startswith('labeled_'):
        continue

    file_path = os.path.join(folder_path, filename)

    try:
        # Load the dataset
        data = pd.read_csv(file_path)

        # Check if 'voltage' column exists
        if 'voltage' not in data.columns:
            print(f"Skipping {filename}: 'voltage' column not found")
            continue

        # Apply the labeling function to each row in the dataset
        data['label'] = data.apply(label_data, axis=1)

        # Print the labeled data
        print(f"Labeled data for {filename}:")
        print(data.head())  # Adjust the number of rows to print as needed

        # Define the path to save the labeled dataset
        labeled_file_path = os.path.join(folder_path, f'labeled_{filename}')

        # Save the labeled dataset to a new CSV file
        data.to_csv(labeled_file_path, index=False)

        print(f"Labeled dataset saved to: {labeled_file_path}")

    except Exception as e:
        # Best effort: report the failing file and continue with the next one.
        print(f"Error processing {filename}: {e}")


In [None]:
def label_data(row):
    """Return the fault label for one record based on its 'voltage' entry.

    `row` is anything indexable by the key 'voltage'. The result is the
    string 'fault' for a voltage strictly below 12.0 and 'normal' in every
    other case.
    """
    below_threshold = row['voltage'] < 12.0  # Update this condition as needed
    if below_threshold:
        return 'fault'
    return 'normal'

# Loop through each file in the folder for labeling
# Sorted so the processing order (and the printed log) is reproducible.
for filename in sorted(os.listdir(folder_path)):
    # Files named 'labeled_*.csv' are results of a previous labeling run that
    # were saved into this same folder. Skip them so that running the labeling
    # again does not produce 'labeled_labeled_...' copies.
    is_raw_csv = filename.endswith('.csv') and not filename.startswith('labeled_')
    if not is_raw_csv:
        continue

    file_path = os.path.join(folder_path, filename)

    try:
        # Load the dataset
        data = pd.read_csv(file_path)

        # Check if 'voltage' column exists
        if 'voltage' not in data.columns:
            print(f"Skipping {filename}: 'voltage' column not found")
            continue

        # Apply the labeling function to each row in the dataset
        data['label'] = data.apply(label_data, axis=1)

        # Print the labeled data
        print(f"Labeled data for {filename}:")
        print(data.head())  # Adjust the number of rows to print as needed

        # Define the path to save the labeled dataset
        labeled_file_path = os.path.join(folder_path, f'labeled_{filename}')

        # Save the labeled dataset to a new CSV file
        data.to_csv(labeled_file_path, index=False)

        print(f"Labeled dataset saved to: {labeled_file_path}")

    except Exception as e:
        # Best effort: report the failing file and continue with the next one.
        print(f"Error processing {filename}: {e}")

In [None]:
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Define the path where your datasets are stored
input_folder = 'E:\\Fault-Diagnosis-of-AUV\\Dataset\\Dataset'
output_folder = 'E:\\Fault-Diagnosis-of-AUV\\Dataset\\Dataset\\cleaned_labeled_train'

# Create the output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# List all CSV files in the input folder
file_list = [f for f in os.listdir(input_folder) if f.endswith('.csv')]

# Function to clean and label a dataset
def clean_and_label_dataset(file_path):
    """Read one CSV, clean it, label it by depth and standard-scale it.

    Steps, in order: fill missing values with the column mean, drop duplicate
    rows, convert 'time' to datetime, drop rows holding an IQR outlier,
    derive 'label' from the raw depth, standard-scale the numeric columns.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        Cleaned frame with scaled numeric columns and an integer 'label'
        column (1 where raw depth > 0.05, else 0).

    Raises
    ------
    KeyError
        If the file has no 'time' or no 'depth' column.
    """
    df = pd.read_csv(file_path)

    # Handle missing values (fill with the column mean; numeric columns only)
    df = df.fillna(df.mean(numeric_only=True))

    # Remove duplicates
    df = df.drop_duplicates()

    # Convert data types if necessary
    if not pd.api.types.is_datetime64_any_dtype(df['time']):
        df['time'] = pd.to_datetime(df['time'], errors='coerce')

    # Handle outliers: rows with any numeric value outside
    # [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are removed (not capped). The bounds are
    # computed on numeric columns only so the datetime 'time' column does not
    # take part in a mixed datetime/float comparison.
    numeric = df.select_dtypes(include='number')
    Q1 = numeric.quantile(0.25)
    Q3 = numeric.quantile(0.75)
    IQR = Q3 - Q1
    outlier_rows = ((numeric < (Q1 - 1.5 * IQR)) | (numeric > (Q3 + 1.5 * IQR))).any(axis=1)
    # .copy() so the assignments below write to an independent frame rather
    # than to a filtered view of the original.
    df = df[~outlier_rows].copy()

    # The depth threshold must be applied BEFORE scaling: afterwards 'depth'
    # holds z-scores and 0.05 would no longer be a depth value.
    label = df['depth'].apply(lambda x: 1 if x > 0.05 else 0)

    # Normalize/scale data (standard scaling of the numeric feature columns)
    scaler = StandardScaler()
    columns_to_scale = df.select_dtypes(include='number').columns.difference(['time', 'label'])
    df[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])

    # Add the label column computed from the raw depth
    df['label'] = label

    return df

In [None]:
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Define the path where your datasets are stored
input_folder = 'E:\\Fault-Diagnosis-of-AUV\\Dataset\\Dataset'
output_folder = 'E:\\Fault-Diagnosis-of-AUV\\Dataset\\Dataset\\cleaned_labeled_train'

# Create the output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# List all CSV files in the input folder
file_list = [f for f in os.listdir(input_folder) if f.endswith('.csv')]

# Function to clean and label a dataset
def clean_labeled_train(file_path):
    """Read one CSV, clean it, label it by depth and standard-scale it.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        Frame after mean imputation, duplicate removal, 'time' conversion to
        datetime, IQR outlier-row removal and standard scaling of the numeric
        columns, plus an integer 'label' column (1 where raw depth > 0.05,
        else 0).

    Raises
    ------
    KeyError
        If the file has no 'time' or no 'depth' column.
    """
    df = pd.read_csv(file_path)
    print(f"Processing file: {file_path}")

    # Handle missing values (fill with the column mean; numeric columns only)
    df = df.fillna(df.mean(numeric_only=True))

    # Remove duplicates
    df = df.drop_duplicates()

    # Convert data types if necessary
    if not pd.api.types.is_datetime64_any_dtype(df['time']):
        df['time'] = pd.to_datetime(df['time'], errors='coerce')

    # Handle outliers: drop (not cap) every row that has a numeric value
    # outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]. Only numeric columns are used so
    # the datetime 'time' column is not compared against float bounds.
    numeric = df.select_dtypes(include='number')
    Q1 = numeric.quantile(0.25)
    Q3 = numeric.quantile(0.75)
    IQR = Q3 - Q1
    outlier_rows = ((numeric < (Q1 - 1.5 * IQR)) | (numeric > (Q3 + 1.5 * IQR))).any(axis=1)
    # Independent copy: the column assignments below must not target a slice.
    df = df[~outlier_rows].copy()

    # Label from the RAW depth. Once scaled, 'depth' contains z-scores and the
    # 0.05 threshold would lose its meaning.
    label = df['depth'].apply(lambda x: 1 if x > 0.05 else 0)

    # Normalize/scale data (standard scaling of the numeric feature columns)
    scaler = StandardScaler()
    columns_to_scale = df.select_dtypes(include='number').columns.difference(['time', 'label'])
    df[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])

    # Add the label column computed from the raw depth
    df['label'] = label

    return df

# Clean every CSV listed in file_list and write the result, under the same
# file name, into the output folder
for file_name in file_list:
    file_path = os.path.join(input_folder, file_name)
    output_file_path = os.path.join(output_folder, file_name)

    cleaned_df = clean_labeled_train(file_path)

    # Persist the cleaned and labeled frame without the index column
    cleaned_df.to_csv(output_file_path, index=False)
    print(f"Saved cleaned data to: {output_file_path}")

print("Processing complete.")


In [None]:
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Define the path where your datasets are stored
input_folder = 'E:\\Fault-Diagnosis-of-AUV\\Dataset\\Dataset'
output_folder = 'E:\\Fault-Diagnosis-of-AUV\\Dataset\\Dataset\\cleaned_labeled_train'

# Create the output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# List all CSV files in the input folder
file_list = [f for f in os.listdir(input_folder) if f.endswith('.csv')]

# Function to clean and label a dataset
def clean_and_label_dataset(file_path):
    """Read, clean, label and scale one CSV, printing the frame after each stage.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame or None
        The cleaned frame with scaled numeric columns and an integer 'label'
        column (1 where raw depth > 0.05, else 0). None when the file cannot
        be read, is empty, or has no 'depth' column.
    """
    print(f"Processing file: {file_path}")
    try:
        df = pd.read_csv(file_path)
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return None

    if df.empty:
        print(f"File {file_path} is empty. Skipping.")
        return None

    print("Initial data:")
    print(df.head())

    # Handle missing values (fill with the column mean; numeric columns only)
    df = df.fillna(df.mean(numeric_only=True))

    # Remove duplicates
    df = df.drop_duplicates()

    # Convert data types if necessary
    if 'time' in df.columns and not pd.api.types.is_datetime64_any_dtype(df['time']):
        df['time'] = pd.to_datetime(df['time'], errors='coerce')

    print("Data after handling missing values and duplicates:")
    print(df.head())

    # Handle outliers: drop (not cap) every row that has a numeric value
    # outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]. Only numeric columns are used so
    # the datetime 'time' column is not compared against float bounds.
    numeric = df.select_dtypes(include='number')
    Q1 = numeric.quantile(0.25)
    Q3 = numeric.quantile(0.75)
    IQR = Q3 - Q1
    outlier_rows = ((numeric < (Q1 - 1.5 * IQR)) | (numeric > (Q3 + 1.5 * IQR))).any(axis=1)
    # Independent copy: the column assignments below must not target a slice.
    df = df[~outlier_rows].copy()

    print("Data after handling outliers:")
    print(df.head())

    # Capture the label from the RAW depth before scaling. Afterwards 'depth'
    # holds z-scores and the 0.05 threshold would no longer be a depth value.
    has_depth = 'depth' in df.columns
    if has_depth:
        label = df['depth'].apply(lambda x: 1 if x > 0.05 else 0)

    # Normalize/scale data (standard scaling of the numeric feature columns)
    scaler = StandardScaler()
    columns_to_scale = df.select_dtypes(include='number').columns.difference(['time', 'label'])
    df[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])

    print("Data after scaling:")
    print(df.head())

    # Add a label column based on depth
    if has_depth:
        df['label'] = label
    else:
        print("Depth column not found. Skipping labeling.")
        return None

    print("Data after adding label column:")
    print(df.head())

    return df

# Run the cleaning function over every listed CSV and store each usable result
for file_name in file_list:
    file_path = os.path.join(input_folder, file_name)
    cleaned_df = clean_and_label_dataset(file_path)

    # The cleaning function signals an unusable file by returning None
    if cleaned_df is None:
        print(f"Skipping file: {file_path}")
        continue

    # Save the cleaned and labeled dataframe to the output folder
    output_file_path = os.path.join(output_folder, file_name)
    try:
        cleaned_df.to_csv(output_file_path, index=False)
    except Exception as e:
        print(f"Error saving file {output_file_path}: {e}")
    else:
        print(f"Saved cleaned data to: {output_file_path}")

print("Processing complete.")


In [None]:
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt

# Function to clean and label a dataset
def clean_and_label_dataset(df):
    """Clean a frame, label it by depth and standard-scale its numeric columns.

    The input frame is not modified; all work happens on new frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. A 'depth' column is required for labeling; a 'time' column,
        when present, is converted to datetime.

    Returns
    -------
    pandas.DataFrame or None
        The cleaned frame with scaled numeric columns and an integer 'label'
        column (1 where raw depth > 0.05, else 0), or None when there is no
        'depth' column.
    """
    # Handle missing values (fill with the column mean; numeric columns only).
    # fillna/drop_duplicates return new frames, so the caller's frame is left
    # untouched.
    df = df.fillna(df.mean(numeric_only=True))

    # Remove duplicates
    df = df.drop_duplicates()

    # Convert data types if necessary
    if 'time' in df.columns and not pd.api.types.is_datetime64_any_dtype(df['time']):
        df['time'] = pd.to_datetime(df['time'], errors='coerce')

    # Handle outliers: drop (not cap) every row that has a numeric value
    # outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]. Only numeric columns are used so
    # the datetime 'time' column is not compared against float bounds.
    numeric = df.select_dtypes(include='number')
    Q1 = numeric.quantile(0.25)
    Q3 = numeric.quantile(0.75)
    IQR = Q3 - Q1
    outlier_rows = ((numeric < (Q1 - 1.5 * IQR)) | (numeric > (Q3 + 1.5 * IQR))).any(axis=1)
    # Independent copy: the column assignments below must not target a slice.
    df = df[~outlier_rows].copy()

    # Capture the label from the RAW depth before scaling. Afterwards 'depth'
    # holds z-scores and the 0.05 threshold would no longer be a depth value.
    has_depth = 'depth' in df.columns
    if has_depth:
        label = df['depth'].apply(lambda x: 1 if x > 0.05 else 0)

    # Normalize/scale data (standard scaling of the numeric feature columns)
    scaler = StandardScaler()
    columns_to_scale = df.select_dtypes(include='number').columns.difference(['time', 'label'])
    df[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])

    # Add a label column based on depth
    if has_depth:
        df['label'] = label
    else:
        print("Depth column not found. Skipping labeling.")
        return None

    return df

# Define the path where your datasets are stored
input_folder = 'E:\\Fault-Diagnosis-of-AUV\\Dataset\\Dataset'
output_folder = 'E:\\Fault-Diagnosis-of-AUV\\Dataset\\Dataset\\cleaned_labeled_train'

# Create the output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# List all CSV files in the input folder
file_list = [f for f in os.listdir(input_folder) if f.endswith('.csv')]

# Process each file in the input folder.
# Cleaned frames are collected in a list and concatenated once at the end;
# concatenating inside the loop copies all previous rows on every iteration.
cleaned_frames = []
for file_name in file_list:
    file_path = os.path.join(input_folder, file_name)
    print(f"Processing file: {file_path}")
    try:
        df = pd.read_csv(file_path)
        cleaned_df = clean_and_label_dataset(df)
        if cleaned_df is not None:
            output_file_path = os.path.join(output_folder, file_name)
            cleaned_df.to_csv(output_file_path, index=False)
            print(f"Saved cleaned data to: {output_file_path}")
            cleaned_frames.append(cleaned_df)
    except Exception as e:
        # Best effort: report the failing file and continue with the next one.
        print(f"Error processing {file_path}: {e}")

all_data = pd.concat(cleaned_frames, ignore_index=True) if cleaned_frames else pd.DataFrame()

print("Processing complete.")

# Display the cleaned and labeled dataset
if not all_data.empty:
    display(all_data.head())

    # Calculate the correlation matrix over numeric columns only: the cleaning
    # step converts 'time' to datetime, which is not a valid correlation input.
    correlation_matrix = all_data.select_dtypes(include='number').corr()

    # Set up the matplotlib figure
    plt.figure(figsize=(16, 17))

    # Draw the annotated heatmap with square cells
    sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', square=True, linewidths=.5)

    # Set the title for the heatmap
    plt.title('Correlation Matrix Heatmap')

    # Show the plot
    plt.show()
else:
    print("No data available to plot.")


In [None]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Define the path where your datasets are stored
input_folder = 'E:\\Fault-Diagnosis-of-AUV\\Dataset\\Dataset'

# List all CSV files in the input folder
file_list = [f for f in os.listdir(input_folder) if f.endswith('.csv')]

# Function to label a dataset based on depth
def label_dataset(file_path):
    """Read a CSV file and attach a binary 'label' column derived from 'depth'.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame or None
        The loaded frame with 'label' set to 1 where depth > 0.05 and 0
        otherwise. None when the file has no 'depth' column or when reading
        or labeling raises; a message is printed in both cases.
    """
    def _depth_to_label(depth):
        # 1 for a depth strictly above the threshold, 0 otherwise
        return 1 if depth > 0.05 else 0

    try:
        frame = pd.read_csv(file_path)
        if 'depth' not in frame.columns:
            print(f"Depth column not found in file {file_path}. Skipping labeling.")
            return None
        # Add a label column based on depth
        frame['label'] = frame['depth'].apply(_depth_to_label)
        return frame
    except Exception as e:
        print(f"Error reading or labeling file {file_path}: {e}")
        return None

# Process each file in the input folder.
# Labeled frames are collected in a list and concatenated once at the end;
# concatenating inside the loop copies all previous rows on every iteration.
labeled_frames = []
for file_name in file_list:
    file_path = os.path.join(input_folder, file_name)
    print(f"Labeling dataset for file: {file_path}")
    labeled_df = label_dataset(file_path)
    if labeled_df is not None:
        labeled_frames.append(labeled_df)

all_data = pd.concat(labeled_frames, ignore_index=True) if labeled_frames else pd.DataFrame()

# Display the labeled dataset
if not all_data.empty:
    display(all_data.head())

    # Calculate the correlation matrix
    correlation_matrix = all_data.corr()

    # Set up the matplotlib figure
    plt.figure(figsize=(16, 17))

    # Draw the annotated heatmap with square cells
    sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', square=True, linewidths=.5)

    # Set the title for the heatmap
    plt.title('Correlation Matrix Heatmap')

    # Show the plot
    plt.show()
else:
    print("No data available to display.")



In [None]:
# Derive a 'label' column from the voltage reading: values strictly above 12.06
# become "fault"; everything else (including NaN, for which the comparison is
# False) becomes "normal".
# NOTE(review): this statement repeats an earlier cell of this notebook, and
# label_data() above marks voltage < 12.0 as 'fault' — the opposite direction
# and a different threshold. Confirm which rule is the intended one.
combined_df['label'] = combined_df['voltage'].apply(lambda x: "fault" if x > 12.06 else "normal")

# Display the first few rows with the new label
combined_df.head()

In [None]:
import tensorflow as tf
print("TensorFlow imported successfully")

In [None]:
!pip show tensorflow

In [None]:
!pip install tensorflow

In [None]:
import tensorflow as tf
print("TensorFlow imported successfully")