In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import gc

# Define the path to your dataset files
data_path = r"C:\Users\Nikhil\Desktop\gemini"
data_files = [
    "S1_Data.dat", "S2_train_10msps_1sec.dat", "S3_train_10msps_1sec.dat",
    "S4_train_10msps_1sec.dat", "S5_train_10msps_1sec.dat",
    "S6_train_10msps_1sec.dat", "S7_train_10msps_1sec.dat",
    "S8_data.dat", "S10_train_10msps_1sec.dat", "S11_train_10msps_1sec.dat",
    "S12_Data.dat"
]

# Initialize an empty DataFrame to hold all the data
all_data = pd.DataFrame()

# Load each file into a DataFrame
for file in data_files:
    file_path = os.path.join(data_path, file)

    try:
        if file in ["S1_Data.dat", "S8_data.dat", "S12_Data.dat"]:
            data = np.fromfile(file_path, dtype=np.complex128).astype(np.complex64)  # Use complex64 for memory efficiency
        else:
            data = np.fromfile(file_path, dtype=np.float64).astype(np.float32)  # Use float32 for memory efficiency

        data_df = pd.DataFrame(data, columns=['Feature1'])
        data_df['Machine'] = file.split('_')[0]

        print(f"Loaded {file} with shape: {data_df.shape}")

        all_data = pd.concat([all_data, data_df], ignore_index=True)

    except Exception as e:
        print(f"Error reading {file}: {e}")

print(f"Total data shape: {all_data.shape}")

# Feature Extraction
# Calculate magnitude and phase for complex data
all_data['Magnitude'] = np.abs(all_data['Feature1'])
all_data['Phase'] = np.angle(all_data['Feature1'])

# Drop the original complex feature and 'Machine' column
X = all_data[['Magnitude', 'Phase']]
y = all_data['Machine']

# Step 3: Feature Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 4: Feature Selection with PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Visualizing PCA result on a subset of data
subset_size =113886080 # Adjust this size based on your system's memory capabilities
all_data_subset = all_data.sample(n=subset_size, random_state=42)

# Extract features and labels for the subset
X_subset = all_data_subset[['Magnitude', 'Phase']]
y_subset = all_data_subset['Machine']

# Standardize the features
X_subset_scaled = scaler.fit_transform(X_subset)

# Apply PCA for dimensionality reduction
X_subset_pca = pca.fit_transform(X_subset_scaled)

plt.figure(figsize=(8, 6))
sns.scatterplot(x=X_subset_pca[:, 0], y=X_subset_pca[:, 1], hue=y_subset, palette="Set1", legend='full')
plt.title('PCA of Vibration Data (Subset)')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.show()

# Clean up memory
del X, y, X_scaled, X_pca
gc.collect()

# Model Development
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_subset_pca, y_subset, test_size=0.2, random_state=42)

# Model 1: k-Nearest Neighbors (k-NN)
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
knn_pred = knn.predict(X_test)

print("k-NN Classification Report:")
print(classification_report(y_test, knn_pred))
print("k-NN Confusion Matrix:")
print(confusion_matrix(y_test, knn_pred))
print("k-NN Accuracy Score:", accuracy_score(y_test, knn_pred))

# Model 2: Gaussian Naive Bayes
gnb = GaussianNB()
gnb.fit(X_train, y_train)
gnb_pred = gnb.predict(X_test)

print("\nGaussian Naive Bayes Classification Report:")
print(classification_report(y_test, gnb_pred))
print("Gaussian Naive Bayes Confusion Matrix:")
print(confusion_matrix(y_test, gnb_pred))
print("Gaussian Naive Bayes Accuracy Score:", accuracy_score(y_test, gnb_pred))

# Model 3: Decision Tree
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_test)

print("\nDecision Tree Classification Report:")
print(classification_report(y_test, dt_pred))
print("Decision Tree Confusion Matrix:")
print(confusion_matrix(y_test, dt_pred))
print("Decision Tree Accuracy Score:", accuracy_score(y_test, dt_pred))

# Hyperparameter Tuning Example with GridSearchCV for k-NN
knn_params = {'n_neighbors': [3, 5, 7], 'metric': ['euclidean', 'manhattan']}
knn_grid = GridSearchCV(knn, knn_params, cv=5)
knn_grid.fit(X_train, y_train)

print("\nBest k-NN Parameters:", knn_grid.best_params_)
print("Best k-NN Cross-Validation Score:", knn_grid.best_score_)

# Visualization of the confusion matrix for the best model
best_knn = knn_grid.best_estimator_
best_knn_pred = best_knn.predict(X_test)

plt.figure(figsize=(8, 6))
sns.heatmap(confusion_matrix(y_test, best_knn_pred), annot=True, fmt="d", cmap="Blues")
plt.title("Confusion Matrix for Best k-NN Model")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

# Clean up memory
del X_train, X_test, y_train, y_test, best_knn_pred
gc.collect()

Loaded S1_Data.dat with shape: (10000000, 2)
Loaded S2_train_10msps_1sec.dat with shape: (10485760, 2)
Loaded S3_train_10msps_1sec.dat with shape: (10485760, 2)
Loaded S4_train_10msps_1sec.dat with shape: (10485760, 2)
Loaded S5_train_10msps_1sec.dat with shape: (10485760, 2)
Loaded S6_train_10msps_1sec.dat with shape: (10485760, 2)
Loaded S7_train_10msps_1sec.dat with shape: (10485760, 2)
Loaded S8_data.dat with shape: (10000000, 2)
Loaded S10_train_10msps_1sec.dat with shape: (10485760, 2)
Loaded S11_train_10msps_1sec.dat with shape: (10485760, 2)
Loaded S12_Data.dat with shape: (10000000, 2)
Total data shape: (113886080, 2)


MemoryError: 

<Figure size 800x600 with 1 Axes>