# Dimensionality Reduction

In [None]:
#Loading libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler 
from sklearn.metrics import classification_report, confusion_matrix 

from sklearn.decomposition import PCA
import lightgbm as lgb
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import accuracy_score, roc_auc_score
import re
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import make_pipeline
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the pre-processed train and test tables; the first CSV row is the header.
train, test = (
    pd.read_csv(csv_path, header=0)
    for csv_path in ('../data/train_processed.csv', '../data/test_processed.csv')
)

# Quick sanity check: (rows, columns) of each table.
print(*(frame.shape for frame in (train, test)))

In [None]:
# Explained-variance profile of the raw (unscaled) features.
# The 'Machine failure' target is dropped first: PCA is meant to describe the
# predictors only, and leaving the label in would let it contribute to the
# components and to the variance curve below.
data = train.drop(columns=['Machine failure'])
pca = PCA().fit(data)
explained_variance = pca.explained_variance_ratio_
cumulative_explained_variance = np.cumsum(explained_variance)

# Cumulative share of variance captured by the first k components (k on the x axis).
plt.figure(figsize=(8, 6))
plt.plot(range(1, len(cumulative_explained_variance) + 1), cumulative_explained_variance, marker='o', linestyle='-')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Explained Variance vs. Number of Components')
plt.grid()
plt.show()


In [None]:
# Standardise every column of `data` (zero mean, unit variance) so that the
# next PCA run is not dominated by columns with large numeric ranges.
scaler = StandardScaler()
data = scaler.fit(data).transform(data)


In [None]:
# Refit a full PCA on the current contents of `data` and accumulate the
# per-component variance ratios into a running total.
pca = PCA().fit(data)
explained_variance = pca.explained_variance_ratio_
cumulative_explained_variance = explained_variance.cumsum()

# x axis: number of leading components kept; y axis: variance share they explain.
plt.figure(figsize=(8, 6))
plt.plot(
    np.arange(1, cumulative_explained_variance.size + 1),
    cumulative_explained_variance,
    'o-',
)
plt.title('Explained Variance vs. Number of Components')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.grid()
plt.show()

In [None]:
def apply_pca(data, n):
    """Project ``data`` onto its first ``n`` principal components.

    Each column is standardised (zero mean, unit variance), the covariance
    matrix of the standardised data is eigendecomposed, and the data are
    projected onto the eigenvectors with the largest eigenvalues.

    Args:
        data: 2-D array-like of shape (n_samples, n_features) with numeric
            values.
        n: Number of principal components to keep. A value larger than the
            number of features keeps every component.

    Returns:
        numpy.ndarray of shape (n_samples, min(n, n_features)); columns are
        ordered by decreasing eigenvalue (explained variance).

    Raises:
        ValueError: If ``data`` is not 2-D, has fewer than two rows, or
            ``n`` is smaller than 1.
    """
    values = np.asarray(data, dtype=float)
    if values.ndim != 2:
        raise ValueError("data must be 2-D with shape (n_samples, n_features)")
    if values.shape[0] < 2:
        raise ValueError("data must contain at least two samples")
    n_components = int(n)
    if n_components < 1:
        raise ValueError("n must be at least 1")

    # A constant column has zero spread; dividing by 1 instead keeps it at 0
    # after centring rather than turning the whole column into NaN.
    spread = np.std(values, axis=0)
    spread[spread == 0] = 1.0
    data_normalized = (values - np.mean(values, axis=0)) / spread

    # The covariance matrix is symmetric, so eigh is the appropriate solver:
    # it returns real eigenvalues/eigenvectors (eig may return complex ones).
    cov_matrix = np.atleast_2d(np.cov(data_normalized, rowvar=False))
    eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

    # eigh sorts ascending; pick the indices of the n largest eigenvalues.
    order = np.argsort(eigenvalues)[::-1][:n_components]
    top_eigenvectors = eigenvectors[:, order]

    # Project the standardised data onto the selected eigenvectors.
    return data_normalized @ top_eigenvectors

In [None]:
# Target vector: the 'Machine failure' column as a plain array.
y = train['Machine failure'].values
# Feature matrix: every remaining column of the training table.
X = train.drop(columns=['Machine failure'])

In [None]:
# Hold out 30% of the rows for validation; the seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

In [None]:
# Learn the scaling statistics from the training partition only, then apply
# the same transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
# Note: despite the name, this holds the scaled validation split (X_val).
X_test_scaled = scaler.transform(X_val)

In [None]:
# Keep the first 125 principal components. random_state is pinned (matching
# the seed used for the split and the random forest) so the projection is
# reproducible in case scikit-learn selects a randomized SVD solver.
pca = PCA(n_components=125, random_state=42)
# Fit on the scaled training partition only, then reuse the fitted projection
# for the held-out partition.
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

In [None]:
# Baseline classifier trained on the PCA-projected training data
# (fit returns the estimator itself, so construction and fitting are chained).
rfc = RandomForestClassifier(random_state=42).fit(X_train_pca, y_train)

In [None]:
# Score the random forest on the held-out (validation) projection.
rfc_predictions = rfc.predict(X_test_pca)

# Print the per-class report first, then the confusion matrix.
for summarize in (classification_report, confusion_matrix):
    print(summarize(y_val, rfc_predictions))

In [None]:
# Recursive feature elimination with the random forest as the ranking
# estimator, keeping 100 of the PCA components.
rfe = RFE(rfc, n_features_to_select=100)
rfe.fit(X_train_pca, y_train)

# rfe.support_ holds one flag per column of X_train_pca, i.e. per principal
# component. Those columns are not the original columns of X, so the mask is
# applied to component labels (PC1, PC2, ...) rather than to X.columns, which
# would mislabel the selection or fail when the widths differ.
component_names = pd.Index([f'PC{i}' for i in range(1, X_train_pca.shape[1] + 1)])
selected_featuresRFE_reg = component_names[rfe.support_]
print('Selected Features:', len(selected_featuresRFE_reg))
print(selected_featuresRFE_reg)

In [None]:
# Define LightGBM parameters
hyper  = {'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 300, 'num_leaves': 50}

# Create and fit a LightGBM model on the PCA-projected training data
light = lgb.LGBMClassifier(**hyper)
light.fit(X_train_pca, y_train)

# Feature selection using SelectFromModel: keep the components whose
# importance is at least the median importance
threshold = 'median'
feature_selector = SelectFromModel(light, threshold=threshold)
model = feature_selector.fit(X_train_pca, y_train)

# Get the selected components and the boolean mask that marks them
X_train_selected = model.transform(X_train_pca)
selected_features_mask = feature_selector.get_support()

# Get feature importances from the model fitted above
feature_importance = light.feature_importances_
# The model was trained on principal components, so each importance belongs to
# a component (PC1, PC2, ...), not to an original column of X; labelling them
# with X.columns would pair importances with the wrong names.
feature_names = pd.Index([f'PC{i}' for i in range(1, X_train_pca.shape[1] + 1)])
# Show the component labels alongside their importances
print(feature_names, feature_importance)