In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt

# Pipeline: load the USEEIO data, standardize the numeric features, balance the
# classes with SMOTE, then reduce dimensionality with PCA while retaining a
# chosen share of the total variance.

# Load the USEEIO dataset (replace 'useeio.csv' with the actual path to your copy)
df = pd.read_csv('useeio.csv')

# Name of the class-label column; replace 'target' with the actual column name
# in your dataset
target = 'target'

# Feature selection: every float/int column except the class label
features = df.drop(columns=[target]).select_dtypes(include=[float, int]).columns
x = df[features].values
y = df[target].values

# Standardizing the features so that no single large-scale feature dominates
# the neighbour distances used by SMOTE or the variance captured by PCA
scaler = StandardScaler()
x_standardized = scaler.fit_transform(x)

# Applying SMOTE to handle class imbalance.
# SMOTE interpolates between a sample and its k nearest same-class neighbours,
# so k must be smaller than the size of the rarest class. Cap the library
# default of 5 accordingly; classes with more than 5 samples are unaffected.
smallest_class_size = int(df[target].value_counts().min())
k_neighbors = max(1, min(5, smallest_class_size - 1))
smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
x_resampled, y_resampled = smote.fit_resample(x_standardized, y)

# Checking the class distribution before and after applying SMOTE
print("Class distribution before SMOTE:")
print(df[target].value_counts())
print("Class distribution after SMOTE:")
print(pd.Series(y_resampled).value_counts())

# Converting resampled data back to DataFrame for further processing
df_resampled = pd.DataFrame(data=x_resampled, columns=features)
df_resampled[target] = y_resampled

# Applying PCA with all components kept, to inspect the variance spectrum
pca = PCA()
principal_components = pca.fit_transform(x_resampled)

# Creating a DataFrame with the principal components.
# PCA keeps min(n_samples, n_features) components, which can be fewer than
# len(features); label the columns from the actual width of the output.
pca_df = pd.DataFrame(
    data=principal_components,
    columns=[f'PC{i+1}' for i in range(principal_components.shape[1])],
)
pca_df[target] = y_resampled

# Variance Explained by Each Principal Component
explained_variance_ratio = pca.explained_variance_ratio_

# Plotting the explained variance ratio
plt.figure(figsize=(10, 6))
plt.plot(
    range(1, len(explained_variance_ratio) + 1),
    explained_variance_ratio,
    marker='o',
    linestyle='--',
)
plt.xlabel('Principal Component')
plt.ylabel('Explained Variance Ratio')
plt.title('Explained Variance Ratio by Principal Components')
plt.show()

# Selection of Principal Components
# Share of the total variance that the retained components must explain
variance_threshold = 0.95
cumulative_explained_variance = explained_variance_ratio.cumsum()
# Smallest number of leading components whose cumulative variance reaches the
# threshold. Fall back to all components if it is never reached (e.g. NaN
# ratios) instead of failing with a bare StopIteration.
n_components = next(
    (
        index + 1
        for index, cumulative_variance in enumerate(cumulative_explained_variance)
        if cumulative_variance >= variance_threshold
    ),
    len(cumulative_explained_variance),
)

print(f'Number of components selected to retain {variance_threshold:.0%} variance: {n_components}')

# Projection onto Selected Principal Components
# Refit so that `pca` itself is the reduced model
pca = PCA(n_components=n_components)
x_reduced = pca.fit_transform(x_resampled)

# Creating a DataFrame with the reduced dimensions
reduced_df = pd.DataFrame(
    data=x_reduced,
    columns=[f'PC{i+1}' for i in range(x_reduced.shape[1])],
)
reduced_df[target] = y_resampled

# Displaying the reduced DataFrame
print(reduced_df.head())
