In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import joblib

# Read the processed dataset; every column named below is expected to exist in it.
data_with_features = pd.read_csv('processed_data_with_features.csv')

# Columns fed into PCA, listed one per line so additions/removals diff cleanly.
# The order here is the column order of X (and therefore of the PCA loadings).
features_to_use = [
    'word_count',
    'char_count',
    'sentence_count',
    'NOUN',
    'VERB',
    'ADJ',
    'ADV',
    'readability_score',
    'ttr',
    'lexical_density',
    'avg_word_length',
    'stop_word_ratio',
    'pos_diversity',
    'unique_word_ratio',
    'word_entropy',
    'gunning_fog',
    'smog_index',
    'bigram_count',
    'trigram_count',
    'semantic_density',
    'avg_sentence_complexity',
    'burstiness',
]

# X: the PCA inputs (all rows, selected columns); y: the 'label' column.
X = data_with_features.loc[:, features_to_use]
y = data_with_features.loc[:, 'label']

# Rescale each feature with StandardScaler before PCA so that columns measured
# on larger numeric ranges do not dominate the components.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

# Fit PCA with n_components left at its default so every component is kept
# and the complete explained-variance spectrum can be inspected.
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# Running total of the per-component explained-variance ratios.
cumulative_variance = pca.explained_variance_ratio_.cumsum()

# Cumulative explained-variance curve (x axis is 1-based component count)
# with a horizontal reference line at 0.95.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(
    range(1, len(cumulative_variance) + 1),
    cumulative_variance,
    marker='o',
    linestyle='--',
)
ax.axhline(y=0.95, color='r', linestyle='--', label='95% threshold')
ax.set_xlabel('Number of Components')
ax.set_ylabel('Cumulative Explained Variance')
ax.set_title('PCA Explained Variance')
ax.legend()
ax.grid()
plt.show()

# Smallest number of components whose cumulative ratio reaches 0.95.
# argmax on a boolean array yields the index of the first True; adding 1
# turns that zero-based index into a component count.
optimal_components = (cumulative_variance >= 0.95).argmax() + 1
print(f"Optimal number of components to retain 95% explained variance: {optimal_components}")

In [None]:
# pca.components_ has one row per component; transposing gives a table with
# one row per input feature and one column per principal component.
loadings = pca.components_
feature_names = X.columns
loading_matrix = pd.DataFrame(
    loadings.T,
    index=feature_names,
    columns=[f"PC{k}" for k in range(1, loadings.shape[0] + 1)],
)

# Importance of a feature = sum of its absolute loadings over the first
# `optimal_components` components, listed from most to least important.
feature_importance = (
    loading_matrix.iloc[:, :optimal_components]
    .abs()
    .sum(axis=1)
    .sort_values(ascending=False)
)

# Show the ranking
print("\nFeature importance based on contribution to top components:")
print(feature_importance)

# A feature counts as "low contributing" when its importance is strictly
# below the mean importance across all features.
low_contributing_features = (
    feature_importance.loc[feature_importance < feature_importance.mean()]
    .index
    .tolist()
)
print(f"\nFeatures with minimal contributions to top {optimal_components} components: {low_contributing_features}")


In [None]:
# Keep every feature that was not flagged as low contributing, preserving
# the original ordering of `features_to_use`.
updated_features = list(
    filter(lambda name: name not in low_contributing_features, features_to_use)
)


In [None]:
# Use the updated feature list
X_updated = data_with_features[updated_features]

# Standardize the updated features. This refits the existing `scaler` object
# in place, so from here on it holds statistics for `updated_features` only
# (it is this refitted scaler that the save cell writes to disk).
X_scaled_updated = scaler.fit_transform(X_updated)

# `optimal_components` was derived from the full feature set. After pruning,
# fewer columns may remain than that count, and PCA rejects n_components
# larger than min(n_samples, n_features). Clamp to what the data can support.
n_samples_updated, n_features_updated = X_scaled_updated.shape
n_components_updated = int(min(optimal_components, n_samples_updated, n_features_updated))
if n_components_updated < optimal_components:
    print(
        f"Requested {optimal_components} components, but the reduced data only supports "
        f"{n_components_updated}; using {n_components_updated}."
    )

# Apply PCA with the (possibly clamped) optimal component count.
# random_state pins the result when scikit-learn's automatic solver choice
# falls on a randomized SVD; it has no effect on the deterministic solvers.
pca_updated = PCA(n_components=n_components_updated, random_state=42)
X_pca_updated = pca_updated.fit_transform(X_scaled_updated)

# Print updated explained variance
explained_variance_updated = pca_updated.explained_variance_ratio_
print(f"Updated explained variance by principal components: {explained_variance_updated}")
print(f"Total updated explained variance: {np.sum(explained_variance_updated)}")


In [None]:
import joblib

# Wrap the reduced matrix in a DataFrame with columns PC1..PCn, attach the
# labels, and write the result to CSV without the index column.
# Note: the label assignment aligns `y` to the frame by index; both use the
# default 0..n-1 index when the notebook is run top to bottom.
pc_column_names = [f"PC{k}" for k in range(1, X_pca_updated.shape[1] + 1)]
X_pca_df = pd.DataFrame(X_pca_updated, columns=pc_column_names)
X_pca_df['label'] = y
X_pca_df.to_csv('pca_transformed_data.csv', index=False)
print("PCA-transformed data saved as 'pca_transformed_data.csv'.")

# Persist the fitted PCA and the scaler as a matching pair of pickle files.
for fitted_object, pickle_path in (
    (pca_updated, 'updated_pca_model.pkl'),
    (scaler, 'updated_scaler.pkl'),
):
    joblib.dump(fitted_object, pickle_path)
print("PCA model and scaler saved.")


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import joblib
import matplotlib.pyplot as plt

# Step 1: Read the processed dataset again. This rebinds `data_with_features`,
# `features_to_use`, `X` and `y` for the rest of this cell.
data_with_features = pd.read_csv('processed_data_with_features.csv')

# Columns fed into the 2-component PCA, one per line.
# NOTE(review): unlike the feature list in the first cell, this one omits
# 'semantic_density', 'avg_sentence_complexity' and 'burstiness' -- confirm
# that the difference is intentional.
features_to_use = [
    'word_count',
    'char_count',
    'sentence_count',
    'NOUN',
    'VERB',
    'ADJ',
    'ADV',
    'readability_score',
    'ttr',
    'lexical_density',
    'avg_word_length',
    'stop_word_ratio',
    'pos_diversity',
    'unique_word_ratio',
    'word_entropy',
    'gunning_fog',
    'smog_index',
    'bigram_count',
    'trigram_count',
]

# X: the PCA inputs (all rows, selected columns); y: the 'label' column.
X = data_with_features.loc[:, features_to_use]
y = data_with_features.loc[:, 'label']

# Step 2: Standardize the features with a fresh scaler fitted on this
# cell's feature set.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 3: Apply PCA to reduce to the top 2 principal components.
# With so few components requested, scikit-learn's automatic solver choice
# can fall on a randomized SVD (depending on library version and data shape),
# which makes repeated runs differ slightly. Fixing random_state keeps the
# saved model, CSV and plot reproducible; it is ignored by the deterministic
# solvers.
pca = PCA(n_components=2, random_state=42)
X_pca_top2 = pca.fit_transform(X_scaled)

# Explained variance by the top 2 components (per component, then total)
explained_variance = pca.explained_variance_ratio_
print(f"Explained variance by top 2 principal components: {explained_variance}")
print(f"Total explained variance (top 2 components): {np.sum(explained_variance):.4f}")

# Step 4: Inspect how strongly each feature loads on the two components.
# pca.components_ has one row per component; transpose so that rows are
# features and the two columns are PC1 and PC2.
loadings = pca.components_
feature_names = X.columns
pc_loadings = pd.DataFrame(
    loadings.T,
    index=feature_names,
    columns=['PC1', 'PC2'],
)

# Five features with the largest absolute loading on each component
# (the values kept are the absolute loadings, so the sign is dropped).
top_features_pc1 = pc_loadings['PC1'].abs().sort_values(ascending=False).iloc[:5]
top_features_pc2 = pc_loadings['PC2'].abs().sort_values(ascending=False).iloc[:5]

print("\nTop contributing features to PC1:")
print(top_features_pc1)

print("\nTop contributing features to PC2:")
print(top_features_pc2)

# Step 5: Write the 2-D projection plus labels to CSV (index column omitted).
# The label assignment aligns `y` to the frame by index; both use the default
# 0..n-1 index here because `y` comes straight from the CSV read above.
X_pca_top2_df = pd.DataFrame(X_pca_top2, columns=['PC1', 'PC2'])
X_pca_top2_df['label'] = y
X_pca_top2_df.to_csv('pca_top2_transformed_data.csv', index=False)
print("PCA-transformed data (top 2 components) saved as 'pca_top2_transformed_data.csv'.")

# Step 6: Persist the fitted PCA and its scaler as a matching pair of pickles.
for fitted_object, pickle_path in (
    (pca, 'pca_top2_model.pkl'),
    (scaler, 'scaler_top2.pkl'),
):
    joblib.dump(fitted_object, pickle_path)
print("PCA model and scaler for top 2 components saved.")

# Step 7: Scatter the samples in the PC1/PC2 plane, coloured by label.
# NOTE(review): passing `y` as `c` together with a colormap presumably
# assumes numeric labels -- confirm against the dataset.
fig, ax = plt.subplots(figsize=(10, 7))
scatter = ax.scatter(
    X_pca_top2[:, 0],
    X_pca_top2[:, 1],
    c=y,
    cmap='coolwarm',
    alpha=0.6,
)
fig.colorbar(scatter, ax=ax, label='Label')
ax.set_title('Top 2 PCA Components')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.grid()
# Save before show(): the figure is written to disk while it is still open.
fig.savefig('pca_top2_plot.png')
plt.show()
