# PCA Analysis

This notebook performs Principal Component Analysis (PCA) on the processed data extracted from PDF files. It includes steps for loading the data, standardizing the features (zero mean, unit variance), performing PCA, and visualizing the results.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Location of the processed CSV, relative to this notebook.
data_path = '../src/processed_data.csv'  # Update with the correct path to your processed data

# Read the CSV into the DataFrame used by the rest of the notebook.
df = pd.read_csv(filepath_or_buffer=data_path)

# Preview the first five rows as a quick sanity check of the load.
df.head(5)

In [None]:
# Separate the identifier column from the feature columns.
# The identifier is selected by NAME, so that feature selection and the label
# join further down always refer to the same column regardless of where it sits
# in the CSV. Later cells also reference this column by name.
id_column = 'country'  # Update if your identifier column has a different name
if id_column not in df.columns:
    raise KeyError(f"Identifier column '{id_column}' not found; available columns: {list(df.columns)}")

# Every remaining column is treated as a feature. They must all be numeric;
# exclude any other non-feature columns here as well.
features = df.columns.drop(id_column)

# Standardize the features (zero mean, unit variance) so that columns measured
# on larger scales do not dominate the principal components.
x = df[features].values
x = StandardScaler().fit_transform(x)

# Perform PCA
pca = PCA(n_components=2)  # Change the number of components as needed
principal_components = pca.fit_transform(x)

# Create a DataFrame with the principal components. It reuses df's index so the
# column-wise concat below pairs each projected row with its own identifier
# even when df does not carry the default 0..n-1 index.
principal_df = pd.DataFrame(
    data=principal_components,
    columns=['Principal Component 1', 'Principal Component 2'],
    index=df.index,
)
final_df = pd.concat([principal_df, df[[id_column]]], axis=1)

# Display the principal components DataFrame
final_df.head()

In [None]:
# Plot the fraction of total variance captured by each fitted component.
variance_ratios = pca.explained_variance_ratio_
component_numbers = range(1, len(variance_ratios) + 1)  # 1-based component labels

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(component_numbers, variance_ratios, marker='o')
ax.set_title('Explained Variance by Principal Components')
ax.set_xlabel('Principal Component')
ax.set_ylabel('Variance Explained')
ax.set_xticks(component_numbers)  # one tick per component, no fractional ticks
ax.grid()
plt.show()

In [None]:
# Scatter plot of the samples projected onto the first two principal components
plt.figure(figsize=(10, 6))
pc1 = final_df['Principal Component 1']
pc2 = final_df['Principal Component 2']
plt.scatter(pc1, pc2, alpha=0.7)

# Label every point with its identifier. The three columns are walked in
# lockstep (positionally) instead of looking rows up with an enumerate()
# counter, which is only correct when final_df has the default 0..n-1 index.
for label, pc1_value, pc2_value in zip(final_df['country'], pc1, pc2):
    plt.annotate(label, (pc1_value, pc2_value), fontsize=8)

plt.title('PCA of Extracted Data')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.grid()
plt.show()