In [1]:
import pandas as pd
df = pd.read_pickle("Main_Director_COMPLETE.pkl")

In [2]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
skillset_cols = [
    'Academic','combined_manufacturing_supply_chain',
    'combined_sustainability', 'combined_entrepreneurial', 'combined_compensation',
    'combined_governance', 'combined_government_policy', 'combined_international',
    'combined_legal','combined_leadership_outside_board',
    'combined_marketing', 'combined_risk_management', 'combined_scientific',
    'combined_strategic_planning', 'combined_conglomerate_experience',
    'combined_hr', 'combined_technology', 'combined_finance_accounting','Company Business'
]
X = df[skillset_cols].copy()
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

pca_full = PCA()
X_pca_full = pca_full.fit_transform(X_scaled)

plt.figure(figsize=(10, 5))
sns.lineplot(
    x=range(1, len(pca_full.explained_variance_ratio_) + 1),
    y=pca_full.explained_variance_ratio_,
    marker="o"
)
plt.title("Scree Plot - Explained Variance by Principal Component")
plt.xlabel("Principal Component")
plt.ylabel("Proportion of Variance Explained")
plt.xticks(range(1, len(pca_full.explained_variance_ratio_) + 1))
plt.grid(True)
plt.tight_layout()
plt.show()

pca = PCA(n_components=1)
principal_component = pca.fit_transform(X_scaled)

df['SkillsetIndex'] = principal_component[:, 0]
df['SkillsetIndex'] = (df['SkillsetIndex'] - df['SkillsetIndex'].mean()) / df['SkillsetIndex'].std()

median_value = df['SkillsetIndex'].median()
df['SkillsetGeneralistDummy'] = (df['SkillsetIndex'] > median_value).astype(int)

loadings = pd.DataFrame(
    pca_full.components_.T,
    columns=[f'PC{i+1}' for i in range(len(skillset_cols))],
    index=skillset_cols
)

KeyError: "['combined_leadership_outside_board'] not in index"

In [None]:
pc1_loadings = loadings[['PC1']].copy()
pc1_loadings['Skillset Feature'] = pc1_loadings.index
pc1_sorted = pc1_loadings.sort_values('PC1', ascending=False)


In [None]:
plt.figure(figsize=(12, 6))
sns.barplot(x='PC1', y='Skillset Feature', data=pc1_sorted, palette='coolwarm')
plt.title('PC1 Feature Loadings', fontsize=14)
plt.xlabel('Loading Value')
plt.ylabel('Skillset Feature')
plt.axvline(0, color='black', linewidth=0.8)
plt.tight_layout()
plt.show()


In [None]:
eigenvalue_pc1 = pca_full.explained_variance_[0]
explained_variance_ratio_pc1 = pca_full.explained_variance_ratio_[0]
cumulative_explained_variance_pc1 = pca_full.explained_variance_ratio_.cumsum()[0]

print(f"Eigenvalue for PC1: {eigenvalue_pc1:.3f}")
print(f"Explained Variance Ratio for PC1: {explained_variance_ratio_pc1 * 100:.2f}%")
print(f"Cumulative Explained Variance Ratio up to PC1: {cumulative_explained_variance_pc1 * 100:.2f}%")

loadings_pc1 = loadings['PC1'].sort_values(key=lambda x: abs(x), ascending=False)
print("\nTop PC1 Loadings:")
print(loadings_pc1.round(3).head(21))


In [None]:
eigenvalues_df = pd.DataFrame(
    {'Principal Component': [f'PC{i+1}' for i in range(len(skillset_cols))],
     'Eigenvalue': pca_full.explained_variance_}
)

explained_variance_ratio_df = pd.DataFrame(
    {'Principal Component': [f'PC{i+1}' for i in range(len(skillset_cols))],
     'Explained Variance Ratio (%)': pca_full.explained_variance_ratio_ * 100}
)

cumulative_explained_variance_df = pd.DataFrame(
    {'Principal Component': [f'PC{i+1}' for i in range(len(skillset_cols))],
     'Cumulative Explained Variance Ratio (%)': pca_full.explained_variance_ratio_.cumsum() * 100}
)

loadings_df = pd.DataFrame(
    pca_full.components_.T,
    columns=[f'PC{i+1}' for i in range(len(skillset_cols))],
    index=skillset_cols
)

pca = PCA(n_components=1)
principal_component = pca.fit_transform(X_scaled)

df['SkillsetIndex'] = principal_component[:, 0]
df['SkillsetIndex'] = (df['SkillsetIndex'] - df['SkillsetIndex'].mean()) / df['SkillsetIndex'].std()

median_value = df['SkillsetIndex'].median()
df['SkillsetGeneralistDummy'] = (df['SkillsetIndex'] > median_value).astype(int)

In [None]:
eigenvalues_df 

In [None]:
explained_variance_ratio_df

In [None]:
cumulative_explained_variance_df

In [None]:
factor_scores = pd.DataFrame(
    X_pca_full,
    columns=[f'PC{i+1}' for i in range(X_pca_full.shape[1])],
    index=df.index 
)

df['PC1_FactorScore'] = factor_scores['PC1']


In [None]:
df['PC1_FactorScore'].to_csv("Factor_score.csv")

In [None]:
df['PC1_FactorScore']

In [None]:
df.count()

In [None]:
df['PC1_FactorScore'].describe()


In [None]:
df.describe()

In [None]:
df

In [None]:
df['PC1_FactorScore'].mean()



In [None]:
df['PC1_FactorScore'].std()

In [None]:
df['PC1_FactorScore_Standardised'] = (
    df['PC1_FactorScore'] - df['PC1_FactorScore'].mean()
) / df['PC1_FactorScore'].std()


In [None]:
df['PC1_FactorScore_Standardised'].mean()
df['PC1_FactorScore_Standardised'].std()


In [None]:
df

In [None]:
df['PC1_FactorScore_Standardised'].describe()

In [None]:
df['SkillsetIndex'] = (df['SkillsetIndex'] - df['SkillsetIndex'].mean()) / df['SkillsetIndex'].std()


In [None]:
df['SkillsetIndex'] 

In [None]:
print(df.columns.tolist())

In [None]:
df = df.drop(columns=["Unnamed: 0"])


In [None]:
df.to_csv("Main_Director_Complete_Skillsets_PCA.csv")

In [None]:
df.head(20)

In [None]:
df2 = pd.read_csv("Main_Firm_Fin_Loc_with_Absence.csv")

In [None]:
df2

In [None]:
df2.columns.tolist()

In [None]:
df_avg = df.groupby(['Symbol', 'AsOnDate'])['PC1_FactorScore_Standardised'].mean().reset_index()

df_avg.rename(columns={'PC1_FactorScore_Standardised': 'FactorScore_avg'}, inplace=True)

df2 = df2.merge(df_avg, on=['Symbol', 'AsOnDate'], how='left')

In [None]:
df_avg.head(50)

In [None]:
df2.head(60)

In [None]:
df2['FactorScore_avg'].describe()

In [None]:
df2.describe()

In [None]:
df_avg['FactorScore_avg'].describe()

In [None]:
df2.columns.tolist()

In [None]:

df2['AsOnYear'] = pd.to_numeric(df2['AsOnYear'], errors='coerce')

df2_filtered = df2[df2['AsOnYear'] >= 2015]




In [None]:
df2_filtered

In [None]:
df2_filtered.describe()

In [None]:
df2_filtered['FactorScore_avg'].describe()

In [None]:
firm_year_count = df2_filtered[['Company', 'AsOnYear']].drop_duplicates().shape[0]
print("Unique firm-year combinations:", firm_year_count)


In [None]:
unique_firm_years = df2_filtered[['Company', 'AsOnYear', 'FactorScore_avg']].drop_duplicates(subset=['Company', 'AsOnYear'])

unique_firm_years['FactorScore_avg'].describe()

