In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import ace_tools as tools

ModuleNotFoundError: No module named 'ace_tools'

In [None]:


# # 1. Parse info_about_data.txt for category average revisions
# with open('/mnt/data/info_about_data.txt', 'r') as f:
#     lines = f.read().splitlines()
#
# cat_lines = [ln for ln in lines if re.match(r'^\d+\s+.+\s+\d+\.\d+$', ln)]
# cats = []
# for ln in cat_lines:
#     parts = ln.split()
#     avg = float(parts[-1])
#     name = ' '.join(parts[1:-1])
#     cats.append({'category': name, 'avg_revisions_per_article': avg})
# df_cats = pd.DataFrame(cats)
# tools.display_dataframe_to_user('Category average revisions', df_cats)

# Table display helper.
# The `import ace_tools as tools` line raised ModuleNotFoundError, so the name
# `tools` may be undefined here. Use it when present; otherwise fall back to
# plain printing so the analysis does not die on the first table.
try:
    show_table = tools.display_dataframe_to_user
except NameError:
    def show_table(title, table):
        """Print ``table`` under a ``title`` banner (fallback when ace_tools is missing)."""
        print(f'\n=== {title} ===')
        print(table)

# 2. Load the revision-level DataFrame (update path as needed)
# Pick the reader from the file extension: the original code called
# read_parquet on a .csv path, which cannot parse a CSV file.
DATA_PATH = '/python_code/everything100percat.csv'
if DATA_PATH.lower().endswith('.csv'):
    df = pd.read_csv(DATA_PATH)
else:
    df = pd.read_parquet(DATA_PATH)

# 3. Identify numeric feature columns (excluding metadata/int identifiers)
meta_cols = ['Unnamed: 0', 'snapshot_ts', 'rev_id', 'timestamp', 'user',
             'is_bot', 'content', 'article_id', 'title', 'root', 'stratum', 'plain_text']
# select_dtypes('number') covers every int/float width (not only int64/float64)
# and excludes bool/object columns.
feature_cols = [c for c in df.select_dtypes(include='number').columns
                if c not in meta_cols]
if not feature_cols:
    raise ValueError('No numeric feature columns found after excluding metadata columns.')

# 4. Descriptive statistics for all features
desc_stats = df[feature_cols].describe().T
show_table('Feature descriptive statistics', desc_stats)

# 5. Correlation heatmap of features
corr = df[feature_cols].corr()
plt.figure(figsize=(12, 10))
# Fix the colour range to [-1, 1] so the diverging colormap is centred on 0.
plt.imshow(corr, cmap='coolwarm', aspect='auto', vmin=-1, vmax=1)
plt.colorbar(label='Correlation')
plt.xticks(range(len(feature_cols)), feature_cols, rotation=90)
plt.yticks(range(len(feature_cols)), feature_cols)
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

# 6. PCA for dimensionality reduction
# PCA and t-SNE reject NaN/inf, so keep only rows where every feature is finite.
features_clean = df[feature_cols].replace([np.inf, -np.inf], np.nan).dropna()
n_dropped = len(df) - len(features_clean)
if n_dropped:
    print(f'Dropped {n_dropped} rows with missing/non-finite feature values before PCA/t-SNE.')
if features_clean.empty:
    raise ValueError('No rows with complete numeric features; cannot run PCA/t-SNE.')

scaler = StandardScaler()
X_scaled = scaler.fit_transform(features_clean)

# PCA cannot extract more components than min(n_samples, n_features).
n_components = min(5, X_scaled.shape[0], X_scaled.shape[1])
pc_labels = [f'PC{i+1}' for i in range(n_components)]
pca = PCA(n_components=n_components, random_state=42)
X_pca = pca.fit_transform(X_scaled)
loadings = pd.DataFrame(pca.components_.T,
                        index=feature_cols,
                        columns=pc_labels)
explained_var = pd.Series(pca.explained_variance_ratio_,
                          index=pc_labels)

show_table('PCA Loadings', loadings)
# The display helper expects a DataFrame, so wrap the Series for display only.
show_table('PCA Explained Variance Ratio',
           explained_var.to_frame('explained_variance_ratio'))

# 7. t-SNE embedding for 2D visualization
# Perplexity must be strictly smaller than the number of samples; 30 is the
# scikit-learn default and is kept whenever the data set is large enough.
perplexity = min(30.0, max(1.0, X_scaled.shape[0] - 1))
tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42)
X_tsne = tsne.fit_transform(X_scaled)
# Keep the surviving row labels so the embedding can be joined back onto `df`.
df_tsne = pd.DataFrame(X_tsne, columns=['TSNE1', 'TSNE2'], index=features_clean.index)
show_table('t-SNE Embedding (first 1000 points)', df_tsne.head(1000))

plt.figure(figsize=(8, 6))
plt.scatter(df_tsne['TSNE1'], df_tsne['TSNE2'], s=5, alpha=0.6)
plt.title('t-SNE 2D Embedding of Revisions')
plt.xlabel('TSNE1')
plt.ylabel('TSNE2')
plt.tight_layout()
plt.show()
