Libraries

In [None]:
import pandas as pd
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import davies_bouldin_score
import seaborn as sns

In [None]:
# Load the pre-cleaned dataset from the CSV file next to the notebook.
file_path = 'cleaned_data.csv'
df = pd.read_csv(file_path)

# Show the dimensions and a short preview instead of printing the whole
# frame: the last expression of a cell is rendered as a rich table.
print(df.shape)
df.head()

In [None]:
# Collect the column labels of the loaded frame as a plain Python list
# and echo them under a header line.
feature_names = df.columns.tolist()
print("Feature names:", feature_names, sep="\n")

In [None]:
# Histogram of the daily transaction count column (50 bins), drawn through
# the explicit Axes interface.
threshold_column = 'dailyNumberOfTransactions'
fig, ax = plt.subplots()
ax.hist(df[threshold_column], bins=50, color='blue', alpha=0.7)
ax.set_title(f'Distribution of {threshold_column}')
ax.set_xlabel(threshold_column)
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Histogram of the USD balance column (50 bins), drawn through the
# explicit Axes interface.
threshold_column = 'balanceInUSD'
fig, ax = plt.subplots()
ax.hist(df[threshold_column], bins=50, color='blue', alpha=0.7)
ax.set_title(f'Distribution of {threshold_column}')
ax.set_xlabel(threshold_column)
ax.set_ylabel('Frequency')
plt.show()


In [None]:
# Histogram of the USD borrow column (50 bins), drawn through the
# explicit Axes interface.
threshold_column = 'borrowInUSD'
fig, ax = plt.subplots()
ax.hist(df[threshold_column], bins=50, color='blue', alpha=0.7)
ax.set_title(f'Distribution of {threshold_column}')
ax.set_xlabel(threshold_column)
ax.set_ylabel('Frequency')
plt.show()

Prepare for clustering

In [None]:
# Columns fed into the clustering pipeline.
feature_columns = ['balanceInUSD', 'borrowInUSD', 'dailyNumberOfTransactions']
features = df[feature_columns]

# Pairwise correlations of the raw (unscaled) features, for inspection.
print("Correlation matrix:\n", features.corr())

# Standardise each feature (zero mean, unit variance) so that no single
# column dominates the principal components.
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Project the standardised features onto 2 principal components; the
# 2-D result is what the clustering cells and scatter plot consume.
pca = PCA(n_components=2)
features_reduced = pca.fit_transform(features_scaled)

Elbow Method

In [None]:
# Elbow method: fit K-Means for k = 1..10 on the PCA-reduced features and
# record the inertia (within-cluster sum of squared distances) of each fit.
cluster_counts = range(1, 11)
inertia = []
for k in cluster_counts:
    # random_state pins the centroid initialisation so the curve is the same
    # on every Restart & Run All; n_init is set explicitly because its
    # default differs between scikit-learn releases.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(features_reduced)
    inertia.append(kmeans.inertia_)

plt.plot(cluster_counts, inertia)
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()

Davies-Bouldin Index


In [None]:
# Score K-Means partitions for k = 2..20 with the Davies-Bouldin index on
# the PCA-reduced features; the k with the lowest score is reported.
range_n_clusters = list(range(2, 21))
db_scores = []

for n_clusters in range_n_clusters:
    # random_state pins the centroid initialisation so the scores (and the
    # selected k) are reproducible; n_init is set explicitly because its
    # default differs between scikit-learn releases.
    clusterer = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    cluster_labels = clusterer.fit_predict(features_reduced)

    # Compute the Davies-Bouldin score
    score = davies_bouldin_score(features_reduced, cluster_labels)
    db_scores.append(score)
    # The original message ended with an unmatched ")"; removed.
    print("For n_clusters = {}, Davies-Bouldin Index is {}".format(n_clusters, score))

# Find the number of clusters with the lowest Davies-Bouldin score
optimal_clusters = range_n_clusters[db_scores.index(min(db_scores))]
print("Optimal number of clusters:", optimal_clusters)

K-means


In [None]:
# Final clustering of the PCA-reduced features.
# NOTE(review): k is hard-coded and not tied to `optimal_clusters` from the
# Davies-Bouldin cell -- confirm that 6 is the intended choice.
n_clusters = 6
# random_state pins the centroid initialisation: without it the cluster ids
# stored in df['group'] (and every per-group label derived from them) change
# on each run. n_init is set explicitly because its default differs between
# scikit-learn releases.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
clusters = kmeans.fit_predict(features_reduced)

# Add the cluster information to the original DataFrame
df['group'] = clusters

# Count the number of wallets per group
print("Number of wallets in each group:\n", df['group'].value_counts())

# Scatter of the two PCA components, coloured by assigned group
plt.figure(figsize=(8, 6))
plt.scatter(features_reduced[:, 0], features_reduced[:, 1], c=clusters, cmap='viridis')
plt.xlabel('PCA Feature 1')
plt.ylabel('PCA Feature 2')
plt.title('Wallet group with k=%d' % n_clusters)
plt.show()

# Bar chart of group sizes (value_counts orders bars by size, largest first)
plt.figure(figsize=(8, 6))
df['group'].value_counts().plot(kind='bar', color='blue', alpha=0.7)
plt.title('Distribution of group')
plt.xlabel('group')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()

Whale wallet

In [None]:
# Flag "whale" wallets: a wallet is a whale when its balanceInUSD OR its
# dailyTransactionAmounts is strictly above the mean of its own group.
# NOTE(review): this reads 'dailyTransactionAmounts', while the clustering
# features use 'dailyNumberOfTransactions' -- confirm that the column exists
# in the CSV and is the intended one.
#
# groupby(...).transform('mean') broadcasts each group's mean back onto its
# rows, so the flag is computed in one vectorised comparison. This yields a
# proper boolean column (the previous per-group .loc assignment built an
# object column incrementally) and makes the cell idempotent.
group_means = df.groupby('group')[['balanceInUSD', 'dailyTransactionAmounts']].transform('mean')
df['is_whale'] = (
    (df['balanceInUSD'] > group_means['balanceInUSD'])
    | (df['dailyTransactionAmounts'] > group_means['dailyTransactionAmounts'])
)

# Number of whale wallets in each group (True counts as 1)
whale_counts = df.groupby('group')['is_whale'].sum()

# Plotting
plt.figure(figsize=(10, 6))
whale_counts.plot(kind='bar', color='blue', alpha=0.7)

# Add labels and title
plt.xlabel('Group')
plt.ylabel('Number of Whale Wallets')
plt.title('Whale Wallets in Each Group')

# Show the plot
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


Financial Health

In [None]:
def financial_health_within_group(df):
    """Label each wallet's financial health relative to its own group.

    A wallet is 'Good Health' when its ``balanceInUSD`` is at or above the
    group mean AND its ``borrowInUSD`` is at or below the group mean;
    otherwise it is 'At Risk'. The result is written in place to the
    ``financial_health_status`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``group``, ``balanceInUSD`` and
        ``borrowInUSD``. Modified in place.

    Returns
    -------
    None
    """
    # Broadcast each group's means back onto its rows so the comparison is a
    # single vectorised operation instead of a per-group loop with a
    # row-wise apply.
    group_means = df.groupby('group')[['balanceInUSD', 'borrowInUSD']].transform('mean')
    is_healthy = (
        (df['balanceInUSD'] >= group_means['balanceInUSD'])
        & (df['borrowInUSD'] <= group_means['borrowInUSD'])
    )
    status = is_healthy.map({True: 'Good Health', False: 'At Risk'})
    # Rows without a group have no reference mean; leave them unlabelled.
    df['financial_health_status'] = status.where(df['group'].notna())

# Add the 'financial_health_status' label to every wallet.
financial_health_within_group(df)

# Visualize financial health status in each group.
# The status is a categorical label, so count wallets per (group, status)
# pair. A violin plot needs a numeric value axis; with a string-valued y it
# would end up drawing the distribution of the cluster ids instead.
plt.figure(figsize=(10, 6))
sns.countplot(data=df, x='group', hue='financial_health_status')
plt.xlabel('Group')
plt.ylabel('Number of Wallets')
plt.title('Financial Health in Each Group')

# Show the plot
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


Activity Level

In [None]:
def activity_level_within_group(df):
    """Label each wallet's activity level relative to its own group.

    For every group the mean of ``dailyNumberOfTransactions`` is computed and
    two thresholds are derived from it: 0.5 x mean and 1.5 x mean. A wallet is

    * 'Level 1' when its value is <= 0.5 x mean,
    * 'Level 2' when its value is <= 1.5 x mean (and not 'Level 1'),
    * 'Level 3' otherwise.

    The result is written in place to the ``activity_level_status`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``group`` and ``dailyNumberOfTransactions``.
        Modified in place.

    Returns
    -------
    None
    """
    transactions = df['dailyNumberOfTransactions']
    # Per-row copy of the wallet's group mean (vectorised replacement for the
    # per-group loop with an element-wise apply).
    transactions_mean = df.groupby('group')['dailyNumberOfTransactions'].transform('mean')
    level_1_threshold = transactions_mean * 0.5
    level_2_threshold = transactions_mean * 1.5

    # Start from the fallback level and overwrite from the loosest to the
    # strictest condition, so 'Level 1' takes priority over 'Level 2'.
    levels = pd.Series('Level 3', index=df.index, dtype=object)
    levels = levels.mask(transactions <= level_2_threshold, 'Level 2')
    levels = levels.mask(transactions <= level_1_threshold, 'Level 1')
    # Rows without a group have no reference mean; leave them unlabelled.
    df['activity_level_status'] = levels.where(df['group'].notna())

# Add the 'activity_level_status' label to every wallet.
activity_level_within_group(df)

# Overlay one semi-transparent histogram of the daily transaction counts per
# group, in order of first appearance of each group id.
fig, ax = plt.subplots(figsize=(10, 6))
for group in df['group'].unique():
    group_transactions = df.loc[df['group'] == group, 'dailyNumberOfTransactions']
    ax.hist(group_transactions, bins=20, alpha=0.5, label=f'Group {group}')

# Axis labels, title and legend
ax.set_xlabel('Daily Number of Transactions')
ax.set_ylabel('Frequency')
ax.set_title('Activity Level in Each Group')
ax.legend()

# Render the figure
fig.tight_layout()
plt.show()

Token Diversity

In [None]:
def token_diversity_within_group(df):
    """Label each wallet's token diversity relative to its own group.

    A wallet is 'High Diversity' when its ``tokens`` value is at or above the
    mean ``tokens`` value of its group, otherwise 'Low Diversity'. The result
    is written in place to the ``token_diversity_status`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``group`` and ``tokens``; ``tokens`` must be
        numeric because it is averaged (presumably a token count -- confirm
        against the dataset). Modified in place.

    Returns
    -------
    None
    """
    # Per-row copy of the wallet's group mean (vectorised replacement for the
    # per-group loop with an element-wise apply).
    tokens_mean = df.groupby('group')['tokens'].transform('mean')
    is_diverse = df['tokens'] >= tokens_mean
    status = is_diverse.map({True: 'High Diversity', False: 'Low Diversity'})
    # Rows without a group have no reference mean; leave them unlabelled.
    df['token_diversity_status'] = status.where(df['group'].notna())

# Add the 'token_diversity_status' label to every wallet.
token_diversity_within_group(df)

# Visualize token diversity in each group.
# Build a group x status count table and draw it as one stacked bar chart.
# Plotting each segment with its own plt.bar call gave every segment a new
# colour and two legend entries per group, so High/Low could not be compared
# across groups.
diversity_counts = (
    pd.crosstab(df['group'], df['token_diversity_status'])
    # Fixed column order keeps 'High Diversity' at the bottom of each stack
    # and guarantees both columns exist even if one status never occurs.
    .reindex(columns=['High Diversity', 'Low Diversity'], fill_value=0)
)

fig, ax = plt.subplots(figsize=(10, 6))
diversity_counts.plot(kind='bar', stacked=True, ax=ax)

# Add labels and title
ax.set_xlabel('Group')
ax.set_ylabel('Number of Wallets')
ax.set_title('Token Diversity in Each Group')

# Add a legend (one entry per status)
ax.legend(title='Token diversity')

# Show the plot
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Columns of the labelled output: the wallet address, its cluster id and the
# four derived labels.
# The cluster id column created by the K-means cell is named 'group'. The
# list previously named 'community' instead, so the selection did not carry
# 'group' and the plot below raised KeyError.
# NOTE(review): if 'community' is a genuine column of the CSV that should
# also be exported, add it back to this list.
columns_to_include = ['address', 'group', 'is_whale', 'financial_health_status', 'activity_level_status', 'token_diversity_status']
# .copy() makes new_df independent of df, so later edits to it do not
# trigger chained-assignment warnings.
new_df = df[columns_to_include].copy()

# output_csv_file = 'label wallet.csv'

# new_df.to_csv(output_csv_file, index=False)


# Bar chart of group sizes in the exported frame
plt.figure(figsize=(8, 6))
new_df['group'].value_counts().plot(kind='bar', color='blue', alpha=0.7)
plt.title('Distribution of Group')
plt.xlabel('group')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()