Importing Datasets

In [None]:
import pandas as pd

# Load the per-tweet LLM analysis (CSV) and the raw tweet export (JSON).
pendle_llm = pd.read_csv("pendle_llm_analysis.csv")
tweet_df = pd.read_json("pendle.json")

# Expand every value of the "author" column into its own columns, then keep the
# first row encountered for each distinct userName.
# NOTE(review): presumably each "author" value is a dict-like record — confirm against pendle.json.
author_df = tweet_df["author"].apply(pd.Series)
unique_authors_df = author_df.drop_duplicates(subset=['userName'])

In [None]:
# Show the column labels of the LLM analysis table.
pendle_llm.columns

In [None]:
# Number of rows left after de-duplicating authors by userName.
len(unique_authors_df)

Data Understanding 

In [None]:
# Per-column count of missing entries in the LLM analysis table.
missing_values = pendle_llm.isna().sum()
# Keep only the columns that actually have gaps, as {column name: count}.
missing_summary = missing_values.loc[missing_values.gt(0)].to_dict()

# An empty dict is falsy, so the fallback message is printed when nothing is missing.
print(missing_summary if missing_summary else "No missing values found")

Feature Engineering

In [None]:
# Preview the first rows of the raw tweet table.
tweet_df.head()

In [None]:
# Preview the first rows of the de-duplicated author table.
unique_authors_df.head()

In [None]:
# Attach the LLM analysis columns to each tweet by matching on the shared "id"
# key (pandas' default join keeps only ids present in both tables).
tweet_analysis_merged = tweet_df.merge(pendle_llm, on="id")

In [None]:
# Preview the first rows of the merged tweet + LLM analysis table.
tweet_analysis_merged.head()

In [None]:
# Write the merged table to merged_tweets.csv in the working directory.
# NOTE(review): to_csv also writes the row index as an extra unnamed first column by default — confirm that is wanted.
tweet_analysis_merged.to_csv("merged_tweets.csv")

In [None]:
# Preview of the de-duplicated author table (unchanged since it was built).
unique_authors_df.head()

In [None]:
def extract_username(author_obj):
    """Return the ``userName`` field of a single tweet's author record.

    Parameters
    ----------
    author_obj : dict-like or any
        Author record of one tweet. Any object exposing ``.get`` is accepted.

    Returns
    -------
    The value stored under ``"userName"``, or ``None`` when the key is absent
    or ``author_obj`` is not dict-like (e.g. ``None``, ``NaN``, a plain string).
    """
    try:
        return author_obj.get("userName", None)
    except (AttributeError, ValueError, SyntaxError):
        # AttributeError: the record has no ``.get`` at all (missing author
        # stored as None/NaN, or a non-dict value). The other two exception
        # types are kept only so callers relying on the old guard see no change.
        return None


# Derive a flat userName column from the nested author record of every tweet.
tweet_analysis_merged["userName"] = tweet_analysis_merged["author"].map(extract_username)

In [None]:
# Preview the merged table, which now carries the userName column.
tweet_analysis_merged.head()

In [None]:
# Repeated preview of the merged table; nothing has modified it since the cell above.
tweet_analysis_merged.head()

Statistical Vs Emotional Ratio

In [None]:
import numpy as np

def compute_author_ratios(
    df,
    column="tweet_type",
    positive_value="statistical",
    ratio_name="stat_vs_emot_ratio",
):
    """Per-author share of tweets whose ``column`` equals ``positive_value``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``userName`` column and ``column``.
    column : str, default "tweet_type"
        Categorical column to count per author.
    positive_value : str, default "statistical"
        Label whose share of the author's counted tweets is reported.
    ratio_name : str, default "stat_vs_emot_ratio"
        Name of the output column holding that share.

    Returns
    -------
    pandas.DataFrame
        One row per ``userName`` with one count column per label found in
        ``column``, plus ``total_tweets`` and ``ratio_name``.
        Rows whose ``column`` value is missing are not counted (pandas'
        ``value_counts`` default).
    """
    # Rows = authors, columns = labels, cells = number of tweets with that label.
    tweet_counts = df.groupby("userName")[column].value_counts().unstack(fill_value=0)

    # When no tweet in the whole frame carries the label, unstack() creates no
    # column for it. Add an all-zero one so the ratio is 0 instead of a KeyError.
    if positive_value not in tweet_counts.columns:
        tweet_counts[positive_value] = 0

    tweet_counts["total_tweets"] = tweet_counts.sum(axis=1)

    tweet_counts[ratio_name] = tweet_counts[positive_value] / tweet_counts["total_tweets"]

    return tweet_counts.reset_index()

# Build the per-author tweet_type table from the merged tweets.
author_stat_ratios_df = compute_author_ratios(tweet_analysis_merged)

# Plain-text preview of the first rows.
print(author_stat_ratios_df.head())

In [None]:
# Notebook-rendered preview of the per-author tweet_type table.
author_stat_ratios_df.head()

Historical Comparison Presence Ratio

In [None]:
import numpy as np

def compute_author_ratios(
    df,
    column="historical_comparison",
    positive_value="present",
    ratio_name="historical_comparison_ratio",
):
    """Per-author share of tweets whose ``column`` equals ``positive_value``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``userName`` column and ``column``.
    column : str, default "historical_comparison"
        Categorical column to count per author.
    positive_value : str, default "present"
        Label whose share of the author's counted tweets is reported.
    ratio_name : str, default "historical_comparison_ratio"
        Name of the output column holding that share.

    Returns
    -------
    pandas.DataFrame
        One row per ``userName`` with one count column per label found in
        ``column``, plus ``total_tweets`` and ``ratio_name``.
        Rows whose ``column`` value is missing are not counted (pandas'
        ``value_counts`` default).
    """
    # Rows = authors, columns = labels, cells = number of tweets with that label.
    tweet_counts = df.groupby("userName")[column].value_counts().unstack(fill_value=0)

    # When no tweet in the whole frame carries the label, unstack() creates no
    # column for it. Add an all-zero one so the ratio is 0 instead of a KeyError.
    if positive_value not in tweet_counts.columns:
        tweet_counts[positive_value] = 0

    tweet_counts["total_tweets"] = tweet_counts.sum(axis=1)

    tweet_counts[ratio_name] = tweet_counts[positive_value] / tweet_counts["total_tweets"]

    return tweet_counts.reset_index()

# Build the per-author historical_comparison table from the merged tweets.
author_hist_ratios_df = compute_author_ratios(tweet_analysis_merged)

# Preview the first rows.
author_hist_ratios_df.head()

Market Hint Classification Ratio

In [None]:
import numpy as np

def compute_author_ratios(
    df,
    column="market_hint",
    positive_value="signal",
    ratio_name="market_hint_ratio",
):
    """Per-author share of tweets whose ``column`` equals ``positive_value``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``userName`` column and ``column``.
    column : str, default "market_hint"
        Categorical column to count per author.
    positive_value : str, default "signal"
        Label whose share of the author's counted tweets is reported.
    ratio_name : str, default "market_hint_ratio"
        Name of the output column holding that share.

    Returns
    -------
    pandas.DataFrame
        One row per ``userName`` with one count column per label found in
        ``column``, plus ``total_tweets`` and ``ratio_name``.
        Rows whose ``column`` value is missing are not counted (pandas'
        ``value_counts`` default).
    """
    # Rows = authors, columns = labels, cells = number of tweets with that label.
    tweet_counts = df.groupby("userName")[column].value_counts().unstack(fill_value=0)

    # When no tweet in the whole frame carries the label, unstack() creates no
    # column for it. Add an all-zero one so the ratio is 0 instead of a KeyError.
    if positive_value not in tweet_counts.columns:
        tweet_counts[positive_value] = 0

    tweet_counts["total_tweets"] = tweet_counts.sum(axis=1)

    tweet_counts[ratio_name] = tweet_counts[positive_value] / tweet_counts["total_tweets"]

    return tweet_counts.reset_index()

# Build the per-author market_hint table from the merged tweets.
author_hint_ratios_df = compute_author_ratios(tweet_analysis_merged)

# NOTE(review): this renders the whole frame rather than a head() preview.
author_hint_ratios_df

Categorical Ratios

In [None]:
# Categorical LLM labels that are turned into per-author ratio columns.
category_columns = ["signal_classification", "call_to_action", "hype_classification", "urgency_level"]

# One wide table per category; they are stitched together after the loop.
category_ratios_list = []

for col in category_columns:
    # Rows = authors, columns = category values, cells = share of the author's
    # tweets (with a non-missing label) that carry that value.
    temp_df = (
        tweet_analysis_merged
        .groupby("userName")[col]
        .value_counts(normalize=True)
        .unstack(fill_value=0)
    )
    # Prefix with the category and suffix with "_ratio" so columns stay unique
    # once all categories sit side by side.
    temp_df = temp_df.set_axis([f"{col}_{val}_ratio" for val in temp_df.columns], axis=1)
    category_ratios_list.append(temp_df)

# Place the category tables next to each other (aligned on userName) and turn
# the userName index back into a regular column.
final_ratios = pd.concat(category_ratios_list, axis="columns").reset_index()

# Preview the combined table.
final_ratios.head()

In [None]:
# Weight applied to each category value when an author's per-value ratios are
# collapsed into one score per category. Keeping them in a single table makes
# the scoring scheme readable and tunable in one place.
CATEGORY_SCORE_WEIGHTS = {
    "signal_classification": {"bearish": 0, "bullish": 1, "normal": 0.5},
    "call_to_action": {"buy": 1, "sell": 0.2, "none": 0, "hold": 0.5},
    "hype_classification": {"high": 0, "normal": 0.5, "low": 1},
    "urgency_level": {"high": 1, "medium": 0.3, "low": 0},
}


def _weighted_category_score(ratios, category, value_weights):
    """Weighted sum of the ``{category}_{value}_ratio`` columns of ``ratios``.

    A category value that never occurs in the data has no ratio column at all.
    Its ratio would be 0 for every author, so it is skipped instead of raising
    a KeyError.
    """
    score = 0
    for value, weight in value_weights.items():
        ratio_column = f"{category}_{value}_ratio"
        if ratio_column in ratios.columns:
            score = score + ratios[ratio_column] * weight
    return score


# Compute weighted scores for each category separately
for _category, _value_weights in CATEGORY_SCORE_WEIGHTS.items():
    final_ratios[f"{_category}_score"] = _weighted_category_score(
        final_ratios, _category, _value_weights
    )

# Display the updated dataframe with separate scores
final_ratios[["userName", "signal_classification_score", "call_to_action_score", 
              "hype_classification_score", "urgency_level_score"]].head()

In [None]:
# Summary statistics of the hype score across authors.
final_ratios["hype_classification_score"].describe()

Crypto Manipulative Words

In [None]:
# Per author, count the tweets whose 'crypto_manipulative_words' flag is False,
# and give the resulting column a self-explanatory name.
false_counts = (
    tweet_analysis_merged
    .groupby("userName")["crypto_manipulative_words"]
    .apply(lambda flags: (flags == False).sum())
    .reset_index()
    .rename(columns={"crypto_manipulative_words": "false_count_crypto_manipulative_words"})
)

# Bounds for min-max scaling of the counts.
min_val = false_counts["false_count_crypto_manipulative_words"].min()
max_val = false_counts["false_count_crypto_manipulative_words"].max()

if min_val == max_val:
    # Every author has the same count: scaling would divide by zero, so use 1.
    false_counts["normalized_false_count"] = 1
else:
    false_counts["normalized_false_count"] = (false_counts["false_count_crypto_manipulative_words"] - min_val) / (max_val - min_val)

# Preview the first few rows.
false_counts.head()

Weightage System Implementation

In [None]:
# Key every per-author table by userName so the components below are combined
# author by author. The tables come from separate groupby passes, each of which
# ignores missing values of its own column, so they are not guaranteed to list
# the same authors in the same row positions. Adding them by row number (the
# default RangeIndex) could silently mix scores of different authors.
stat_ratio = author_stat_ratios_df.set_index("userName")["stat_vs_emot_ratio"]
hist_ratio = author_hist_ratios_df.set_index("userName")["historical_comparison_ratio"]
hint_ratio = author_hint_ratios_df.set_index("userName")["market_hint_ratio"]
category_scores = final_ratios.set_index("userName")
clean_language = false_counts.set_index("userName")["normalized_false_count"]

# One row per author of the tweet_type table. An author missing from another
# table gets NaN for the affected component instead of someone else's value.
weighted_df = pd.DataFrame(index=stat_ratio.index)

# Component scores: each is a fixed-weight blend of the per-author metrics.
weighted_df["data_driven_content"] = stat_ratio * 0.6 + hist_ratio * 0.4
weighted_df["signal_clarity"] = (
    hint_ratio * 0.3
    + category_scores["signal_classification_score"] * 0.4
    + category_scores["call_to_action_score"] * 0.3
)
weighted_df["manipulative_resistance"] = (
    clean_language * 0.5 + category_scores["hype_classification_score"] * 0.5
)
weighted_df["urgency_sanity_check"] = category_scores["urgency_level_score"]

# Final score: weights 0.3 + 0.35 + 0.3 + 0.05 sum to 1.
weighted_df["signal_quality"] = weighted_df["data_driven_content"] * 0.3 + weighted_df["signal_clarity"] * 0.35 + weighted_df["manipulative_resistance"] * 0.3 + weighted_df["urgency_sanity_check"] * 0.05

# Restore the original layout: a plain row index with "username" as first column.
weighted_df = weighted_df.rename_axis("username").reset_index()

Ranking of Top 5 UserNames

In [None]:
# Authors ordered by descending signal_quality; head() keeps the first five rows.
weighted_df.sort_values(ascending=False,by=["signal_quality"]).head()

In [None]:
# Summary statistics of the final signal_quality score across authors.
weighted_df["signal_quality"].describe()