In [1]:
import pandas as pd

In [2]:
# Load data
df = pd.read_csv("all_merged_sw.csv", 
                 usecols=["post_id", "author_id", "ticker",
                          "compound_score", "post_type", "author_trust",
                          "trustworthy"
                         ])

print(len(df.index))
print("The number of unique posts is: ", len(df['post_id'].unique()))

940785
The number of unique posts is:  484171


In [3]:
# Get number of submissions
print("The number of submissions is :", len(df[df['post_type'] == "submission"].index))
print("The number of unique submissions is :", 
      len(df[df['post_type'] == "submission"].drop_duplicates(subset=["post_id"])))

# Get number of comments
print("The number of comments is :", len(df[df['post_type'] == "comment"].index))
print("The number of unique comments is :", 
      len(df[df['post_type'] == "comment"].drop_duplicates(subset=["post_id"])))

The number of submissions is : 40882
The number of unique submissions is : 18310
The number of comments is : 899903
The number of unique comments is : 465861


In [4]:
# Get number of authors
print("The number of unique authors is :", len(df['author_id'].unique()))

# Get number of tickers
print("The number of unique tickers is :", len(df['ticker'].unique()))

The number of unique authors is : 67092
The number of unique tickers is : 2371


In [5]:
# Group by author_id
grouped = df.groupby('author_id')

# Get the size of each group
size = grouped.size()

# Basic statistics
print("The number of authors with only one post is: ", len(size[size == 1].index))
print("The percentage of authors with only one post is: ", len(size[size == 1].index)/len(df['post_id'].unique()))

The number of authors with only one post is:  268
The percentage of authors with only one post is:  0.0005535234452290616


In [6]:
# Print basic statistics
print("Number of unique users is: ", len(df['author_id'].unique()), "\n")

print("Number of users with 0% trust: ", len(df[df['author_trust'] == 0]['author_id'].unique()))
print("Number of users with 0-24.99% trust: ", len(df[(0 < df['author_trust']) & (df['author_trust'] < 0.25)]['author_id'].unique()))
print("Number of users with 25-49.99% trust: ", len(df[(0.25 <= df['author_trust']) & (df['author_trust'] < 0.5)]['author_id'].unique()))
print("Number of users with 50-74.99% trust: ", len(df[(0.5 <= df['author_trust']) & (df['author_trust'] < 0.75)]['author_id'].unique()))
print("Number of users with 75-99.99% trust: ", len(df[(0.75 <= df['author_trust']) & (df['author_trust'] < 1)]['author_id'].unique()))
print("Number of users with 100% trust: ", len(df[df['author_trust'] == 1]['author_id'].unique()), "\n")

print("% of users with 0% trust: ", len(df[df['author_trust'] == 0]['author_id'].unique())/len(df['author_id'].unique())*100)
print("% of users with 0-24.99% trust: ", len(df[(0 < df['author_trust']) & (df['author_trust'] < 0.25)]['author_id'].unique())/len(df['author_id'].unique())*100)
print("% of users with 25-49.99% trust: ", len(df[(0.25 <= df['author_trust']) & (df['author_trust'] < 0.5)]['author_id'].unique())/len(df['author_id'].unique())*100)
print("% of users with 50-74.99% trust: ", len(df[(0.5 <= df['author_trust']) & (df['author_trust'] < 0.75)]['author_id'].unique())/len(df['author_id'].unique())*100)
print("% of users with 75-99.99% trust: ", len(df[(0.75 <= df['author_trust']) & (df['author_trust'] < 1)]['author_id'].unique())/len(df['author_id'].unique())*100)
print("% of users with 100% trust: ", len(df[df['author_trust'] == 1]['author_id'].unique())/len(df['author_id'].unique())*100)

Number of unique users is:  67092 

Number of users with 0% trust:  13507
Number of users with 0-24.99% trust:  6368
Number of users with 25-49.99% trust:  18396
Number of users with 50-74.99% trust:  18136
Number of users with 75-99.99% trust:  3910
Number of users with 100% trust:  6775 

% of users with 0% trust:  20.132057473320216
% of users with 0-24.99% trust:  9.491444583556907
% of users with 25-49.99% trust:  27.419066356644606
% of users with 50-74.99% trust:  27.03153878256722
% of users with 75-99.99% trust:  5.827818517856078
% of users with 100% trust:  10.09807428605497


# Trust Filter

In [7]:
df = df[(df["author_trust"] >= 0.8) & (df["trustworthy"] == 1)]

print(len(df.index))
print("The number of unique posts is: ", len(df['post_id'].unique()))

36578
The number of unique posts is:  15577


In [8]:
# Get number of submissions
print("The number of submissions is :", len(df[df['post_type'] == "submission"].index))
print("The number of unique submissions is :", 
      len(df[df['post_type'] == "submission"].drop_duplicates(subset=["post_id"])))

# Get number of comments
print("The number of comments is :", len(df[df['post_type'] == "comment"].index))
print("The number of unique comments is :", 
      len(df[df['post_type'] == "comment"].drop_duplicates(subset=["post_id"])))

The number of submissions is : 819
The number of unique submissions is : 545
The number of comments is : 35759
The number of unique comments is : 15032


In [9]:
# Get number of authors
print("The number of unique authors is :", len(df['author_id'].unique()))

# Get number of tickers
print("The number of unique tickers is :", len(df['ticker'].unique()))

The number of unique authors is : 8930
The number of unique tickers is : 1421


In [10]:
# Group by author_id
grouped = df.groupby('author_id')

# Get the size of each group
size = grouped.size()

# Basic statistics
print("The number of authors with only one post is: ", len(size[size == 1].index))
print("The percentage of authors with only one post is: ", len(size[size == 1].index)/len(df['post_id'].unique()))

The number of authors with only one post is:  2
The percentage of authors with only one post is:  0.0001283944276818386


In [12]:
# Print basic statistics
print("Number of unique users is: ", len(df['author_id'].unique()), "\n")

print("Number of users with 75-99.99% trust: ", len(df[(0.75 <= df['author_trust']) & (df['author_trust'] < 1)]['author_id'].unique()))
print("Number of users with 100% trust: ", len(df[df['author_trust'] == 1]['author_id'].unique()), "\n")

print("% of users with 75-99.99% trust: ", len(df[(0.75 <= df['author_trust']) & (df['author_trust'] < 1)]['author_id'].unique())/len(df['author_id'].unique())*100)
print("% of users with 100% trust: ", len(df[df['author_trust'] == 1]['author_id'].unique())/len(df['author_id'].unique())*100)

Number of unique users is:  8930 

Number of users with 75-99.99% trust:  2155
Number of users with 100% trust:  6775 

% of users with 75-99.99% trust:  24.132138857782756
% of users with 100% trust:  75.86786114221724
