In [2]:
import pandas as pd
import numpy as np
import tqdm
from scipy import sparse
import torch
from torch import nn
from collections import Counter
from typing import *
import time
import logging
import altair as alt
import re
import numpy.linalg as la
import json
import Models
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
from torch import nn
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
import numpy.random
import spacy
import gensim
import gensim.downloader
import pickle
from collections import Counter
import LogisticRegression
from scipy.signal import find_peaks
from transformers import pipeline
classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", return_all_scores=True)
alt.data_transformers.disable_max_rows
torch.manual_seed(0)
np.random.seed(0)

# 1. Data Processing

In [20]:
nlp = spacy.load('en_core_web_sm')
w2v = gensim.downloader.load('glove-wiki-gigaword-50')
df_sus_users = pd.read_csv("../Data/Network/NetworkUsers.csv", sep="\t", index_col="User_id")
def tokenize(text):
    tokens = nlp(" ".join([token.lower() for token in re.sub("\.|,|:|!|\?|-|\'", " ", text).split(" ") if token.isalpha()]))
    return [token.lemma_ for token in tokens]

def embed(tokens):
    def toVec(token):
        try:
            return w2v[token]
        except Exception:
            return w2v["unk"]
    return [toVec(token) for token in tokens]

In [None]:
%%time
def rate(rating): # 0 is false
    for kw in ["false", "fire", "flop", "barely"]:
        if kw in rating.lower():
            return 0
    return 1

def load_data():
    df = pd.read_csv("train.csv", sep="\t", index_col="index")
    df_mis = df[df.Misinformation==1]
    df_mis = df_mis.sample(int(0.9 * len(df_mis)))
    df_true = df[df.Misinformation==0].sample(int(0.9 * len(df_mis)))
    df = pd.concat([df_true, df_mis])
    df["Statement"] = df["Statement"].apply(tokenize)
    df["Embedding"] = df["Statement"].apply(embed)
    df_val = df[~df.Source]
    df_val = df_val.sample(int(len(df_val) * 0.5))
    return {"train": df[~df.index.isin(df_val.index)], "val": df_val}
data = load_data()

# 2. LSTM

In [None]:
def pipeline_ml(data, model: nn.Module, epochs: int=10, lr: float=1e-4):
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    predictor = Models.Predictor(model, device)
    labels = torch.Tensor(data["train"].Misinformation.values)
    features = data["train"].Embedding.tolist()
    features = [torch.from_numpy(np.array(fea)) for fea in features]
    predictor.train(features, labels, epochs=epochs, lr=lr)
    return predictor

In [None]:
lstm = Models.LSTM()
predictor = pipeline_ml(data, lstm)

In [None]:
features = data["val"].Embedding.tolist()
features = [torch.from_numpy(np.array(fea)) for fea in features]
lstm_pred = predictor.predict(features) > 0.5

In [None]:
f1 = f1_score(data["val"].Misinformation, lstm_pred>0.5)
acc = accuracy_score(data["val"].Misinformation, lstm_pred>0.5)
prec = precision_score(data["val"].Misinformation, lstm_pred>0.5)
recall = recall_score(data["val"].Misinformation, lstm_pred>0.5)
lstm_res = {"f1": f1, "acc": acc, "prec": prec, "recall": recall}
lstm_res

In [None]:
lstm_pred

# 3. Logistic Regression

In [None]:
processor = LogisticRegression.Preprocessor()
processor.buildVocabulary(data["train"])
matrix = processor.buildMatrix(data["train"])

In [None]:
LR = LogisticRegression.LogisticRegression(processor)
LogisticRegression.train(matrix, data["train"]["Misinformation"].values, LR, lr=5e-4, epoch_num=10)

In [None]:
lr_pred = LogisticRegression.predict(LR, data["val"]) > 0.5

In [None]:
f1 = f1_score(data["val"].Misinformation, lr_pred>0.5)
acc = accuracy_score(data["val"].Misinformation, lr_pred>0.5)
prec = precision_score(data["val"].Misinformation, lr_pred>0.5)
recall = recall_score(data["val"].Misinformation, lr_pred>0.5)
lr_res = {"f1": f1, "acc": acc, "prec": prec, "recall": recall}
lr_res

# 4. Predict

In [None]:
with open("GovernorPred.pkl", "wb") as f:
    f.write(pickle.dumps({"LSTM": lstm, "LR": processor}))

In [3]:
with open("GovernorPred.pkl", "rb") as f:
    models = pickle.loads(f.read())

In [4]:
predictor = Models.Predictor(models["LSTM"])
LR = LogisticRegression.LogisticRegression(models["LR"])

# 5. Inspect

In [11]:
%%time
df_tweets = pd.read_csv("../Data/Candidates/GovernorTweets.csv", sep="\t", index_col="id")
df_tweets = df_tweets[df_tweets.content!=""].dropna()
govnor_tweets = df_tweets[df_tweets.credibility!=1]
govnor_tweets = govnor_tweets.sample(int(len(govnor_tweets) * 0.25))
label = (govnor_tweets["credibility"] <= 0).astype(int)

CPU times: user 2.79 s, sys: 260 ms, total: 3.05 s
Wall time: 3.04 s


In [11]:
lr_pred = LogisticRegression.predict(LR, govnor_tweets) > 0.5
f1 = f1_score(label, lr_pred>0.5)
acc = accuracy_score(label, lr_pred>0.5)
prec = precision_score(label, lr_pred>0.5)
recall = recall_score(label, lr_pred>0.5)
lr_res = {"f1": f1, "acc": acc, "prec": prec, "recall": recall}
lr_res

Build Matrix: 15304it [00:00, 52405.39it/s]


{'f1': 0.14535391785610255,
 'acc': 0.6165708311552536,
 'prec': 0.10185752194325373,
 'recall': 0.253685815963396}

In [12]:
%%time
features = govnor_tweets["content"].apply(tokenize).apply(embed).tolist()
features = [torch.from_numpy(np.array(fea)) for fea in features]
lstm_pred = predictor.predict(features) > 0.5
f1 = f1_score(label, lstm_pred>0.5)
acc = accuracy_score(label, lstm_pred>0.5)
prec = precision_score(label, lstm_pred>0.5)
recall = recall_score(label, lstm_pred>0.5)
lstm_res = {"f1": f1, "acc": acc, "prec": prec, "recall": recall}
lstm_res

  0%|          | 0/15304 [00:00<?, ?it/s]

  result = _VF.lstm(input, hx, self._flat_weights, self.bias, self.num_layers,


CPU times: user 1min 58s, sys: 2.31 s, total: 2min
Wall time: 2min 1s


{'f1': 0.2208536059616924,
 'acc': 0.1733533716675379,
 'prec': 0.12564821303433776,
 'recall': 0.9115404168784952}

### Tweets from incredible source not classified as misinformation

In [None]:
# Misinformation
bad_tweets[lstm_pred].sample(10).index

In [None]:
df_tweets.loc[[1518296660329201665, 1499864294170808332, 1516862042984095746,
            1507126074882277381, 1518231318366244867, 1505706253888143362,
            1509284007808172036, 1506148809725943808, 1501767346943414272,
            1514341548892295183]].Statement.tolist()

### Tweets from incredible source not classified as misinformation

In [None]:
# True information
bad_tweets[~lstm_pred].sample(10).index

In [None]:
df_tweets.loc[[1502046989483184135, 1498757468578582537, 1513529952834101255,
            1502372547442077698, 1507041373421248514, 1510022055722467333,
            1499040381463265282, 1502059266777178114, 1514618674501627904,
            1503568426861838336]].Statement.tolist()

### Tweets from credible source not classified as misinformation

In [None]:
good_tweets[lstm_pred].sample(10).index

In [None]:
df_tweets.loc[[1514322102303674370, 1500967005230305284, 1514267702252220418,
            1515282667251634176, 1509875045677645828, 1499209915709239296,
            1511243530202664960, 1501166881423728642, 1505590605438763011,
            1500968967040602115]].Statement.tolist()

### Tweets from credible source classified as misinformation

In [None]:
good_tweets[~lstm_pred].sample(10).index

In [None]:
df_tweets.loc[[1499526834341490689, 1501715649046925315, 1512255934520864770,
            1510012772712652802, 1509660956653146113, 1510813035526565889,
            1517331816398548992, 1518945587479166976, 1514944616709115904,
            1506400714544893954]].Statement.tolist()

In [None]:
good_tweets[lstm_pred]["Statement"].apply(lambda x:len(x.split(" "))).describe()

In [None]:
good_tweets[~lstm_pred]["Statement"].apply(lambda x:len(x.split(" "))).describe()

# 6. Use model to detect peaks

In [None]:
def detect_peaks(governor: str, method: str="lstm", emotion: str="anger"):
    emotions = ["sadness", "joy", "love", "anger", "fear", "surprise"]
    emotion = emotions.index(emotion)
    if method == "lstm":
        features = df_gov["Statement"].apply(tokenize).apply(embed).tolist()
        features = [torch.from_numpy(np.array(fea)) for fea in features]
        lstm_pred = predictor.predict(features) > 0.5
        df_gov["pred"] = lstm_pred
    elif method == "lr":
        df_gov["pred"] = (LogisticRegression.predict(LR, df_gov) > 0.5).values
    elif method == "emotion":
        df_gov["pred"] = [res[emotion]["score"] for res in classifier(df_gov["Statement"].tolist())]
    return df_gov

In [None]:
%%time
res = detect_peaks('Lee Zeldin', "emotion")
res

In [None]:
# pct = res.groupby(["date"])["pred"].sum() / res.groupby(["date"])["pred"].count()
pct = res.groupby(["date"])["pred"].mean()
pct = (pct - pct.min()) / (pct.max() - pct.min()) * 100
def detect_peak(counts: pd.Series):
    prominence = 1.5 * (np.percentile(counts, 75) - np.percentile(counts, 25))
    peaks_indexes, _ = find_peaks(counts, prominence = prominence)
    return peaks_indexes
pct.iloc[detect_peak(pct)]

In [None]:
%%time
governors = ['Stacey Abrams', 'Josh Shapiro', 'Charlie Crist', 'Katie Hobbs', 'Lee Zeldin']
peaks = dict()
for governor in governors[-1:]:
    res = detect_peaks(governor, "lr")
    pct = res.groupby(["date"])["pred"].sum() / res.groupby(["date"])["pred"].count()
    pct = (pct - pct.min()) / (pct.max() - pct.min()) * 100
    def detect_peak(counts: pd.Series):
        prominence = 1.5 * (np.percentile(counts, 75) - np.percentile(counts, 25))
        peaks_indexes, _ = find_peaks(counts, prominence = prominence)
        return peaks_indexes
    peaks[governor] = pct.iloc[detect_peak(pct)]

In [None]:
pct.iloc[detect_peak(pct)]

# 7. Sentiment analysis

In [3]:
df_emotion = pd.read_csv("../Data/Candidates/GovernorTweets.csv", index_col="Date", sep="\t")
feature_cols = ['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise']
def load_sa_data(ratio: float=0.8, test_type: str="user"):
    if test_type == "user":
        df_sus = df_emotion[df_emotion.author_id.isin(df_sus_users.index)] 
    else:
        df_sus = df_emotion[df_emotion.credibility==0]
    df_clean = df_emotion[~df_emotion.index.isin(df_sus)]
    df_sus["label"], df_clean["label"] = 1, 0
    df_sus_train = df_sus.sample(frac=ratio)
    df_clean_train = df_clean.sample(frac=ratio)
    size = min(len(df_sus_train), len(df_clean_train))
    df_train = pd.concat([df_sus_train.sample(size), df_clean_train.sample(size)])
    df_val = pd.concat([df_sus, df_clean])
    df_val = df_val[~df_val.id.isin(df_train.id)]
    data = {"train": df_train[feature_cols + ["label"]], "val": df_val[feature_cols + ["label"]]}
    return data

In [21]:
df_emotion.Candidates.unique()

array(['Allen B. West', 'Andrew Giuliani', "Beto O'Rourke",
       'Bob Stefanowski', 'Brad Little', 'Brian Dahle', 'Brian Kemp',
       'Chad Prather', 'Charlie Crist', 'Dan Cox', 'Darren Bailey',
       'David Perdue', 'David White', 'Delilah Barrios', 'Doc Washburn',
       'Doug Mastriano', 'Greg Abbott', 'Heidi Ganahl', 'Henry McMaster',
       'J.B. Pritzker', 'Janet T. Mills', 'Jesse Sullivan', 'Jim Renacci',
       'Josh Shapiro', 'Joy Hofmeister', 'Jumaane Williams',
       'Kandiss Taylor', 'Kathy Hochul', 'Katie Hobbs', 'Kerry McQuisten',
       'Kevin Stitt', 'Kim Reynolds', 'Kristi L. Noem', 'Laura Kelly',
       'Lee Zeldin', 'Marc Thielman', 'Marco Lopez', 'Matt Brown',
       'Matt Salmon', 'Maura Healey', 'Michelle Lujan Grisham',
       'Nan Whaley', 'Ned Lamont', 'Nikki Fried', 'Paul Morgan',
       'Rebecca Dow', 'Rebecca Kleefisch', 'Richard Michael DeWine',
       'Rob Astorino', 'Ron DeSantis', 'Russell Diamond',
       'Sarah Huckabee Sanders', 'Sonia Chang-Diaz

In [9]:
import altair 

def detect_peak(counts: pd.Series, iqr: float = 1.5):
    if counts.empty:
        return list()
    prominence = iqr * (np.percentile(counts, 75) - np.percentile(counts, 25))
    peaks_indexes, _ = find_peaks(counts, prominence=prominence)
    return peaks_indexes

def plot_peak(df_counts: pd.DataFrame, field: str):
    df_counts["Counts"] = df_counts[field] / df_counts[field].max()
    df_counts = df_counts[["Counts", "Date"]]
    df_counts["Date"] = pd.to_datetime(df_counts["Date"].astype(str))
    chart = alt.Chart(df_counts).mark_line().encode(
        y=alt.Y("Counts:Q"),
        x=alt.X("Date:T"),
        tooltip=["Counts:Q", "Date:T"]
    )
    return chart

In [23]:
df_tmp = df_emotion[df_emotion["Candidates"]=="Greg Abbott"].groupby(["Date"])["anger"].mean().reset_index()
df_tmp.iloc[detect_peak(df_tmp["anger"])]

Unnamed: 0,Date,anger
5,20220306,0.270363
31,20220401,0.24934
38,20220408,0.308686
40,20220410,0.272355
45,20220415,0.301779


### Random Forest

In [9]:
%%time

# Test on suspicious users
data = load_sa_data()
rf = RandomForestClassifier()
rf.fit(data["train"][feature_cols], data["train"]["label"])
rf_pred = rf.predict(data["val"][feature_cols])
print(pd.Series(rf.feature_importances_, index=feature_cols))
Models.cal_metrics(rf_pred, data["val"]["label"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sus["label"], df_clean["label"] = 1, 0


anger       0.139366
disgust     0.152155
fear        0.144602
joy         0.144269
neutral     0.137333
sadness     0.140775
surprise    0.141500
dtype: float64
CPU times: user 52.6 s, sys: 2.89 s, total: 55.5 s
Wall time: 55.6 s


{'f1': 0.0026531852407052662,
 'acc': 0.6045953279569638,
 'prec': 0.0013296230589710648,
 'recall': 0.5820504421880118}

In [10]:
%%time

# Test on suspicious domain
data = load_sa_data(test_type="domain")
rf = RandomForestClassifier()
rf.fit(data["train"][feature_cols], data["train"]["label"])
rf_pred = rf.predict(data["val"][feature_cols])
print(pd.Series(rf.feature_importances_, index=feature_cols))
Models.cal_metrics(rf_pred, data["val"]["label"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sus["label"], df_clean["label"] = 1, 0


anger       0.143126
disgust     0.128496
fear        0.160929
joy         0.132691
neutral     0.125480
sadness     0.167031
surprise    0.142245
dtype: float64
CPU times: user 36.3 s, sys: 2.59 s, total: 38.9 s
Wall time: 38.9 s


{'f1': 0.00384708767557984,
 'acc': 0.8438570452063956,
 'prec': 0.0019289533634944632,
 'recall': 0.6859060402684564}

### Logistic Regression

In [19]:
def pipeline_sa(data, model: nn.Module, epochs: int=5, lr: float=1e-4):
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    feature_cols = ['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise']
    predictor = Models.Predictor(model, device)
    labels = torch.Tensor(data["train"].label.values)
    features = [torch.from_numpy(fea) for fea in data["train"][feature_cols].values.astype(np.float32)]
    predictor.train(features, labels, epochs=epochs, lr=lr)
    return predictor

In [24]:
# Test on suspicious users
data = load_sa_data()
model = Models.LogisticRegression(7)
predictor = pipeline_sa(data=data, model=model)
df_val = data["val"].sample(frac=0.1)
lr_pred = predictor.predict(torch.from_numpy(df_val[feature_cols].values.astype(np.float32)))
Models.cal_metrics(lr_pred, df_val["label"])

  0%|          | 0/337873 [00:05<?, ?it/s]

{'f1': 0.0017919663133989404,
 'acc': 0.0008967866624441728,
 'prec': 0.0008967866624441728,
 'recall': 1.0}

In [25]:
# Test on suspicious domain
data = load_sa_data(test_type="domain")
model = Models.LogisticRegression(7)
predictor = pipeline_sa(data=data, model=model)
df_val = data["val"].sample(frac=0.1)
lr_pred = predictor.predict(torch.from_numpy(df_val[feature_cols].values.astype(np.float32)))
Models.cal_metrics(lr_pred, df_val["label"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sus["label"], df_clean["label"] = 1, 0


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/11958 [00:00<?, ?it/s]

  0%|          | 0/11958 [00:00<?, ?it/s]

  0%|          | 0/11958 [00:00<?, ?it/s]

  0%|          | 0/11958 [00:00<?, ?it/s]

  0%|          | 0/11958 [00:00<?, ?it/s]

  0%|          | 0/338965 [00:02<?, ?it/s]

{'f1': 0.0009505278938470862,
 'acc': 0.07597539568982048,
 'prec': 0.0004754975172008833,
 'recall': 0.9675324675324676}

In [4]:
df_tweets

Unnamed: 0_level_0,Date,public_metrics,text,created_at,entities,author_id,source,Candidates,method,credibility,content,anger,disgust,fear,joy,neutral,sadness,surprise
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1498757068538322944,20220301,"{'retweet_count': 0, 'reply_count': 2, 'like_c...",.@AllenWest spotting in northwest Tarrant Coun...,2022-03-01 20:28:25+00:00,"{'mentions': [{'start': 1, 'end': 11, 'usernam...",877928155989905408,Twitter for Android,Allen B. West,@,1,spotting in northwest Tarrant County,0.018387,0.012563,0.023698,0.011592,0.665249,0.059865,0.208646
1498757438773866498,20220301,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",@SippingTeTSdana @TruAmericanGal1 @AllenWest @...,2022-03-01 20:29:53+00:00,"{'mentions': [{'start': 0, 'end': 16, 'usernam...",777225267265822720,Twitter for iPhone,Allen B. West,@,1,That is not herpes You want to see herpes then...,0.005952,0.009637,0.004996,0.002456,0.950248,0.005945,0.020767
1498757991507525633,20220301,"{'retweet_count': 2, 'reply_count': 0, 'like_c...",RT @jdrichmond2020: 💥TEXAS💥 Get to the poles a...,2022-03-01 20:32:05+00:00,"{'mentions': [{'start': 3, 'end': 18, 'usernam...",1243278447511769088,Twitter for Android,Allen B. West,@,1,TEXAS Get to the poles and VOTE today forfor G...,0.011625,0.001941,0.007821,0.004354,0.891241,0.008727,0.074291
1498758049510703115,20220301,"{'retweet_count': 3, 'reply_count': 3, 'like_c...","Ignore the polls, vote @AllenWest @DonHuffines...",2022-03-01 20:32:19+00:00,"{'mentions': [{'start': 23, 'end': 33, 'userna...",947921064927088640,Twitter Web App,Allen B. West,@,1,"Ignore the polls, vote",0.560520,0.113108,0.029981,0.003212,0.273514,0.017124,0.002541
1498758072642281475,20220301,"{'retweet_count': 54, 'reply_count': 0, 'like_...",RT @TexasScorecard: “Why are we providing them...,2022-03-01 20:32:24+00:00,"{'mentions': [{'start': 3, 'end': 18, 'usernam...",963771619981193216,Twitter for iPhone,Allen B. West,@,1,Why are we providing them taxpayerfunded money...,0.574835,0.039536,0.021226,0.003192,0.164067,0.040670,0.156474
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1518828068940492800,20220426,"{'retweet_count': 1450, 'reply_count': 0, 'lik...",RT @TudorDixon: Here’s a totally random remind...,2022-04-26 05:43:24+00:00,"{'annotations': [{'start': 54, 'end': 65, 'pro...",1188233470750932992,Twitter for Android,Tudor Dixon,@,1,Heres a totally random reminder that Dem Gover...,0.041786,0.006614,0.013973,0.002772,0.669795,0.008523,0.256538
1518803894071185411,20220426,"{'retweet_count': 57, 'reply_count': 0, 'like_...",RT @TudorDixon: As long as strong leaders of g...,2022-04-26 04:07:21+00:00,"{'annotations': [{'start': 85, 'end': 91, 'pro...",1105649994525233152,Twitter for iPad,Tudor Dixon,@,1,As long as strong leaders of good conscience s...,0.028477,0.006057,0.008613,0.041882,0.895615,0.010451,0.008907
1518803174932590593,20220426,"{'retweet_count': 57, 'reply_count': 0, 'like_...",RT @TudorDixon: As long as strong leaders of g...,2022-04-26 04:04:29+00:00,"{'annotations': [{'start': 85, 'end': 91, 'pro...",530964857,Twitter Web App,Tudor Dixon,@,1,As long as strong leaders of good conscience s...,0.028477,0.006057,0.008613,0.041882,0.895615,0.010451,0.008907
1518809718717788160,20220426,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",@TudorDixon @RyanAFournier @elonmusk Hey it’s ...,2022-04-26 04:30:29+00:00,"{'annotations': [{'start': 93, 'end': 99, 'pro...",961910224998219776,Twitter for iPhone,Tudor Dixon,@,1,Hey its Mark! If you cant get a new computer c...,0.033184,0.007991,0.004434,0.115446,0.120787,0.130284,0.587873


In [8]:
col = ['anger','disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise']
df_tweets[col].describe()

Unnamed: 0,anger,disgust,fear,joy,neutral,sadness,surprise
count,3400094.0,3400094.0,3400094.0,3400094.0,3400094.0,3400094.0,3400094.0
mean,0.1728726,0.08094208,0.06736237,0.09637846,0.3996648,0.07886341,0.1039163
std,0.2416113,0.1598933,0.161893,0.2203612,0.3245486,0.1613449,0.1820905
min,0.0004398618,0.0001298558,0.0001509283,0.0001962055,0.0002353722,0.0006133813,0.0002194154
25%,0.014711,0.005521154,0.004406542,0.003229303,0.08506474,0.008617053,0.009379803
50%,0.04789491,0.01824804,0.01048751,0.00798551,0.3230544,0.01976202,0.03000925
75%,0.2411784,0.0765041,0.03707775,0.0407983,0.7199372,0.06079676,0.09739839
max,0.9943969,0.9918109,0.9957425,0.9944656,0.9800055,0.9936942,0.9877996


In [16]:
sus_domain_tweets = df_tweets[df_tweets.credibility==0]
sus_domain_tweets[col].describe()

Unnamed: 0,anger,disgust,fear,joy,neutral,sadness,surprise
count,7474.0,7474.0,7474.0,7474.0,7474.0,7474.0,7474.0
mean,0.170492,0.063285,0.094863,0.057738,0.32296,0.20044,0.090221
std,0.226152,0.127676,0.177256,0.146362,0.282673,0.275053,0.155674
min,0.000878,0.00031,0.000248,0.000279,0.00076,0.001078,0.000274
25%,0.030854,0.007516,0.008728,0.003967,0.099436,0.012967,0.015929
50%,0.077136,0.016895,0.028165,0.005819,0.214994,0.046319,0.025818
75%,0.208222,0.061523,0.086715,0.021212,0.50072,0.285285,0.085035
max,0.994242,0.990815,0.992023,0.979957,0.975447,0.983222,0.984134


In [26]:
sus_user_tweets = df_tweets[df_tweets["author_id"].isin(df_sus_users.index)]
sus_user_tweets[col].describe()

Unnamed: 0,anger,disgust,fear,joy,neutral,sadness,surprise
count,15294.0,15294.0,15294.0,15294.0,15294.0,15294.0,15294.0
mean,0.155636,0.060936,0.074317,0.134366,0.402425,0.07978,0.09254
std,0.232759,0.137372,0.171319,0.263398,0.324607,0.16199,0.166638
min,0.000564,0.000215,0.000184,0.000254,0.000369,0.000811,0.000303
25%,0.012088,0.004121,0.004364,0.003592,0.085203,0.008285,0.009019
50%,0.039038,0.012172,0.011401,0.010512,0.339827,0.019307,0.027396
75%,0.187139,0.046512,0.04454,0.09022,0.718513,0.061969,0.086192
max,0.991049,0.98915,0.993385,0.993156,0.974513,0.989717,0.983914
