<a href="https://colab.research.google.com/github/KkilianJ/Thesis/blob/main/Incivility.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; the data paths used in this notebook live under it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd

# Load the long-text CSV (with punctuation) and pull its 'bigram_text' column into a plain list.
df_long = pd.read_csv(
    '/content/drive/MyDrive/Thesis/long_text_with_pos_text1.csv',
    low_memory=False,
)
text_long = df_long.loc[:, 'bigram_text'].to_list()

In [None]:
import pandas as pd
# Load the short-text CSV and pull its 'bigram_text' column into a plain list.
df_short = pd.read_csv(
    '/content/drive/MyDrive/Thesis/short_text_with_pos_text1.csv',
    low_memory=False,
)
text_short = df_short.loc[:, 'bigram_text'].to_list()

In [None]:
import re
import spacy
import pandas as pd
import statistics
from multiprocessing import Pool
from tqdm import tqdm
# Shared spaCy English pipeline; lemma() and process() both call this `nlp` object.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Build the incivility lexicon: lowercased word -> float score.
# A line is kept only if it has exactly three whitespace-separated fields
# and the middle field is the literal tag 'UNCIV'.
uncivil_lexicon = {}
with open('/content/drive/MyDrive/Thesis/incivilities.txt', 'r', encoding='utf-8') as lexicon_file:
    for raw_line in lexicon_file:
        fields = raw_line.split()
        if len(fields) != 3 or fields[1] != 'UNCIV':
            continue
        uncivil_lexicon[fields[0].lower()] = float(fields[2])

In [None]:
def lemma(text):
    """Run `text` through the shared spaCy pipeline and return its token lemmas joined by single spaces."""
    doc = nlp(text)
    return " ".join(tok.lemma_ for tok in doc)

def clean_punct(text):
    """Return `text` with every character removed that is neither a regex word character nor whitespace."""
    non_word_non_space = re.compile(r'[^\w\s]')
    return non_word_non_space.sub('', text)

In [None]:
def _uncivil_stats(text, uncivil_dict):
    """Shared scorer behind uncivil_word_sentence and uncivil_word_tweet.

    Splits `text` on whitespace and looks each token up in `uncivil_dict`
    (word -> score). A token is looked up exactly as written first; if that
    misses, its lowercase form is tried, so "IDIOT" still hits an "idiot" key.

    Returns:
        (matched_tokens / total_tokens, summed_scores / total_tokens),
        or (0, 0) when `text` contains no tokens.
    """
    tokens = text.split()
    length = len(tokens)
    if length == 0:
        return (0, 0)

    count = 0
    score = 0
    for word in tokens:
        # Exact match wins so that any dictionary with mixed-case keys keeps working.
        key = word if word in uncivil_dict else word.lower()
        if key in uncivil_dict:
            count += 1
            score += uncivil_dict[key]
    return (count / length, score / length)


def uncivil_word_sentence(sentence, uncivil_dict):
    """Return (share of uncivil tokens, mean lexicon score per token) for one sentence.

    Args:
        sentence: Sentence text; tokens are obtained with str.split().
        uncivil_dict: Mapping of word -> incivility score.
    """
    return _uncivil_stats(sentence, uncivil_dict)


def uncivil_word_tweet(tweet, uncivil_dict):
    """Return (share of uncivil tokens, mean lexicon score per token) for a whole tweet.

    Args:
        tweet: Tweet text; tokens are obtained with str.split().
        uncivil_dict: Mapping of word -> incivility score.
    """
    return _uncivil_stats(tweet, uncivil_dict)

def process(text):
    """Compute the four incivility features for one text.

    The text is lemmatized with lemma(), the lemmatized string is parsed again
    to obtain sentences, and each sentence is scored against `uncivil_lexicon`.
    The lemmatized string with punctuation removed is scored once more as a whole.

    Returns:
        (per_sentence, per_sentence_score, per_tweet, per_tweet_score), where the
        first two are the averages of the per-sentence values (0 if there are no
        sentences) and the last two are the values for the whole cleaned text.
    """
    lemmatized = lemma(text)

    sentence_stats = [
        uncivil_word_sentence(span.text, uncivil_lexicon)
        for span in nlp(lemmatized).sents
    ]
    n_sentences = len(sentence_stats)
    if n_sentences:
        per_sentence = sum(ratio for ratio, _ in sentence_stats) / n_sentences
        per_sentence_score = sum(mean_score for _, mean_score in sentence_stats) / n_sentences
    else:
        per_sentence = per_sentence_score = 0

    per_tweet, per_tweet_score = uncivil_word_tweet(clean_punct(lemmatized), uncivil_lexicon)

    return (per_sentence, per_sentence_score, per_tweet, per_tweet_score)


In [None]:
def process_chunk(texts, workers=42):
    """Run process() over `texts` in a worker pool and return the results in input order.

    Args:
        texts: Sequence of texts to score (len() is used for the progress bar total).
        workers: Number of worker processes in the pool.

    Returns:
        A list in which element i is process(texts[i]).
    """
    with Pool(processes=workers) as pool:
        # Ordered imap, not imap_unordered: chunk_gogogo assigns this list to the
        # chunk's rows by position, so results must line up with `texts`. With
        # imap_unordered they arrive in completion order and can land on the wrong rows.
        results = list(tqdm(pool.imap(process, texts), total=len(texts)))
    return results

def chunk_gogogo(df, text_column, chunk_size=50000, workers=42):
    """Add the four incivility feature columns to `df`, working through it in row chunks.

    Each chunk of `chunk_size` rows is copied, its `text_column` values are scored
    with process_chunk(), and the resulting tuples are stored in the columns
    'per_sentence', 'per_sentence_score', 'per_tweet' and 'per_tweet_score'.

    Args:
        df: Input frame; it is not modified.
        text_column: Name of the column holding the texts to score.
        chunk_size: Number of rows handed to process_chunk() at a time.
        workers: Worker-process count forwarded to process_chunk().

    Returns:
        A new frame with a fresh 0..n-1 index containing the original columns plus
        the four feature columns. An empty `df` yields an empty frame that still
        carries the feature columns.
    """
    feature_cols = ['per_sentence', 'per_sentence_score', 'per_tweet', 'per_tweet_score']

    # With no rows the loop below never runs and pd.concat([]) would raise ValueError.
    if len(df) == 0:
        empty = df.reset_index(drop=True).copy()
        for col in feature_cols:
            empty[col] = pd.Series(dtype='float64')
        return empty

    results_all = []
    for i in range(0, len(df), chunk_size):
        chunk = df.iloc[i:i+chunk_size].copy()
        texts = chunk[text_column].tolist()
        # Positional assignment: results[k] must belong to the k-th row of `chunk`.
        results = process_chunk(texts, workers=workers)
        chunk[feature_cols] = results
        results_all.append(chunk)
    final_df = pd.concat(results_all, ignore_index=True)
    return final_df

In [None]:
#final_df_long = chunk_gogogo(df_long, text_column="bigram_text", chunk_size=50000, workers=42)


100%|██████████| 50000/50000 [00:41<00:00, 1208.17it/s]
100%|██████████| 50000/50000 [00:41<00:00, 1203.57it/s]
100%|██████████| 50000/50000 [00:41<00:00, 1206.94it/s]
100%|██████████| 50000/50000 [00:41<00:00, 1205.13it/s]
100%|██████████| 50000/50000 [00:40<00:00, 1228.75it/s]
100%|██████████| 50000/50000 [00:40<00:00, 1225.52it/s]
100%|██████████| 50000/50000 [00:40<00:00, 1227.75it/s]
100%|██████████| 50000/50000 [00:40<00:00, 1227.28it/s]
100%|██████████| 50000/50000 [00:40<00:00, 1229.89it/s]
100%|██████████| 50000/50000 [00:40<00:00, 1230.53it/s]
100%|██████████| 50000/50000 [00:40<00:00, 1225.26it/s]
100%|██████████| 50000/50000 [00:40<00:00, 1229.34it/s]
100%|██████████| 50000/50000 [00:40<00:00, 1228.66it/s]
100%|██████████| 50000/50000 [00:40<00:00, 1227.24it/s]
100%|██████████| 50000/50000 [00:40<00:00, 1229.55it/s]
100%|██████████| 50000/50000 [00:40<00:00, 1230.14it/s]
100%|██████████| 50000/50000 [00:40<00:00, 1228.08it/s]
100%|██████████| 50000/50000 [00:40<00:00, 1225.

In [None]:
#final_df_long.to_csv('/content/drive/MyDrive/Thesis/long_text_with_pos_text1.csv', index=False)
#print(final_df_long)

In [None]:
import pandas as pd
import cupy as cp
from cuml.cluster import KMeans as cuKMeans
from sklearn.preprocessing import StandardScaler

# Cluster the long texts into two groups on the four incivility features, then
# draw 50 texts per cluster and export them for manual labelling.
cols = ['per_sentence', 'per_sentence_score', 'per_tweet', 'per_tweet_score']
# .copy() so the 'cluster' column added below is written to an independent frame.
df_long = df_long.dropna(subset=cols).copy()
X = df_long[cols].values

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_gpu = cp.asarray(X_scaled)

kmeans = cuKMeans(n_clusters=2, random_state=42)
df_long['cluster'] = kmeans.fit_predict(X_gpu).get()

# Sample each cluster by iterating over the groups instead of groupby(...).apply(...):
# apply() on a frame that still holds the grouping column is deprecated in pandas.
# Groups are visited in the same sorted-key order and each one is sampled with the
# same seed, so the rows drawn (and their order) match the previous apply-based code.
sampled_df = pd.concat(
    [group.sample(50, random_state=42) for _, group in df_long.groupby('cluster')],
    ignore_index=True,
)
sampled_df['id'] = ['sampled_' + str(i).zfill(4) for i in range(len(sampled_df))]
sampled_df = sampled_df[['id', 'text', 'cluster']]
sampled_shuffled = sampled_df.sample(frac=1, random_state=42).reset_index(drop=True)
# Only the text column is written out; 'id' and 'cluster' stay in memory in `sampled_shuffled`.
sampled_shuffled['text'].to_csv('/content/drive/MyDrive/Thesis/gold_label_Long.csv', index=False)

  return func(*args, **kwargs)
  sampled_df = df_long.groupby('cluster').apply(lambda x: x.sample(50, random_state=42)).reset_index(drop=True)


In [None]:
# Compare the cluster assignment of the 100 sampled long texts with hand-made gold labels.
# NOTE(review): assumes the gold labels follow the row order of `sampled_shuffled` - confirm.
from sklearn.metrics import classification_report

# Cluster id of every sampled text, taken from the sample built in the clustering cell above
# (previously this name was only defined further down, in the short-text section).
cluster_labels = sampled_shuffled['cluster'].tolist()

#0 is civil 1 is uncivil
gold_label1 = [
    1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0,
    1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0,
    0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
    1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
    1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1
]

#1 is civil 0 is uncivil
gold_label2 = [
    0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1,
    0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1,
    1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
    0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0,
    0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0
]

# gold_label2 is the 0/1 complement of gold_label1, so both possible meanings of the
# cluster ids are scored. target_names are applied to the label values in sorted order
# (0 first, then 1), so they must follow each list's own coding.
report1 = classification_report(gold_label1, cluster_labels, target_names=["Civil", "Uncivil"])
print(report1)
report1 = classification_report(gold_label2, cluster_labels, target_names=["Uncivil", "Civil"])
print(report1)

              precision    recall  f1-score   support

     Uncivil       0.76      0.70      0.73        54
       Civil       0.68      0.74      0.71        46

    accuracy                           0.72       100
   macro avg       0.72      0.72      0.72       100
weighted avg       0.72      0.72      0.72       100



In [None]:
"""
final_df_short = chunk_gogogo(df_short, text_column="bigram_text", chunk_size=50000, workers=42)
final_df_short.to_csv('/content/drive/MyDrive/Thesis/short_text_with_pos_text1.csv', index=False)
print(final_df_short)
"""

100%|██████████| 50000/50000 [02:07<00:00, 392.98it/s]
100%|██████████| 50000/50000 [02:06<00:00, 396.51it/s]
100%|██████████| 50000/50000 [02:05<00:00, 399.07it/s]
100%|██████████| 50000/50000 [02:03<00:00, 405.81it/s]
100%|██████████| 50000/50000 [02:03<00:00, 405.39it/s]
100%|██████████| 50000/50000 [02:03<00:00, 405.75it/s]
100%|██████████| 50000/50000 [02:02<00:00, 406.86it/s]
100%|██████████| 50000/50000 [02:03<00:00, 405.15it/s]
100%|██████████| 50000/50000 [02:03<00:00, 404.73it/s]
100%|██████████| 50000/50000 [02:02<00:00, 407.93it/s]
100%|██████████| 50000/50000 [02:02<00:00, 407.07it/s]
100%|██████████| 50000/50000 [02:02<00:00, 406.51it/s]
100%|██████████| 50000/50000 [02:03<00:00, 405.45it/s]
100%|██████████| 50000/50000 [02:02<00:00, 406.87it/s]
100%|██████████| 50000/50000 [02:03<00:00, 405.72it/s]
100%|██████████| 50000/50000 [02:03<00:00, 405.44it/s]
100%|██████████| 50000/50000 [02:06<00:00, 394.88it/s]
100%|██████████| 50000/50000 [02:06<00:00, 394.76it/s]
100%|█████

                                                      text  viewCount  \
0        @GOP @SheehyforMT Interesting perspective! But...        3.0   
1        @co_rapunzel4 @MSNBC @nbc @CBS @ABC @CNN I hav...       16.0   
2        @Mayor_Steinberg I want to share my blood and ...       81.0   
3        @ErrataRob @GeorgeOu Correct, the gop makes it...       36.0   
4        @BidensWins Are you going to change your Twitt...        3.0   
...                                                    ...        ...   
2999979  Lord Hannan highlights issues with the Conserv...      109.0   
2999980  @JoeBiden I love this! 🙏🏽❤️😥💙💙💙💙Thank you. Pre...        4.0   
2999981  President Joe Biden should not miss this chanc...        9.0   
2999982  @MaryDou80139756 @aurorabrshealis @VP She was ...       49.0   
2999983  @EdKrassen Did you just abandon his majesty Jo...        8.0   

         likeCount  quoteCount  replyCount  retweetCount  char_count  \
0              0.0         0.0         0.0         

**Short Text**

In [None]:
from csv import field_size_limit
import pandas as pd
import cupy as cp
from cuml.cluster import KMeans as cuKMeans
from sklearn.preprocessing import StandardScaler
# Cluster the short texts into two groups on the four incivility features, then
# draw 50 texts per cluster and export them for manual labelling.
file_path = '/content/drive/MyDrive/Thesis/short_text_with_pos_text1.csv'
df_final_short = pd.read_csv(file_path, low_memory= False)
cols = ['per_sentence', 'per_sentence_score', 'per_tweet', 'per_tweet_score']
# .copy() so the 'cluster' column added below is written to an independent frame
# rather than to a filtered view of df_final_short.
df_short = df_final_short.dropna(subset=cols).copy()
X = df_short[cols].values

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_gpu = cp.asarray(X_scaled)

kmeans = cuKMeans(n_clusters=2, random_state=42)
df_short['cluster'] = kmeans.fit_predict(X_gpu).get()

# Sample each cluster by iterating over the groups instead of groupby(...).apply(...):
# apply() on a frame that still holds the grouping column is deprecated in pandas.
# Groups are visited in the same sorted-key order and each one is sampled with the
# same seed, so the rows drawn (and their order) match the previous apply-based code.
sampled_df = pd.concat(
    [group.sample(50, random_state=42) for _, group in df_short.groupby('cluster')],
    ignore_index=True,
)
sampled_df['id'] = ['sampled_' + str(i).zfill(4) for i in range(len(sampled_df))]
sampled_df = sampled_df[['id', 'text', 'cluster']]
sampled_shuffled = sampled_df.sample(frac=1, random_state=42).reset_index(drop=True)
# Only the text column is written out; 'id' and 'cluster' stay in memory in `sampled_shuffled`.
sampled_shuffled['text'].to_csv('/content/drive/MyDrive/Thesis/gold_label_Short.csv', index=False)

  return func(*args, **kwargs)
  sampled_df = df_short.groupby('cluster').apply(lambda x: x.sample(50, random_state=42)).reset_index(drop=True)


In [None]:
# Compare the cluster assignment of the 100 sampled short texts with hand-made gold labels.
# NOTE(review): assumes the gold labels follow the row order of `sampled_shuffled` - confirm.
#0 is civil 1 is uncivil
gold_label1 =  [
    0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
    0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
    1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
    0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
    0, 0, 1, 0, 0, 1, 0, 0, 1, 1,
    1, 1, 0, 0, 0, 0, 1, 0, 1, 1,
    0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
    0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
    1, 0, 0, 1, 0, 1, 0, 1, 1, 1
]

#1 is civil 0 is uncivil
gold_label2 = [
    1, 1, 0, 1, 1, 0, 1, 1, 1, 1,
    1, 1, 1, 0, 1, 1, 0, 1, 1, 1,
    0, 1, 1, 0, 1, 1, 0, 1, 1, 1,
    1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
    1, 1, 0, 1, 1, 0, 1, 1, 0, 0,
    0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
    1, 0, 1, 1, 1, 1, 1, 0, 1, 1,
    1, 0, 0, 1, 1, 1, 1, 1, 0, 1,
    0, 1, 1, 0, 1, 0, 1, 0, 0, 0
]

from sklearn.metrics import classification_report
cluster_labels = sampled_shuffled['cluster'].tolist()

# gold_label2 is the 0/1 complement of gold_label1, so both possible meanings of the
# cluster ids are scored. target_names are applied to the label values in sorted order
# (0 first, then 1), so they must follow each list's own coding.
report2 = classification_report(gold_label1, cluster_labels, target_names=["Civil", "Uncivil"])
print(report2)
report2 = classification_report(gold_label2, cluster_labels, target_names=["Uncivil", "Civil"])
print(report2)

              precision    recall  f1-score   support

     Uncivil       0.74      0.53      0.62        70
       Civil       0.34      0.57      0.42        30

    accuracy                           0.54       100
   macro avg       0.54      0.55      0.52       100
weighted avg       0.62      0.54      0.56       100

              precision    recall  f1-score   support

     Uncivil       0.26      0.43      0.33        30
       Civil       0.66      0.47      0.55        70

    accuracy                           0.46       100
   macro avg       0.46      0.45      0.44       100
weighted avg       0.54      0.46      0.48       100



# **Supervised Machine Learning Failed**
**Problem 1**: The dataset is severely imbalanced — the number of civil tweets is approximately 30 times greater than uncivil ones (around 3,000 vs. 189).

**Problem 2**: The paper claims that 6,000 rows were manually labeled. However, neither of the datasets provided (CLAPTON_augmented.csv and Twitter Deliberative Politics.csv) actually contains 6,000 labeled entries. In fact, most of the labels present appear to be incorrect. It seems that the authors may have referred to the Twitter Deliberative Politics.csv file, assuming the full dataset was hand-labeled, but this is inconsistent with what I observed.

**Problem 3**: Given the poor label quality, I find the classification results reported in the paper to be unreliable. The large number of incorrect or missing labels seriously undermines the dataset's validity for supervised machine learning tasks. The models perform very poorly on the authors' own data: performance on classifying uncivil messages is consistently below 15%.

I replicated the model training process as described in the article, but the performance was significantly below expectations. Even after incorporating weighted features for uncivil sentences and tweets, there was no meaningful improvement in the results.

I experimented with various models, including Random Forest, MLP, Bayesian and Logistic Regression. However, the precision for classifying uncivil tweets consistently remained below 15%, which I consider unacceptably low.

The research that I replicate is: https://doi.org/10.1093/joc/jqz023
According to the paper, "we operationalized incivility in terms of the daily number of uncivil tweets or the average percentage proportion of uncivil words per tweet, as predicted by different machine learning and dictionary-based lexical methods."

I used SMOTE to reduce the effect of class imbalance on the machine learning models.

Feature sets that I have tried so far (all below 15% on classifying uncivil tweets):
1. Percentages based on tweets and text
2. Percentages based on tweets and text, plus each sentence score and each tweet score
3. Word embeddings * TF-IDF, combined with either feature set 1 or feature set 2
4. TF-IDF vectors of the tweets, combined with either feature set 1 or feature set 2

In [None]:
# `final_df_long` is only created by the chunk_gogogo(df_long, ...) call, which is
# commented out earlier in this notebook, so guard the write instead of failing
# with a NameError on a fresh kernel.
if 'final_df_long' in globals():
    final_df_long.to_csv("/content/drive/MyDrive/Thesis/long_text_with_pos_text1.csv", index=False)
else:
    print("final_df_long is not defined - run chunk_gogogo on df_long first; nothing was written.")


In [None]:
# Data modelling: read the 'Twitter Deliberative Politics' CSV (decoded as ISO-8859-1)
# and keep its raw 'message' column as a list.
paper_path = '/content/drive/MyDrive/Thesis/Twitter Deliberative Politics.csv'
df_paper = pd.read_csv(
    paper_path,
    low_memory=False,
    encoding='ISO-8859-1',
)

text = df_paper.loc[:, 'message'].to_list()


#text preprocessing
import re

def preprocessing(text_list):
    """Normalize a list of raw tweet strings.

    Each text is lowercased, then stripped of (in this order): URLs, #hashtags,
    @mentions, textual byte/code-point escapes such as ``<f0>`` and ``<U+009F>``,
    leading punctuation, and leading dashes. Finally, runs of whitespace are
    collapsed to one space and the ends are trimmed.

    Args:
        text_list: Iterable of str.

    Returns:
        A new list with one cleaned string per input, in the same order.
    """
    # Code-point escapes are hexadecimal (e.g. <U+009F>, <U+00A0>), so match hex
    # digits rather than decimal digits only; allow 4 to 8 of them.
    hex_escape = r"<u\+[0-9a-f]{4,8}>"
    emoji_bytes = re.compile(r"<f0>(?:" + hex_escape + r"){3}", flags=re.IGNORECASE)
    single_escape = re.compile(hex_escape, flags=re.IGNORECASE)
    lead_byte = re.compile(r"<f0>", flags=re.IGNORECASE)

    cleaned = []
    for text in text_list:
        text = text.lower()
        text = re.sub(r"http\S+|www\S+|https\S+", "", text)
        text = re.sub(r"#\w+", "", text)
        text = re.sub(r"@\w+", "", text)
        text = emoji_bytes.sub("", text)
        text = single_escape.sub("", text)
        text = lead_byte.sub("", text)
        text = re.sub(r'^[^\w\s]+', '', text) #the punctuation comes first
        text = re.sub(r'^-+\s*', '', text)
        text = re.sub(r"\s+", " ", text).strip()
        cleaned.append(text)
    return cleaned

# Clean every message (missing values become empty strings) and store the result next to the raw text.
raw_messages = df_paper['message'].fillna("").astype(str)
df_paper['message_cleaned'] = preprocessing(raw_messages.tolist())

In [None]:
# Compute the four incivility features for the cleaned paper messages.
final_df_paper = chunk_gogogo(df_paper, text_column="message_cleaned", chunk_size=50000, workers=42)
# NOTE(review): this overwrites the same CSV that the data-modelling cell reads with
# encoding='ISO-8859-1', while to_csv here uses its default encoding - confirm that a
# re-run of that cell still decodes the rewritten file as intended.
final_df_paper.to_csv("/content/drive/MyDrive/Thesis/Twitter Deliberative Politics.csv", index=False)


100%|██████████| 5585/5585 [00:02<00:00, 2260.65it/s]


In [None]:
# Show only the first rows through the notebook's rich display instead of printing the frame.
final_df_paper.head()

      message_id                                            message  \
0              1  @USER- #GrahamCassidy will devastate #Military...   
1              2  @USER- The US people &amp; Minnesotans must se...   
2              4  =@USER - "we all want the same thing when you ...   
3              5  @USER - A poison in our island - Rising seas c...   
4              6  =@USER - hypocrite. You are A porn surfer and ...   
...          ...                                                ...   
5580        5984           @USER/please stand up for our democracy!   
5581        5986   @USER: $3,124,273 from the NRA during her career   
5582        5987  @USER: Glad you bribed well to cheat, lie, sch...   
5583        5988  @USER: Stand By Your Ad should be applied to o...   
5584        5989  @USER: You must call for the appointment of a ...   

      Constructiveness  Justification  Justification_internal  \
0                    0              1                       0   
1                

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE

# Baseline: class-weighted logistic regression that predicts 'Uncivil_abuse'
# from the four lexicon-based features only.
feature_columns = ['per_sentence', 'per_sentence_score', 'per_tweet', 'per_tweet_score']

df = pd.read_csv('/content/drive/MyDrive/Thesis/Twitter Deliberative Politics.csv')
# Rows lacking the label or any feature are dropped before modelling.
df = df.dropna(subset=['Uncivil_abuse', *feature_columns])

X = df[feature_columns].to_numpy()
y = df['Uncivil_abuse'].astype(int).to_numpy()

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

model = LogisticRegression(solver='liblinear', class_weight='balanced', max_iter=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))



Accuracy: 0.47795414462081126
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.47      0.61       491
           1       0.14      0.54      0.22        76

    accuracy                           0.48       567
   macro avg       0.50      0.50      0.41       567
weighted avg       0.77      0.48      0.56       567

