<a href="https://colab.research.google.com/github/KkilianJ/Thesis/blob/main/Trigram_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Mount Google Drive into the Colab filesystem at /content/drive; the CSV
# paths used by the loading cell below live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
import pandas as pd

# Locations of the two pre-processed tweet corpora on the mounted Drive.
long_text = '/content/drive/MyDrive/Thesis/long_text_with_pos_text.csv'
short_text = '/content/drive/MyDrive/Thesis/short_text_with_pos_text.csv'

# Load both files with identical options. low_memory=False is passed to
# pandas for each read, exactly as before (long corpus first, then short).
df_long, df_short = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (long_text, short_text)
)

                                                 text  viewCount  likeCount  \
0   @CynCity1243 @MikeRoy_Cit12 @GastricpH Likewis...       49.0        0.0   
1   @Megatron_ron France should take in Lebanese r...       84.0        1.0   
2   @TiltingNBA @majornija @nicole_naiya Heard thi...        4.0        0.0   
3   Guys.\n\nDonald Trump is quite literally the l...      159.0        6.0   
4   @realDonaldTrump The American people have endu...     2274.0       74.0   
5   @legitbrittFLA @TrumpDailyPosts Yes, the USA a...       19.0        1.0   
6   @2Strong2Silence Angry Staffer said the other ...       33.0        2.0   
7   Biden's Numbers: April 2024 Update, the S&amp;...      152.0        0.0   
8   @elonmusk MN Governor Tim Walz News...\n\nAs D...        3.0        0.0   
9   @RealChadEros @brianlilley @joe_warmington @Ju...       19.0        0.0   
10  @GenFlynn @patriottvnet @FlynnMovie @boonecutl...        5.0        0.0   
11  From MJTruth…\n\n• The newest hoax against Tru..

In [12]:
import re
from collections import Counter
from multiprocessing import Pool
from math import sqrt
from tqdm import tqdm

# Pull the 'bigram_text' column of each corpus out as a plain Python list so
# it can be sliced into chunks and shipped to worker processes.
long_tweets, short_tweets = (
    frame['bigram_text'].tolist() for frame in (df_long, df_short)
)

# Tokens that identify a candidate by name; any trigram containing one of
# them is excluded from the counts.
name_tokens = {
    "donald", "trump", "joe", "biden", "kamala", "harris",
    "j.d.", "vance", "tim", "walz", "robert", "kennedy",
}

def extract_trigrams(texts):
    """Count whitespace-delimited word trigrams across ``texts``.

    Parameters
    ----------
    texts : iterable
        Documents to scan. Entries that are not ``str`` (for example the
        float NaN pandas produces for an empty CSV cell, or ``None``) are
        skipped instead of raising ``AttributeError``.

    Returns
    -------
    collections.Counter
        Maps each trigram, joined with single spaces, to its number of
        occurrences. Trigrams containing the token ``"amp"`` or any token in
        ``name_tokens`` are not counted.
    """
    counts = Counter()
    for text in texts:
        # A single non-string row would otherwise crash the whole worker.
        if not isinstance(text, str):
            continue
        words = text.split()
        # zip over three offset views yields every consecutive word triple;
        # texts with fewer than three words yield nothing.
        for tri in zip(words, words[1:], words[2:]):
            if "amp" in tri or not name_tokens.isdisjoint(tri):
                continue
            counts[" ".join(tri)] += 1
    return counts

def count_trigrams(texts, processes=44, chunk_size=100_000):
    """Count trigrams over ``texts`` in parallel and merge the partial tallies.

    ``texts`` is cut into consecutive slices of ``chunk_size`` items, each
    slice is handed to ``extract_trigrams`` in a worker process, and the
    per-slice counters are summed.

    Parameters
    ----------
    texts : sequence
        Documents to process; must support ``len`` and slicing.
    processes : int or None, default 44
        Upper bound on the number of worker processes. No more workers than
        chunks are started. ``None`` is passed through to ``Pool`` unchanged.
    chunk_size : int, default 100_000
        Number of documents per work unit; must be at least 1.

    Returns
    -------
    collections.Counter
        Combined trigram counts. Empty when ``texts`` is empty.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.
    """
    # A negative step would silently produce no chunks at all, and zero would
    # fail inside range() with an unrelated-looking message.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    chunks = [texts[i:i+chunk_size] for i in range(0, len(texts), chunk_size)]
    summary = Counter()
    if not chunks:
        # Nothing to count: avoid starting a process pool for no work.
        return summary

    # Workers beyond the number of chunks would never receive a task.
    workers = processes if processes is None else min(processes, len(chunks))
    with Pool(workers) as pool:
        for part in tqdm(pool.imap(extract_trigrams, chunks), total=len(chunks), desc="Counting trigrams"):
            summary.update(part)
    return summary

def compute_t_scores(short_counts, long_counts, min_freq=50, threshold=2.0):
    """Score each trigram by how its relative frequency differs between corpora.

    For a trigram with counts ``fs`` (short) and ``fl`` (long) and corpus
    totals ``Ns`` and ``Nl``, the relative frequencies are ``ps = fs / Ns``
    and ``pl = fl / Nl`` and the score is::

        t = (pl - ps) / sqrt(ps / Ns + pl / Nl)

    A positive score therefore means the trigram is relatively more frequent
    in ``long_counts``.

    Parameters
    ----------
    short_counts, long_counts : mapping of str -> int
        Trigram counts for the two corpora (e.g. ``collections.Counter``).
    min_freq : int, default 50
        Trigrams whose combined count ``fs + fl`` is below this are dropped.
    threshold : float, default 2.0
        Scores ``>= threshold`` are labelled ``"increase"``, scores
        ``<= -threshold`` are labelled ``"decrease"``, everything else is
        ``"neutral"``.

    Returns
    -------
    pandas.DataFrame
        Columns ``trigram``, ``t_score``, ``freq_short``, ``freq_long`` and
        ``category``; one row per retained trigram, in no particular order.
        Empty when either corpus has a total count of zero, because relative
        frequencies are undefined in that case.
    """
    columns = ['trigram','t_score','freq_short','freq_long','category']
    Ns = sum(short_counts.values())
    Nl = sum(long_counts.values())
    # Dividing by an empty corpus total would raise ZeroDivisionError below.
    if Ns == 0 or Nl == 0:
        return pd.DataFrame([], columns=columns)

    records = []
    all_tris = set(short_counts) | set(long_counts)
    for tri in tqdm(all_tris, desc="Scoring trigrams"):
        fs = short_counts.get(tri, 0)
        fl = long_counts.get(tri, 0)
        if fs + fl < min_freq:
            continue
        ps, pl = fs / Ns, fl / Nl
        var = ps / Ns + pl / Nl
        # Only reachable when both counts are zero (possible with min_freq <= 0).
        if var <= 0:
            continue
        t = (pl - ps) / sqrt(var)
        if t >= threshold:
            cat = "increase"
        elif t <= -threshold:
            cat = "decrease"
        else:
            cat = "neutral"
        records.append((tri, t, fs, fl, cat))
    return pd.DataFrame(records, columns=columns)

# Tally trigrams for each corpus (short first, then long), then score every
# trigram that clears the default minimum frequency.
short_counts, long_counts = count_trigrams(short_tweets), count_trigrams(long_tweets)
df_results = compute_t_scores(short_counts, long_counts)

# Split the scored trigrams into one frame per category label.
df_increase = df_results.loc[df_results['category'] == 'increase']
df_decrease = df_results.loc[df_results['category'] == 'decrease']
df_neutral = df_results.loc[df_results['category'] == 'neutral']



Counting trigrams: 100%|██████████| 30/30 [00:30<00:00,  1.03s/it]
Counting trigrams: 100%|██████████| 30/30 [01:23<00:00,  2.79s/it]
Scoring trigrams: 100%|██████████| 57131841/57131841 [00:59<00:00, 963200.00it/s] 


                    trigram    t_score  freq_short  freq_long  category
2            started no new   3.327142          16         84  increase
5            people who put   2.138410          40        140  increase
8            no interest in   2.752489         305        893  increase
15               i use that   4.335906          10         75  increase
34            opened up the   3.058376          80        281  increase
35             have to like   2.311672         141        429  increase
59              and plot to  11.895855           1        152  increase
74         us citizens have   5.465583          21        141  increase
87              in a recent   3.859284         168        566  increase
97            isnt just the   3.615472          41        175  increase
108            but when you   4.293915         196        666  increase
113      most successful in   2.252051          10         48  increase
114        in politics they   2.421957          10         50  i

In [15]:
# Print the 50 most extreme rows of each category. The frames hold trigram
# scores, so the headings say "Trigram" (they previously said "Bigram").

# Highest t-scores first.
print("Trigram Increase")
print(df_increase.sort_values("t_score", ascending=False).head(50))

# Most negative t-scores first (ascending sort).
print("Trigram Decrease")
print(df_decrease.sort_values("t_score").head(50))

# Neutral rows closest to the upper cut-off first.
print("Trigram Neutral")
print(df_neutral.sort_values("t_score", ascending=False).head(50))



Bigram Increase
                           trigram    t_score  freq_short  freq_long  category
51335       delivered the opposite  38.640693           1       1504  increase
192706         delivered peace and  38.601855           1       1501  increase
256484         place everything is  38.601855           1       1501  increase
59364         the place everything  38.588901           1       1500  increase
102084                 now 30 more  38.588901           1       1500  increase
124998              took office 15  38.550011           1       1497  increase
223330        when bidenharis took  38.537039           1       1496  increase
170555      bidenharis took office  38.537039           1       1496  increase
219159         was when bidenharis  38.537039           1       1496  increase
233525         and prosperity when  38.526623           2       1506  increase
204486          immigrants some of  38.524063           1       1495  increase
144308           office 15 million  