<a href="https://colab.research.google.com/github/KkilianJ/Thesis/blob/main/Postag_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
long_text = '/content/drive/MyDrive/Thesis/long_text_with_pos_text1.csv'
short_text = '/content/drive/MyDrive/Thesis/short_text_with_pos_text1.csv'
df_long = pd.read_csv(long_text, low_memory=False)
df_short = pd.read_csv(short_text, low_memory = False)

In [9]:
import pandas as pd
import spacy
from collections import Counter
from tqdm import tqdm

nlp = spacy.load("en_core_web_sm", disable=["ner", "textcat", "lemmatizer"])
nlp.add_pipe("sentencizer")

df_short = pd.read_csv('/content/drive/MyDrive/Thesis/short_text_with_pos_text1.csv')
df_long  = pd.read_csv('/content/drive/MyDrive/Thesis/long_text_with_pos_text1.csv')

short_texts = df_short['bigram_text'].astype(str).tolist()
long_texts  = df_long['bigram_text'].astype(str).tolist()

def extract_pos_sequences(texts):
    seqs = []
    for doc in tqdm(nlp.pipe(texts, batch_size=1000, n_process=42), total=len(texts)):
        for sent in doc.sents:
            tags = [tok.pos_ for tok in sent if not tok.is_punct and tok.pos_ != "SPACE"]
            if tags:
                seqs.append(" ".join(tags))
    return seqs

short_seqs = extract_pos_sequences(short_texts)
long_seqs  = extract_pos_sequences(long_texts)

c_short = Counter(short_seqs)
c_long  = Counter(long_seqs)
n_short = sum(c_short.values())
n_long  = sum(c_long.values())

rows = []
for seq in set(c_short) | set(c_long):
    f_s = c_short.get(seq, 0)
    f_l = c_long.get(seq, 0)
    p_s = f_s / n_short * 100
    p_l = f_l / n_long * 100
    diff = p_l - p_s
    rows.append((seq, f_s, f_l, round(p_s, 3), round(p_l, 3), round(diff, 3)))

df = pd.DataFrame(rows, columns=["pos_sequence", "freq_short", "freq_long", "short_pct", "long_pct", "diff_pct"])
df_increase = df[df["diff_pct"] > 1].sort_values("diff_pct", ascending=False)
df_decrease = df[df["diff_pct"] < -1].sort_values("diff_pct")
df_neutral  = df[df["diff_pct"].abs() <= 1].sort_values("diff_pct", ascending=False)

print("- Increase POS Sequences -")
print(df_increase[["pos_sequence", "freq_short", "freq_long", "short_pct", "long_pct", "diff_pct"]].head(30).to_string(index=False))

print("- Decrease POS Sequences -")
print(df_decrease[["pos_sequence", "freq_short", "freq_long", "short_pct", "long_pct", "diff_pct"]].head(30).to_string(index=False))

print("- Neutral POS Sequences -")
print(df_neutral[["pos_sequence", "freq_short", "freq_long", "short_pct", "long_pct", "diff_pct"]].head(30).to_string(index=False))


100%|██████████| 2999984/2999984 [18:50<00:00, 2654.84it/s]
100%|██████████| 2999988/2999988 [23:02<00:00, 2169.60it/s]


- Increase POS Sequences -
Empty DataFrame
Columns: [pos_sequence, freq_short, freq_long, short_pct, long_pct, diff_pct]
Index: []
- Decrease POS Sequences -
Empty DataFrame
Columns: [pos_sequence, freq_short, freq_long, short_pct, long_pct, diff_pct]
Index: []
- Neutral POS Sequences -
                                                       pos_sequence  freq_short  freq_long  short_pct  long_pct  diff_pct
                                                               PRON        7246      18223      0.127     0.177     0.050
                                                              SCONJ        3941       9899      0.069     0.096     0.027
                                                                NUM        4430      10562      0.078     0.102     0.025
                                                           PRON AUX        2299       6347      0.040     0.061     0.021
                                                                  X        3321       7772      0.058 

In [31]:
df_increase = df[df.diff_pct > 0.005].sort_values("diff_pct", ascending=False)
df_decrease = df[df.diff_pct < -0.005].sort_values("diff_pct")
df_neutral  = df[df.diff_pct.abs() <= 0.005].sort_values("diff_pct", ascending=False)


In [32]:

print("- Increase POS Sequences -")
print(df_increase[["pos_sequence", "freq_short", "freq_long", "short_pct", "long_pct", "diff_pct"]].head(30).to_string(index=False))

print("- Decrease POS Sequences -")
print(df_decrease[["pos_sequence", "freq_short", "freq_long", "short_pct", "long_pct", "diff_pct"]].head(30).to_string(index=False))

print("- Neutral POS Sequences -")
print(df_neutral[["pos_sequence", "freq_short", "freq_long", "short_pct", "long_pct", "diff_pct"]].head(30).to_string(index=False))


- Increase POS Sequences -
                                                                                                                                       pos_sequence  freq_short  freq_long  short_pct  long_pct  diff_pct
                                                                                                                                               PRON        7246      18223      0.127     0.177     0.050
                                                                                                                                              SCONJ        3941       9899      0.069     0.096     0.027
                                                                                                                                                NUM        4430      10562      0.078     0.102     0.025
                                                                                                                                           PRON AUX        2299      

In [16]:
from collections import Counter

top_short = Counter(short_seqs).most_common(30)
top_long = Counter(long_seqs).most_common(30)

print("Top 10 POS sequences in SHORT tweets:")
for seq, freq in top_short:
    print(f"{seq} : {freq}")

print("\nTop 10 POS sequences in LONG tweets:")
for seq, freq in top_long:
    print(f"{seq} : {freq}")


Top 10 POS sequences in SHORT tweets:
NOUN : 47632
INTJ : 45799
PROPN : 43580
ADJ : 28992
ADJ NOUN : 28746
PROPN PROPN : 20956
ADV : 20147
VERB : 19515
PRON AUX ADJ : 16414
VERB PRON : 15711
PRON VERB : 11946
DET NOUN : 10581
PRON VERB DET NOUN : 10487
PROPN NOUN : 10234
PRON VERB PRON : 9700
NOUN NOUN : 9694
PRON AUX DET NOUN : 8690
VERB DET NOUN : 8437
ADV ADJ : 8174
PRON AUX VERB : 7970
PRON VERB ADJ : 7549
PROPN PROPN PROPN : 7341
VERB NOUN : 7261
PRON : 7246
ADP : 6905
PRON VERB NOUN : 6345
PRON AUX DET ADJ NOUN : 6207
PRON AUX ADV ADJ : 6207
INTJ INTJ : 5918
ADJ NOUN NOUN : 5596

Top 10 POS sequences in LONG tweets:
INTJ : 58268
NOUN : 56855
PROPN : 51708
ADJ : 32391
ADJ NOUN : 29538
ADV : 28365
PRON AUX ADJ : 23666
VERB : 22958
PRON VERB : 20672
PROPN PROPN : 19846
PRON : 18223
VERB PRON : 17955
PRON VERB DET NOUN : 17575
DET NOUN : 17340
PRON VERB PRON : 15623
PRON AUX DET NOUN : 13901
PRON AUX VERB : 12823
NOUN NOUN : 12744
VERB DET NOUN : 10681
PRON VERB ADJ : 10626
NUM : 105

In [13]:
print("Total POS sequences in short:", len(short_seqs))
print("Total POS sequences in long:", len(long_seqs))


Total POS sequences in short: 5705072
Total POS sequences in long: 10322264
