In [1]:
from __future__ import print_function
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
import torch
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Pick the torch device: the GPU when it is both requested and present,
# otherwise fall back to the CPU.
use_cuda = True
cuda_available = torch.cuda.is_available()
print("CUDA is available:", cuda_available)
if use_cuda and cuda_available:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

CUDA is available: True


In [3]:
from datasets import load_dataset

# Load the combined GossipCop/PolitiFact test set by its dataset identifier.
# The printed DatasetDict shows a single split, named "train", holding the rows.
test_dataset = load_dataset("lelexuanzz/Gossipcop_Politifact_Test")
print(test_dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 6900
    })
})


In [4]:
# Convert the only split ("train") to a pandas DataFrame so it can be cleaned
# and given extra embedding columns further down.
test_df = test_dataset["train"].to_pandas()
display(test_df)

Unnamed: 0,text,label
0,The media reported on “5 TikTok dances you can...,0
1,"""Over 240,000 'unverified' ballots have alread...",0
2,"Says ""Ron Johnson is making excuses for rioter...",1
3,“We have seen over the last 10 years ... under...,0
4,“I don’t get involved in the hiring and firing...,0
...,...,...
6895,Wedding Album: Dancing with the Stars Pro Lind...,0
6896,WATCH: Sneak Peek: Arizona's Furious Alex Atta...,0
6897,Mary Kay Letourneau 'Hopeful' She Can Fix Marr...,0
6898,Charlize Theron still upset Aeon Flux didn’t w...,0


### Clean Dataset

In [5]:
# Clean the test frame: remove rows with missing values, then exact duplicate
# rows. The result is rebound rather than mutated in place, and the index is
# renumbered so it stays a gap-free 0..n-1 range after rows are removed
# (a gapped index would otherwise be carried into Dataset.from_pandas as an
# extra `__index_level_0__` column).
test_df = (
    test_df
    .dropna(axis=0)
    .drop_duplicates()
    .reset_index(drop=True)
)
display(test_df)

Unnamed: 0,text,label
0,The media reported on “5 TikTok dances you can...,0
1,"""Over 240,000 'unverified' ballots have alread...",0
2,"Says ""Ron Johnson is making excuses for rioter...",1
3,“We have seen over the last 10 years ... under...,0
4,“I don’t get involved in the hiring and firing...,0
...,...,...
6895,Wedding Album: Dancing with the Stars Pro Lind...,0
6896,WATCH: Sneak Peek: Arizona's Furious Alex Atta...,0
6897,Mary Kay Letourneau 'Hopeful' She Can Fix Marr...,0
6898,Charlize Theron still upset Aeon Flux didn’t w...,0


### Convert to stylometric features

In [6]:
import sys
import os
sys.path.append(os.path.abspath("../feat_eng"))

from feat_eng import has_quotes, has_url, percent_uppercase, frequency_punctuation, percent_whitespace, frequency_words_length, avg_sentence_length



def extract_stylometric_features(example):
    """Compute the stylometric feature columns for one dataset row.

    Intended to be passed to ``Dataset.map``; the keys of the returned dict
    become columns of the mapped dataset.

    Args:
        example: Mapping with a ``'text'`` entry (the text to analyse, or
            ``None``) and, optionally, a ``'label'`` entry.

    Returns:
        dict: The row's ``label`` (``None`` when the key is absent) followed
        by ten stylometric features. When ``text`` is ``None`` every feature
        is set to its zero value and no feature helper is called.

    Raises:
        KeyError: If ``example`` has no ``'text'`` entry.
    """
    text = example['text']
    label = example.get("label")

    # Identity check instead of `== None`: equality can be redefined by the
    # compared object, identity cannot.
    if text is None:
        return {
            "label": label,
            "has_quotes": 0,
            "has_url": 0,
            "percent_uppercase": 0.0,
            "frequency_punctuation": 0,
            "percent_whitespace": 0.0,
            "frequency_words_length_15": 0,
            "frequency_words_length_14": 0,
            "frequency_words_length_12": 0,
            "frequency_words_length_11": 0,
            "avg_sentence_length": 0.0,
        }

    return {
        "label": label,
        "has_quotes": has_quotes(text),
        "has_url": has_url(text),
        "percent_uppercase": percent_uppercase(text),
        "frequency_punctuation": frequency_punctuation(text),
        "percent_whitespace": percent_whitespace(text),
        # Counts for a few specific long word lengths (15, 14, 12 and 11).
        "frequency_words_length_15": frequency_words_length(text, 15),
        "frequency_words_length_14": frequency_words_length(text, 14),
        "frequency_words_length_12": frequency_words_length(text, 12),
        "frequency_words_length_11": frequency_words_length(text, 11),
        "avg_sentence_length": avg_sentence_length(text),
    }

0


In [7]:
from datasets import Dataset

# Convert the cleaned DataFrame back into a Hugging Face Dataset.
# preserve_index=False keeps the pandas index out of the result, so no extra
# `__index_level_0__` column appears when cleaning has removed rows.
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

# Append the stylometric feature columns to every row.
stylo_test = test_dataset.map(extract_stylometric_features)

print(stylo_test)


Map: 100%|██████████| 6900/6900 [00:07<00:00, 927.37 examples/s] 

Dataset({
    features: ['text', 'label', 'has_quotes', 'has_url', 'percent_uppercase', 'frequency_punctuation', 'percent_whitespace', 'frequency_words_length_15', 'frequency_words_length_14', 'frequency_words_length_12', 'frequency_words_length_11', 'avg_sentence_length'],
    num_rows: 6900
})





In [8]:
# # Push to hub
# stylo_test.push_to_hub("lelexuanzz/Gossipcop_Politifact_Test_Stylo")

## Vector Embeddings


### RoBERTa

### CBOW-W2V

In [18]:
from gensim.models import Word2Vec

### BOW-TFIDF 

In [None]:
#Tfidf
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# TF-IDF features over 1- to 3-grams with English stop words removed; terms
# whose document frequency is below 5% or above 95% are dropped.
# NOTE(review): the vocabulary and IDF weights are fitted on this test split
# itself - confirm that is intended rather than reusing a vectorizer fitted on
# the training data.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
    stop_words='english',
    max_df=0.95,
    min_df=0.05,
)
tfidf_embeddings = tfidf_vectorizer.fit_transform(test_df["text"])
print(tfidf_embeddings.shape)


(6900, 1169)


In [10]:
# Inspect the sparse TF-IDF matrix; the printed form lists the stored
# (row, column) coordinates with their weights.
print(tfidf_embeddings)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 930212 stored elements and shape (6900, 1169)>
  Coords	Values
  (0, 651)	0.17095999921182603
  (0, 898)	0.08547999960591302
  (0, 488)	0.18282260236668418
  (0, 410)	0.25094353635864913
  (0, 987)	0.08752737992315925
  (0, 481)	0.12926393340734607
  (0, 884)	0.09363649008840541
  (0, 531)	0.16655547350541117
  (0, 930)	0.08363177494944153
  (0, 1093)	0.19299719617363553
  (0, 70)	0.10572111171340268
  (0, 1027)	0.2501857363995298
  (0, 397)	0.047357184997965776
  (0, 379)	0.046229405877630685
  (0, 196)	0.04711218863007459
  (0, 373)	0.2883236599478017
  (0, 813)	0.0596010690357604
  (0, 1014)	0.04779232965612915
  (0, 720)	0.09775046970425214
  (0, 43)	0.0978286176308111
  (0, 803)	0.10831420774686594
  (0, 331)	0.0956263254821396
  (0, 1083)	0.0927461442930606
  (0, 3)	0.07012585544380184
  (0, 1091)	0.10640651716020913
  :	:
  (6899, 930)	0.07211581711202476
  (6899, 1064)	0.09382929076926043
  (6899, 259)	0.212108223575

In [11]:
# Reduce the TF-IDF matrix with truncated SVD rather than PCA, because the
# TF-IDF embeddings are sparse.

from sklearn.decomposition import TruncatedSVD

# TruncatedSVD's default solver is randomized; fixing random_state makes the
# 300-dimensional embeddings identical on every run of the notebook.
svd = TruncatedSVD(n_components=300, random_state=42)
tfidf_reduced = svd.fit_transform(tfidf_embeddings)
print(tfidf_reduced.shape)
print(tfidf_reduced)

(6900, 300)
[[ 3.22305910e-01 -2.83119499e-02  5.49363491e-01 ... -2.49494207e-02
  -1.09051313e-02 -2.97663462e-02]
 [ 3.08578306e-01  1.19417193e-01  4.53342970e-02 ... -2.91946043e-02
  -8.61015898e-03 -2.18142578e-02]
 [ 4.23304385e-01  7.86274095e-02  5.77382921e-02 ... -1.36810837e-03
  -1.18480601e-02  6.71044581e-03]
 ...
 [ 3.00126854e-01 -2.62906111e-01 -8.32331958e-02 ...  8.97199292e-03
   3.00093198e-03  1.89377183e-02]
 [ 1.85983759e-01 -2.66019185e-01 -9.32809604e-02 ...  3.36721940e-02
   1.70868644e-02 -3.90687886e-04]
 [ 1.41732095e-01 -1.63308575e-01 -6.91962397e-02 ...  7.09667882e-03
  -1.08683746e-02  2.42624694e-02]]


In [12]:
# Store each row of the reduced TF-IDF matrix as that row's value in a new
# 'tfidf_embedding' column; list() splits the 2-D array into one array per row.
test_df['tfidf_embedding'] = list(tfidf_reduced)
display(test_df)

Unnamed: 0,text,label,tfidf_embedding
0,The media reported on “5 TikTok dances you can...,0,"[0.32230591024075655, -0.02831194990679292, 0...."
1,"""Over 240,000 'unverified' ballots have alread...",0,"[0.3085783060828361, 0.11941719271543562, 0.04..."
2,"Says ""Ron Johnson is making excuses for rioter...",1,"[0.4233043848185984, 0.0786274094923878, 0.057..."
3,“We have seen over the last 10 years ... under...,0,"[0.32351932184322646, 0.14350347899320803, -0...."
4,“I don’t get involved in the hiring and firing...,0,"[0.2171402326873496, -0.01335753132863769, 0.0..."
...,...,...,...
6895,Wedding Album: Dancing with the Stars Pro Lind...,0,"[0.14163919484470372, -0.18715353303969257, -0..."
6896,WATCH: Sneak Peek: Arizona's Furious Alex Atta...,0,"[0.1008928783751652, -0.15370171050340922, -0...."
6897,Mary Kay Letourneau 'Hopeful' She Can Fix Marr...,0,"[0.3001268539564991, -0.2629061111938999, -0.0..."
6898,Charlize Theron still upset Aeon Flux didn’t w...,0,"[0.18598375894520147, -0.26601918535391794, -0..."


### BOW

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts over 1- to 3-grams with English stop words removed;
# terms whose document frequency is below 5% or above 95% are dropped
# (the same settings as the TF-IDF vectorizer).
bow_vectorizer = CountVectorizer(
    ngram_range=(1, 3),
    stop_words='english',
    max_df=0.95,
    min_df=0.05,
)
bow_embeddings = bow_vectorizer.fit_transform(test_df["text"])
print(bow_embeddings.shape)

(6900, 1169)


In [14]:
# Inspect the sparse bag-of-words matrix; the printed form lists the stored
# (row, column) coordinates with their integer counts.
print(bow_embeddings)

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 930212 stored elements and shape (6900, 1169)>
  Coords	Values
  (0, 651)	2
  (0, 898)	1
  (0, 488)	2
  (0, 410)	2
  (0, 987)	1
  (0, 481)	1
  (0, 884)	1
  (0, 531)	3
  (0, 930)	2
  (0, 1093)	2
  (0, 70)	1
  (0, 1027)	3
  (0, 397)	1
  (0, 379)	1
  (0, 196)	1
  (0, 373)	4
  (0, 813)	1
  (0, 1014)	1
  (0, 720)	2
  (0, 43)	2
  (0, 803)	2
  (0, 331)	1
  (0, 1083)	1
  (0, 3)	1
  (0, 1091)	1
  :	:
  (6899, 930)	1
  (6899, 1064)	1
  (6899, 259)	1
  (6899, 696)	1
  (6899, 934)	1
  (6899, 746)	1
  (6899, 1024)	1
  (6899, 1060)	1
  (6899, 1111)	1
  (6899, 626)	1
  (6899, 673)	1
  (6899, 133)	1
  (6899, 747)	1
  (6899, 941)	1
  (6899, 258)	1
  (6899, 1156)	1
  (6899, 878)	2
  (6899, 1011)	1
  (6899, 604)	1
  (6899, 327)	1
  (6899, 946)	1
  (6899, 369)	2
  (6899, 940)	1
  (6899, 425)	1
  (6899, 1009)	1


In [15]:
# Reduce the bag-of-words counts with their own SVD estimator. Calling
# fit_transform on the shared `svd` object would refit it and discard the
# decomposition learned from the TF-IDF matrix. The fixed random_state keeps
# the randomized solver reproducible.
bow_svd = TruncatedSVD(n_components=300, random_state=42)
bow_reduced = bow_svd.fit_transform(bow_embeddings)
print(bow_reduced.shape)
print(bow_reduced)

(6900, 300)
[[ 4.11155369e+00 -1.40911541e+00 -2.22285019e+00 ...  1.28214744e-01
  -2.44271047e-01  9.73737607e-02]
 [ 1.53401713e+01  6.78220191e-01 -3.95217032e+00 ...  1.40646492e+00
  -6.41935732e-01 -6.98202900e-01]
 [ 1.75795111e+01 -3.37047054e+00 -6.03270616e+00 ... -7.54941418e-01
  -3.11743768e-01  9.72686176e-01]
 ...
 [ 5.58704272e+00 -3.01891274e+00 -6.96924621e-01 ... -5.68685979e-01
   9.97549734e-02  4.58639672e-01]
 [ 1.83482125e+00 -2.50211370e+00  4.54292637e-01 ...  6.76543607e-01
   3.78778377e-02 -2.05565256e-01]
 [ 1.03713558e+00 -8.87831675e-01 -9.31530777e-02 ...  3.77319279e-02
   1.64423922e-02  2.49000805e-01]]


In [16]:
# Add a new column to the DataFrame
test_df['bow_embedding'] = list(bow_reduced)
display(test_df)

Unnamed: 0,text,label,tfidf_embedding,bow_embedding
0,The media reported on “5 TikTok dances you can...,0,"[0.32230591024075655, -0.02831194990679292, 0....","[4.111553694916507, -1.4091154149981984, -2.22..."
1,"""Over 240,000 'unverified' ballots have alread...",0,"[0.3085783060828361, 0.11941719271543562, 0.04...","[15.340171294779969, 0.6782201912917837, -3.95..."
2,"Says ""Ron Johnson is making excuses for rioter...",1,"[0.4233043848185984, 0.0786274094923878, 0.057...","[17.57951108926488, -3.370470538010633, -6.032..."
3,“We have seen over the last 10 years ... under...,0,"[0.32351932184322646, 0.14350347899320803, -0....","[13.623137823124164, 2.594215458864157, 1.1975..."
4,“I don’t get involved in the hiring and firing...,0,"[0.2171402326873496, -0.01335753132863769, 0.0...","[11.715673205868887, -3.8977639951846776, -0.5..."
...,...,...,...,...
6895,Wedding Album: Dancing with the Stars Pro Lind...,0,"[0.14163919484470372, -0.18715353303969257, -0...","[0.5081918596591153, -0.47701325989197113, 0.0..."
6896,WATCH: Sneak Peek: Arizona's Furious Alex Atta...,0,"[0.1008928783751652, -0.15370171050340922, -0....","[0.3445347175467999, -0.4934581721987002, 0.03..."
6897,Mary Kay Letourneau 'Hopeful' She Can Fix Marr...,0,"[0.3001268539564991, -0.2629061111938999, -0.0...","[5.5870427230749256, -3.0189127373522684, -0.6..."
6898,Charlize Theron still upset Aeon Flux didn’t w...,0,"[0.18598375894520147, -0.26601918535391794, -0...","[1.8348212505816988, -2.5021137033782117, 0.45..."
