In [59]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import torch

from transformers import BertModel, BertTokenizer

In [49]:
import pandas as pd
from machine_learning.readability_scorer import ReadabilityScorer
from machine_learning.sentiment_model import SentimentModel
from machine_learning.model import FakeNewsClassifier
import datasets

from config import base_columns

In [7]:
# Read the train / test / validation CSV files from the local data directory.
# NOTE(review): `train` and `test` are rebound by the dataset-loading cell that
# follows; only `valid` keeps the CSV contents.
train, test, valid = map(
    pd.read_csv,
    ("data/train.csv", "data/test.csv", "data/valid.csv"),
)

In [23]:
# Load the "chengxuphd/liar2" dataset through `datasets.load_dataset` and turn
# each of its three splits into a pandas DataFrame.
dataset = datasets.load_dataset("chengxuphd/liar2")
train, val, test = (
    pd.DataFrame(dataset[split]) for split in ("train", "validation", "test")
)

In [25]:
readability_scorer = ReadabilityScorer()
# Apply the scorer to every statement and expand each returned dictionary into
# one column per metric.
df_features = train["statement"].apply(readability_scorer.analyze_text_complexity).apply(pd.Series)
# Merge the new features into the original DataFrame. Feature columns left over
# from a previous run of this cell are dropped first, so re-running the cell
# replaces them instead of appending duplicate column labels.
train = pd.concat(
    [train.drop(columns=df_features.columns, errors="ignore"), df_features],
    axis=1,
)

In [26]:
# Preview the first five rows of the training frame.
train.head(n=5)

Unnamed: 0,id,label,statement,date,subject,speaker,speaker_description,state_info,true_counts,mostly_true_counts,...,false_counts,pants_on_fire_counts,context,justification,Lexical Diversity (TTR),Average Word Length,Avg Syllables per Word,Difficult Word Ratio,Dependency Depth,Length
0,13847,5,"90 percent of Americans ""support universal bac...","October 2, 2017",government regulation;polls and public opinion...,chris abele,"Chris Abele is Milwaukee County Executive, a p...",wisconsin,1,4,...,5,2,a tweet,"""Universal"" is the term for background checks ...",1.0,6.5,2.1,0.3,3.0,10.0
1,13411,1,Last year was one of the deadliest years ever ...,"May 19, 2017",after the fact;congress;criminal justice;histo...,thom tillis,Thom Tillis is a Republican who serves as U.S....,north carolina,0,2,...,2,0,a press release supporting the Back The Blue A...,"Sen. Thom Tillis, a North Carolina Republican,...",1.0,4.77,1.54,0.23,7.0,13.0
2,10882,0,"Bernie Sanders's plan is ""to raise your taxes ...","October 28, 2015",taxes,chris christie,"Chris Christie announced June 6, 2023 that he ...",national,21,20,...,17,8,"Boulder, Colo","Christie said that Sanders’s plan is ""to raise...",0.9,4.4,1.4,0.2,4.0,10.0
3,20697,4,Voter ID is supported by an overwhelming major...,"December 8, 2021",voter id laws,lee zeldin,Lee Zeldin is a Republican representing New Yo...,new york,1,2,...,0,0,a Tweet,Zeldin claimed voter identification requiremen...,0.95,4.85,1.7,0.2,6.0,20.0
4,6095,2,"Says Barack Obama ""robbed Medicare (of) $716 b...","August 12, 2012",federal budget;history;medicare;retirement,mitt romney,Mitt Romney is a U.S. senator from Utah. He ra...,national,31,33,...,32,19,"an interview on ""60 Minutes""","Romney said, ""There's only one president that ...",1.0,5.0,1.82,0.55,6.0,11.0


In [27]:
# Collect the statement texts as a plain Python list; the same list is the
# input to the sentiment model below.
statements = train["statement"].tolist()

# Build the sentiment model and run it over all statements.
# NOTE(review): presumably returns one value per statement, in order — confirm
# against SentimentModel.generate.
sentiment_model = SentimentModel()
train_sentiments = sentiment_model.generate(statements)

Device set to use cpu


In [29]:
# Store the sentiment model's output on the training frame as a "sentiment" column.
# NOTE(review): assumes train_sentiments has one entry per row of `train`, in
# row order — confirm.
train["sentiment"] = train_sentiments

In [32]:
# Columns fed to the scaler: the readability metrics followed by the sentiment column.
numerical_cols = [
    "Lexical Diversity (TTR)",
    "Average Word Length",
    "Avg Syllables per Word",
    "Difficult Word Ratio",
    "Dependency Depth",
    "Length",
    "sentiment",
]
# No categorical columns are one-hot encoded at the moment.
categorical_cols = []

In [60]:
# Number of tabular input features passed to the classifier. Derived from the
# numerical feature list (currently 7 entries) instead of a hard-coded literal
# so the two cannot drift apart.
# NOTE: categorical_cols is empty, so no one-hot columns contribute; if
# categorical features are added, use the width of the processed tensor instead.
tabular_data_length = len(numerical_cols)

In [61]:
# Instantiate the project's classifier, passing the tabular feature count as
# its first positional argument.
# NOTE(review): `model` is rebound in the final cell of this notebook.
model = FakeNewsClassifier(tabular_data_length)

In [56]:
def preprocessing(df, numerical_features: list, categorical_features: list) -> torch.Tensor:
    """Standardize / one-hot encode the selected columns and return a float32 tensor.

    Args:
        df: DataFrame containing every column named in ``numerical_features``
            and ``categorical_features``.
        numerical_features: Column names passed through ``StandardScaler``.
        categorical_features: Column names passed through ``OneHotEncoder``
            (unknown categories are ignored). May be empty.

    Returns:
        A dense ``torch.float32`` tensor with one row per row of ``df``; the
        scaled numerical columns come first, followed by the one-hot columns.

    Note:
        The transformer is fitted on ``df`` on every call (``fit_transform``),
        so calling this on a validation or test frame re-estimates the scaling
        statistics and categories from that frame.
    """
    preprocessor = ColumnTransformer(
        [
            ("num", StandardScaler(), numerical_features),
            ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ],
        # OneHotEncoder produces sparse output by default, and ColumnTransformer
        # then returns a scipy sparse matrix whenever the stacked result is
        # sparse enough. torch.tensor cannot convert that, so always request a
        # dense array.
        sparse_threshold=0,
    )

    # Fit on df and transform it in one step.
    processed_features = preprocessor.fit_transform(df)

    return torch.tensor(processed_features, dtype=torch.float32)

In [57]:
# Build the tabular feature tensor for the training frame.
tab_data = preprocessing(
    df=train,
    numerical_features=numerical_cols,
    categorical_features=categorical_cols,
)

In [58]:
# Display the processed feature tensor as a quick sanity check of the scaled values.
tab_data

tensor([[ 0.7759,  2.3602,  1.8499,  ..., -1.2234, -0.9253, -0.2754],
        [ 0.7759, -0.2250, -0.3840,  ...,  0.9456, -0.5345, -0.1704],
        [-0.6089, -0.7779, -0.9425,  ..., -0.6812, -0.9253,  0.1229],
        ...,
        [-0.2904, -0.4044, -0.0649,  ...,  3.1145,  1.1590, -2.0132],
        [ 0.7759,  0.5221, -0.4239,  ...,  0.4033, -0.2740, -0.3726],
        [ 0.7759, -0.2549, -1.0223,  ..., -0.6812, -1.1859,  0.5006]])

In [None]:

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# Tokenize every statement: pad to the longest sequence in the batch, truncate
# at 512 tokens, and return PyTorch tensors.
encoded_inputs = tokenizer(statements, padding=True, truncation=True, max_length=512, return_tensors="pt")

tabular_features = tab_data

# Model initialization. The tabular input width is read from the tensor that is
# actually passed to the model; a hard-coded count (10) disagreed with the
# 7 feature columns produced by the preprocessing step.
model = FakeNewsClassifier(num_tabular_features=tabular_features.shape[1])

# Forward pass without gradient tracking.
# NOTE(review): this pushes every statement through the model as one batch,
# which may be very memory-hungry on the full training set — consider
# mini-batching.
with torch.no_grad():
    output = model(encoded_inputs["input_ids"], encoded_inputs["attention_mask"], tabular_features)

# NOTE(review): presumably probabilities for fake news detection — confirm
# against FakeNewsClassifier's output layer.
print(output)