In [1]:
import pandas as pd
satire_data = pd.read_csv('/kaggle/input/july29thcapstonedataset/satire_nsp_dataset.csv')
non_satire_data = pd.read_csv('/kaggle/input/july29thcapstonedataset/TruenewsNSP.csv')
satire_data = satire_data.rename(columns={'Headline': 'text'})
non_satire_data = non_satire_data.rename(columns={'clean_title': 'text'})
satire_data['label'] = 1  # All texts in satire_data are satire
non_satire_data['label'] = 0  # All texts in non_satire_data are non-satire

# Combine the datasets
combined_data = pd.concat([satire_data, non_satire_data])
df=combined_data

In [2]:
def split_text_by_words(text):
    words = text.split()
    mid = len(words) // 2
    
    first_part = ' '.join(words[:mid])
    second_part = ' '.join(words[mid:])
    
    return first_part, second_part

df[['Part1', 'second_part']] = df['text'].apply(lambda x: pd.Series(split_text_by_words(x)))

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,Sl No.,text,Humor,Mechanism,Article link,Image link,labels,texts,faces,...,crop_hints,Headline_Response,Combined_Entities_Response,First Part,Split_Headline_Responses,label,image_url,6_way_label,Part1,second_part
0,0,0.0,John Leguizamo's Boycott Of Mario Movie Leads ...,1.0,GP,https://babylonbee.com/news/john-leguizamos-bo...,https://media.babylonbee.com/articles/6439bbc0...,"Hairstyle, Human, Dress shirt, Beard, Sleeve, ...",,2.0,...,"[x: 69\n, x: 1117\n, x: 1117\ny: 787\n, x: 69\...",Searches.\n\nThis caption suggests that the mo...,**Related Keywords:**\n\n- Stock photography\n...,John Leguizamo's Boycott Of Mario Movie Leads To,Striking Portraits of The Super Mario Bros. Mo...,1,,,John Leguizamo's Boycott Of Mario Movie Leads To,Sharp Rise In People Googling 'Who Is John Leg...
1,1,1.0,Brave Adventurer Discovers Long-Lost Article H...,1.0,RL,https://babylonbee.com/news/brave-adventurer-d...,https://media.babylonbee.com/articles/6439c1d1...,"Hat, Table, Personal computer, Tableware, Comp...",,1.0,...,"[x: 278\n, x: 719\n, x: 719\ny: 787\n, x: 278\...",", and Such.\n\nThis caption is from a satirica...",.\n\nThese terms are related to the concept of...,Brave Adventurer Discovers Long-Lost Article H...,in the Archives.,1,,,Brave Adventurer Discovers Long-Lost Article H...,"Beneath Labyrinth Of Ads, Pop-Ups, Privacy Pol..."
2,2,2.0,Drunk Irishmen Say They Understood Biden's Dub...,1.0,RL,https://babylonbee.com/news/drunk-irishmen-ann...,https://media.babylonbee.com/articles/6439b3f3...,"Outerwear, Smile, Coat, Tie, Gesture, Chair, S...",,4.0,...,"[, x: 788\n, x: 788\ny: 787\n, y: 787\n]",This caption suggests that drunken Irishmen mi...,", Legal proceedings.\n\nThese terms are relate...",Drunk Irishmen Say They,'re Not Responsible for,1,,,Drunk Irishmen Say They,Understood Biden's Dublin Speech Perfectly
3,3,3.0,Report: Crypto Is A Scam! UPDATE: We Were Wron...,1.0,GP,https://babylonbee.com/news/report-crypto-is-a...,https://media.babylonbee.com/articles/6439a063...,"Finger, Gesture, Font, Thumb, Gadget, Electric...",Vol 31.786\nOrder Book\nBid\n40.487\nAsk\n349....,0.0,...,"[x: 348\n, x: 1136\n, x: 1136\ny: 787\n, x: 34...",...\n\nThe caption is a series of updates abou...,exchange.\n\n**Related Keywords:**\n\n- Blockc...,Report: Crypto Is A Scam! UPDATE: We Were Wron...,Crypto Is A Scam! UPDATE: Crypto Is A Scam!\n\...,1,,,Report: Crypto Is A Scam! UPDATE: We Were Wron...,"Oops, Crypto Is A Scam! UPDATE: Well, Maybe It..."
4,4,4.0,Sports Illustrated Puts Bikini On Walrus For L...,1.0,CR,https://babylonbee.com/news/sports-illustrated...,https://media.babylonbee.com/articles/64399d6c...,"Working animal, Dog, Carnivore, Terrestrial an...","Sports\nIllustrated\nSwimsuit 2023, Sports, Il...",0.0,...,"[x: 417\n, x: 858\n, x: 858\ny: 787\n, x: 417\...",This caption is satirical because it mocks the...,", Sports Illustrated photographer.\n\n**Relate...",Sports Illustrated Puts Bikini On,Polar Bears to Highlight Their Gentle,1,,,Sports Illustrated Puts Bikini On,Walrus For Latest Body-Positive Swimsuit Edition


In [4]:
df['Split_Headline_Responses']=str(df['Split_Headline_Responses'])

In [5]:
df = df.sample(frac=1, random_state=2)

In [6]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
from scipy.spatial.distance import cosine
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertModel
import torch
from tqdm import tqdm

# Load the USE model
use_embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

# Load pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'
bert_tokenizer = BertTokenizer.from_pretrained(model_name)
bert_model = BertModel.from_pretrained(model_name)

# Function to get USE embedding
def get_use_embedding(texts):
    return use_embed(texts).numpy()

# Function to get BERT embedding
def get_bert_embedding(texts):
    inputs = bert_tokenizer(texts, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = bert_model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

def calculate_similarity(emb1, emb2):
    return 1 - cosine(emb1, emb2)

# Function to process data in batches
def process_data_in_batches(df, batch_size=32):
    features = []
    for i in tqdm(range(0, len(df), batch_size)):
        batch = df.iloc[i:i+batch_size]
        
        use_emb1 = get_use_embedding(batch['second_part'].tolist())
        use_emb2 = get_use_embedding(batch['Split_Headline_Responses'].tolist())
        bert_emb1 = get_bert_embedding(batch['second_part'].tolist())
        bert_emb2 = get_bert_embedding(batch['Split_Headline_Responses'].tolist())
        
        use_similarities = np.array([calculate_similarity(e1, e2) for e1, e2 in zip(use_emb1, use_emb2)])
        bert_similarities = np.array([calculate_similarity(e1, e2) for e1, e2 in zip(bert_emb1, bert_emb2)])
        
        batch_features = np.concatenate([use_emb1, use_emb2, bert_emb1, bert_emb2, 
                                         use_similarities.reshape(-1, 1), 
                                         bert_similarities.reshape(-1, 1)], axis=1)
        features.append(batch_features)
    
    return np.concatenate(features, axis=0)


# Process data in batches
print("Processing data in batches...")
X = process_data_in_batches(df)
y = df['label'].values

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,stratify=y)



2024-08-03 12:52:05.369226: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-03 12:52:05.369352: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-03 12:52:05.511719: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Processing data in batches...


100%|██████████| 684/684 [1:10:58<00:00,  6.23s/it]


In [7]:
# Train a logistic regression model
print("Training logistic regression model...")
model = LogisticRegression(max_iter=1000)  # Increase max_iter if the model doesn't converge
model.fit(X_train, y_train)

# Make predictions
print("Making predictions...")
y_pred = model.predict(X_test)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

Training logistic regression model...
Making predictions...
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.85      0.85      2200
           1       0.85      0.85      0.85      2178

    accuracy                           0.85      4378
   macro avg       0.85      0.85      0.85      4378
weighted avg       0.85      0.85      0.85      4378



In [8]:
from sklearn.neighbors import KNeighborsClassifier

# Train a KNN model
print("Training KNN model...")
model = KNeighborsClassifier(n_neighbors=15)  # Adjust n_neighbors as needed
model.fit(X_train, y_train)

# Make predictions
print("Making predictions...")
y_pred = model.predict(X_test)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

Training KNN model...
Making predictions...
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.79      0.80      2200
           1       0.80      0.82      0.81      2178

    accuracy                           0.80      4378
   macro avg       0.80      0.80      0.80      4378
weighted avg       0.80      0.80      0.80      4378



In [9]:
from sklearn.svm import SVC

# Train an SVM model
print("Training SVM model...")
model = SVC(kernel='rbf')  # You can adjust kernel and other parameters
model.fit(X_train, y_train)

# Make predictions
print("Making predictions...")
y_pred = model.predict(X_test)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

Training SVM model...
Making predictions...
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.83      0.84      2200
           1       0.83      0.85      0.84      2178

    accuracy                           0.84      4378
   macro avg       0.84      0.84      0.84      4378
weighted avg       0.84      0.84      0.84      4378



In [10]:
from sklearn.tree import DecisionTreeClassifier

# Train a decision tree model
print("Training Decision Tree model...")
model = DecisionTreeClassifier(max_depth=20)  # Adjust max_depth as needed
model.fit(X_train, y_train)

# Make predictions
print("Making predictions...")
y_pred = model.predict(X_test)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

Training Decision Tree model...
Making predictions...
Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.68      0.68      2200
           1       0.68      0.68      0.68      2178

    accuracy                           0.68      4378
   macro avg       0.68      0.68      0.68      4378
weighted avg       0.68      0.68      0.68      4378



In [11]:
from sklearn.ensemble import RandomForestClassifier

# Train a random forest model
print("Training Random Forest model...")
model = RandomForestClassifier(n_estimators=200)  # Adjust n_estimators as needed
model.fit(X_train, y_train)

# Make predictions
print("Making predictions...")
y_pred = model.predict(X_test)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

Training Random Forest model...
Making predictions...
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.76      0.79      2200
           1       0.78      0.83      0.80      2178

    accuracy                           0.79      4378
   macro avg       0.80      0.79      0.79      4378
weighted avg       0.80      0.79      0.79      4378



In [12]:
from sklearn.ensemble import GradientBoostingClassifier

# Train a gradient boosting model
print("Training Gradient Boosting model...")
model = GradientBoostingClassifier(n_estimators=200, learning_rate=0.1)  # Adjust parameters as needed
model.fit(X_train, y_train)

# Make predictions
print("Making predictions...")
y_pred = model.predict(X_test)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

Training Gradient Boosting model...
Making predictions...
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.81      0.82      2200
           1       0.81      0.84      0.83      2178

    accuracy                           0.82      4378
   macro avg       0.82      0.82      0.82      4378
weighted avg       0.82      0.82      0.82      4378



In [13]:
from sklearn.neural_network import MLPClassifier

# Train an MLP model
print("Training MLP model...")
model = MLPClassifier(hidden_layer_sizes=(100, ), activation='relu', solver='adam', max_iter=200)  # Adjust parameters as needed
model.fit(X_train, y_train)

# Make predictions
print("Making predictions...")
y_pred = model.predict(X_test)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

Training MLP model...
Making predictions...
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.85      0.85      2200
           1       0.85      0.84      0.84      2178

    accuracy                           0.84      4378
   macro avg       0.84      0.84      0.84      4378
weighted avg       0.84      0.84      0.84      4378



In [14]:
import xgboost as xgb
from sklearn.metrics import classification_report

# Train an XGBoost model
print("Training XGBoost model...")
model = xgb.XGBClassifier(n_estimators=100, use_label_encoder=False, eval_metric='logloss')  # Adjust n_estimators as needed
model.fit(X_train, y_train)

# Make predictions
print("Making predictions...")
y_pred = model.predict(X_test)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

Training XGBoost model...
Making predictions...
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.81      0.82      2200
           1       0.81      0.84      0.82      2178

    accuracy                           0.82      4378
   macro avg       0.82      0.82      0.82      4378
weighted avg       0.82      0.82      0.82      4378

