# Project NLP Challenge

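This notebook trains a binary classifier that predicts the `label` (0 or 1) of a short, lowercase text from its `title`. We first build a baseline with TF-IDF features and logistic regression, then compare it with logistic regression trained on `[CLS]` embeddings from a pretrained BERT model.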

In [82]:
# Load the training set: tab-separated, no header row, columns named label and title
import pandas as pd

df = pd.read_csv(
    "../dataset/training_data_lowercase.csv",
    sep="\t",
    header=None,
    names=["label", "title"],
    engine="python",
)
df.head()

Unnamed: 0,label,title
0,0,donald trump sends out embarrassing new year‚s...
1,0,drunk bragging trump staffer started russian c...
2,0,sheriff david clarke becomes an internet joke ...
3,0,trump is so obsessed he even has obama‚s name ...
4,0,pope francis just called out donald trump duri...


In [83]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" line to the output.
df.info()
print(df['label'].value_counts())  # number of rows per class
print(df['title'].str.len().describe())  # title length (in characters) stats

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34152 entries, 0 to 34151
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   34152 non-null  int64 
 1   title   34152 non-null  object
dtypes: int64(1), object(1)
memory usage: 533.8+ KB
None
label
0    17572
1    16580
Name: count, dtype: int64
count    34152.000000
mean        74.680839
std         23.460862
min          2.000000
25%         61.000000
50%         70.000000
75%         85.000000
max        279.000000
Name: title, dtype: float64


In [84]:
import re

def clean_text(text):
    """Normalize a title to lowercase ASCII letters and whitespace.

    The text is lowercased before filtering so that uppercase letters are
    folded to lowercase instead of being deleted by the a-z filter. Input that
    is already lowercase produces exactly the same result as before.

    Args:
        text: Raw title string.

    Returns:
        The lowercased string with every character other than a-z and
        whitespace removed. Whitespace is kept as-is (it is not collapsed).
    """
    return re.sub(r'[^a-z\s]', '', text.lower())

# Store a cleaned copy of every title in a new column
df['clean_title'] = df['title'].map(clean_text)

In [85]:
# Features are the cleaned titles; the target is the label column
X, y = df['clean_title'], df['label']

In [86]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [87]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Cap the TF-IDF vocabulary at 5000 features.
vectorizer = TfidfVectorizer(max_features=5000)
# Fit on the training titles only, then reuse the fitted vectorizer on the
# held-out titles so nothing is learned from the test split.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [88]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Baseline classifier: logistic regression on the TF-IDF features
model = LogisticRegression(max_iter=1000).fit(X_train_vec, y_train)

# Score the held-out split
y_pred = model.predict(X_test_vec)
baseline_accuracy = accuracy_score(y_test, y_pred)
print(classification_report(y_test, y_pred))
print("Accuracy:", baseline_accuracy)

              precision    recall  f1-score   support

           0       0.95      0.94      0.95      3529
           1       0.93      0.95      0.94      3302

    accuracy                           0.94      6831
   macro avg       0.94      0.94      0.94      6831
weighted avg       0.94      0.94      0.94      6831

Accuracy: 0.9446640316205533


Next, let's try heavier preprocessing (stop-word removal and lemmatization) and also try BERT embeddings as features.

In [89]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used below
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)

# Shared by clean_text_advanced: English stop-word set and a WordNet lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text_advanced(text):
    """Strip non-letters, drop stop words, and lemmatize the remaining words.

    Every character other than a-z and whitespace is removed, so the caller is
    expected to pass text that is already lowercase.

    Args:
        text: Title string.

    Returns:
        A list of lemmatized tokens, in their original order, excluding any
        token found in ``stop_words``.
    """
    letters_only = re.sub(r'[^a-z\s]', '', text)
    lemmas = []
    for word in letters_only.split():
        if word in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(word))
    return lemmas

# Lowercase each title, then tokenize / filter / lemmatize it into a list of tokens.
# NOTE(review): no later cell visible here reads df['tokens'] — confirm it is still needed.
df['tokens'] = df['title'].str.lower().map(clean_text_advanced)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rospi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rospi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [90]:
from transformers import BertTokenizer, BertModel
import torch

# Tokenizer and model are loaded from the same pretrained checkpoint
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_model = BertModel.from_pretrained(BERT_CHECKPOINT)

# The model is only used for inference here, so put it in eval mode
bert_model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [91]:
def get_bert_embedding(text):
    """Return the BERT [CLS] embedding of ``text`` as a NumPy array.

    The text is tokenized (truncated to at most 128 tokens) and run through
    ``bert_model`` with gradient tracking disabled. The last-layer hidden
    state of the first token is used as the embedding.

    Args:
        text: The text to embed.

    Returns:
        The first-token hidden state with size-1 dimensions squeezed out,
        converted to a NumPy array.
    """
    encoded = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=128)
    with torch.no_grad():  # inference only, no gradients needed
        hidden_states = bert_model(**encoded).last_hidden_state
    # hidden_states is (batch, seq_len, hidden); index 0 along seq_len is the CLS token
    return hidden_states[:, 0, :].squeeze().numpy()

In [92]:
import numpy as np
from tqdm import tqdm

# Embed every raw title, one call per title, with a tqdm progress bar.
# This is slow: the recorded run took about 32 minutes for 34152 titles.
X_bert = np.array(list(map(get_bert_embedding, tqdm(df['title']))))

100%|██████████| 34152/34152 [32:31<00:00, 17.50it/s]


In [93]:
from sklearn.model_selection import train_test_split

# Same split settings as the TF-IDF experiment, now over the BERT embeddings.
# Note: this rebinds y, y_train and y_test that the earlier cells defined.
y = df['label'].to_numpy()
X_train_bert, X_test_bert, y_train, y_test = train_test_split(
    X_bert,
    y,
    test_size=0.2,
    random_state=42,
)


In [94]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Same classifier settings as the baseline, trained on the BERT embeddings instead
model_bert = LogisticRegression(max_iter=1000).fit(X_train_bert, y_train)

# Score the held-out split
y_pred_bert = model_bert.predict(X_test_bert)
bert_accuracy = accuracy_score(y_test, y_pred_bert)
print(classification_report(y_test, y_pred_bert))
print("Accuracy with BERT embeddings:", bert_accuracy)

              precision    recall  f1-score   support

           0       0.96      0.96      0.96      3529
           1       0.95      0.96      0.95      3302

    accuracy                           0.96      6831
   macro avg       0.96      0.96      0.96      6831
weighted avg       0.96      0.96      0.96      6831

Accuracy with BERT embeddings: 0.9559361733274777
