In [131]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import re
import nltk
import string
from nltk.stem import LancasterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from transformers import BertModel, BertTokenizer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn import svm
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report
import torch
from torch import nn


In [132]:
df = pd.read_csv('bbc//train_data.csv')

In [133]:
df.head()

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business


### **Preprocessing danych**

In [134]:
nltk.download('stopwords')
nltk.download('punkt')
stemmer = LancasterStemmer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()  # Wszystko z małej litery
    text = re.sub(r'@\S+', '', text)  # Usuwanie znaczków twitterowych
    text = re.sub(r'http\S+', '', text)  # Usuwanie linków
    text = re.sub(r'pic.\S+', '', text) # Usuwanie odnośników do zdjęć
    text = re.sub(r"[^a-zA-Z+']", ' ', text)  # Usuwanie znaków nie należących do alfabetu (poza apostrofami i spacją)
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text + ' ')  # Usuwanie pojedynczych znaków otoczonych spacją
    text = "".join([i for i in text if i not in string.punctuation]) # Usuwanie pozostałej interpunkcji
    words = nltk.word_tokenize(text)
    stopwords = nltk.corpus.stopwords.words('english')  # Usuwanie stopwords
    text = " ".join([i for i in words if i not in stopwords and len(i) > 2])
    text = re.sub("\s[\s]+", " ", text).strip()  # # Usuwanie wielokrotnych spacji
    return text

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\micho\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\micho\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [135]:
# Usuń hashtag poniżej aby zaaplikować preprocessing
df['Cleaned_Article'] = df['Text']#.apply(clean_text)
label_encoder = LabelEncoder()
df['Category_Encoded'] = label_encoder.fit_transform(df['Category'])

In [136]:
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1490 entries, 0 to 1489
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   ArticleId         1490 non-null   int64 
 1   Text              1490 non-null   object
 2   Category          1490 non-null   object
 3   Cleaned_Article   1490 non-null   object
 4   Category_Encoded  1490 non-null   int32 
dtypes: int32(1), int64(1), object(3)
memory usage: 52.5+ KB


Unnamed: 0,ArticleId,Text,Category,Cleaned_Article,Category_Encoded
0,1833,worldcom ex-boss launches defence lawyers defe...,business,worldcom ex-boss launches defence lawyers defe...,0
1,154,german business confidence slides german busin...,business,german business confidence slides german busin...,0
2,1101,bbc poll indicates economic gloom citizens in ...,business,bbc poll indicates economic gloom citizens in ...,0
3,1976,lifestyle governs mobile choice faster bett...,tech,lifestyle governs mobile choice faster bett...,4
4,917,enron bosses in $168m payout eighteen former e...,business,enron bosses in $168m payout eighteen former e...,0


In [137]:
df['Category_Encoded'].value_counts()

Category_Encoded
3    346
0    336
2    274
1    273
4    261
Name: count, dtype: int64

In [138]:
df['Category'].value_counts()

Category
sport            346
business         336
politics         274
entertainment    273
tech             261
Name: count, dtype: int64

In [139]:
df.drop(['ArticleId', 'Text', 'Category'], axis='columns')

Unnamed: 0,Cleaned_Article,Category_Encoded
0,worldcom ex-boss launches defence lawyers defe...,0
1,german business confidence slides german busin...,0
2,bbc poll indicates economic gloom citizens in ...,0
3,lifestyle governs mobile choice faster bett...,4
4,enron bosses in $168m payout eighteen former e...,0
...,...,...
1485,double eviction from big brother model caprice...,1
1486,dj double act revamp chart show dj duo jk and ...,1
1487,weak dollar hits reuters revenues at media gro...,0
1488,apple ipod family expands market apple has exp...,4


### **Uzyskanie osadzeń**

In [70]:
# Wybór tokenizatora
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
#tokenizer = CustomBertTokenizer.from_pretrained('bert-base-uncased') # Aby z niego skorzystać zainicjuj najpierw nowy tokenizator 
                                                                      # (jego kod jest na samym dole pliku)
model = BertModel.from_pretrained('bert-base-uncased')

In [141]:
# Funkcja do generacji osadzeń (generuje je na raz, w BBC jest mniej danych więc trzymałem je w pamięci RAM - jeśli byłby problem skopiuj podejście z IMDb)
def get_bert_embeddings(texts, batch_size=10):
    model.eval() 
    embeddings = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Generating embeddings"):
        batch = texts[i:i+batch_size]
        tokens = tokenizer(batch, padding=True, truncation=True, max_length=256, return_tensors="pt")
        ' '.join([x for x in tokens]).replace(' ##', '')
        with torch.no_grad():
            batch_embeddings = model(**tokens).last_hidden_state[:, 0, :]
        embeddings.append(batch_embeddings.cpu())
    embeddings = torch.cat(embeddings, dim=0)
    return embeddings.numpy()

texts = df['Cleaned_Article'].tolist()
labels = df['Category_Encoded'].tolist()


Generating embeddings:   0%|          | 0/149 [00:00<?, ?it/s]

In [None]:
# Generuj osadzenia
embeddings = get_bert_embeddings(texts)

In [16]:
# Opcjonalnie zapisz lub wczytaj osadzenia
#np.save('bert_embeddings_bbc_cased.npy', embeddings)
#embeddings = np.load('bert_embeddings_bbc.npy')

### **Stworzenie modelu**

In [142]:
# Pojedynczy trening (STARE, nie uruchamiaj tego, poniżej właściwe)
X_train, X_test, y_train, y_test = train_test_split(embeddings, labels, test_size=0.1, random_state=42, stratify=labels)

clf = svm.SVC(kernel='linear', C=1.0, gamma='scale', class_weight='balanced')
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97        34
           1       0.93      1.00      0.96        27
           2       1.00      0.85      0.92        27
           3       0.97      1.00      0.99        35
           4       0.93      0.96      0.94        26

    accuracy                           0.96       149
   macro avg       0.96      0.96      0.96       149
weighted avg       0.96      0.96      0.96       149



In [143]:
# Walidacja krzyżowa
clf = make_pipeline(StandardScaler(), svm.SVC(kernel='linear', class_weight='balanced', C=1.0, gamma='scale'))

scores = cross_val_score(clf, embeddings, labels, cv=10, scoring='f1_macro')

print("Cross-validation scores:", scores)
print("Average cross-validation score:", scores.mean())


Cross-validation scores: [0.96631576 0.96587331 0.93129282 0.93694933 0.95690667 0.95191535
 0.98652945 0.98639016 0.95852251 0.98544719]
Average cross-validation score: 0.9626142551258505


In [144]:
# Tu robimy grid search
pipeline = make_pipeline(StandardScaler(), SVC(class_weight='balanced'))

# Parametery jakie będzie sprawdzał grid search
param_grid = {
    'svc__C': [0.1, 1, 10],        # Regularization parameter
    'svc__kernel': ['linear', 'rbf'],  # Type of kernel
    'svc__gamma': ['scale', 'auto']    # Kernel coefficient
}

grid_search = GridSearchCV(pipeline, param_grid, cv=10, scoring='f1_macro', verbose=3)

grid_search.fit(embeddings, labels)

print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

best_clf = grid_search.best_estimator_


Fitting 10 folds for each of 12 candidates, totalling 120 fits
[CV 1/10] END svc__C=0.1, svc__gamma=scale, svc__kernel=linear;, score=0.966 total time=   0.0s
[CV 2/10] END svc__C=0.1, svc__gamma=scale, svc__kernel=linear;, score=0.966 total time=   0.0s
[CV 3/10] END svc__C=0.1, svc__gamma=scale, svc__kernel=linear;, score=0.931 total time=   0.0s
[CV 4/10] END svc__C=0.1, svc__gamma=scale, svc__kernel=linear;, score=0.937 total time=   0.0s
[CV 5/10] END svc__C=0.1, svc__gamma=scale, svc__kernel=linear;, score=0.957 total time=   0.0s
[CV 6/10] END svc__C=0.1, svc__gamma=scale, svc__kernel=linear;, score=0.952 total time=   0.0s
[CV 7/10] END svc__C=0.1, svc__gamma=scale, svc__kernel=linear;, score=0.987 total time=   0.0s
[CV 8/10] END svc__C=0.1, svc__gamma=scale, svc__kernel=linear;, score=0.986 total time=   0.0s
[CV 9/10] END svc__C=0.1, svc__gamma=scale, svc__kernel=linear;, score=0.959 total time=   0.0s
[CV 10/10] END svc__C=0.1, svc__gamma=scale, svc__kernel=linear;, score=0

In [145]:
# To samo co 2 komórki wyżej, ale tutaj uruchom sobie z parametrami z grid searcha
clf = make_pipeline(StandardScaler(), svm.SVC(kernel='rbf', class_weight='balanced', C=1.0, gamma='scale'))

scores = cross_val_score(clf, embeddings, labels, cv=10, scoring='f1_macro')

print("Cross-validation scores:", scores)
print("Average cross-validation score:", scores.mean())

Cross-validation scores: [0.95858604 0.95218855 0.94995463 0.96522145 0.96563964 0.95801551
 0.98652945 0.98600436 0.9582684  0.97912458]
Average cross-validation score: 0.965953260640671


In [140]:
class CustomBertTokenizer(BertTokenizer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
    
    def tokenize(self, text):
        tokens = re.findall(r'\w+|\S', text.lower() if self.do_lower_case else text)
        new_tokens = []
        for token in tokens:
            if token in self.vocab:
                new_tokens.append(token)
            else:
                if token not in self.vocab:
                    self.add_tokens([token])
                    new_tokens.append(token)
                    weights = model.embeddings.word_embeddings.weight.data
                    new_weights = torch.cat((weights, weights[101:102]), 0)
                    new_emb = nn.Embedding.from_pretrained(new_weights, padding_idx=0, freeze=False)
                    model.embeddings.word_embeddings = new_emb
                else:
                    new_tokens.append(token)
        return new_tokens


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'CustomBertTokenizer'.
