In [49]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import re
import nltk
import string
from nltk.stem import LancasterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from transformers import BertModel, BertTokenizer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn import svm
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.metrics import classification_report
import torch


In [38]:
df = pd.read_csv('imdb//train_data.txt', delimiter=" ::: ", header=None, names=["ID", "Title", "Genre", "Description"], engine='python')

In [39]:
df.head()

Unnamed: 0,ID,Title,Genre,Description
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his doc...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous re...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fiel...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends meet...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-rec...


### **Preprocessing danych**

In [40]:
nltk.download('stopwords')
nltk.download('punkt')
stemmer = LancasterStemmer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()  # Wszystko z małej litery
    text = re.sub(r'@\S+', '', text)  # Usuwanie znaczków twitterowych
    text = re.sub(r'http\S+', '', text)  # Usuwanie linków
    text = re.sub(r'pic.\S+', '', text) # Usuwanie odnośników do zdjęć
    text = re.sub(r"[^a-zA-Z+']", ' ', text)  # Usuwanie znaków nie należących do alfabetu (poza apostrofami i spacją)
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text + ' ')  # Usuwanie pojedynczych znaków otoczonych spacją
    text = "".join([i for i in text if i not in string.punctuation]) # Usuwanie pozostałej interpunkcji
    words = nltk.word_tokenize(text)
    stopwords = nltk.corpus.stopwords.words('english')  # Usuwanie stopwords
    text = " ".join([i for i in words if i not in stopwords and len(i) > 2])
    text = re.sub("\s[\s]+", " ", text).strip()  # # Usuwanie wielokrotnych spacji
    return text

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\micho\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\micho\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [41]:
# Usuń hashtag poniżej aby zaaplikować preprocessing
df['Cleaned_Description'] = df['Description']#.apply(clean_text)
label_encoder = LabelEncoder()
df['Genre_Encoded'] = label_encoder.fit_transform(df['Genre'])

In [42]:
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54214 entries, 0 to 54213
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   ID                   54214 non-null  int64 
 1   Title                54214 non-null  object
 2   Genre                54214 non-null  object
 3   Description          54214 non-null  object
 4   Cleaned_Description  54214 non-null  object
 5   Genre_Encoded        54214 non-null  int32 
dtypes: int32(1), int64(1), object(4)
memory usage: 2.3+ MB


Unnamed: 0,ID,Title,Genre,Description,Cleaned_Description,Genre_Encoded
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his doc...,Listening in to a conversation between his doc...,8
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous re...,A brother and sister with a past incestuous re...,24
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fiel...,As the bus empties the students for their fiel...,1
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends meet...,To help their unemployed father make ends meet...,8
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-rec...,The film's title refers not only to the un-rec...,8


In [43]:
df['Genre_Encoded'].value_counts()

Genre_Encoded
8     13613
7     13096
5      7447
21     5073
13     2204
24     1591
0      1315
26     1032
18      884
9       784
2       775
14      731
19      672
20      647
1       590
6       505
3       498
22      432
23      391
10      323
16      319
15      277
4       265
12      243
11      194
17      181
25      132
Name: count, dtype: int64

In [44]:
df.drop(['ID', 'Title', 'Genre', 'Description'], axis='columns')

Unnamed: 0,Cleaned_Description,Genre_Encoded
0,Listening in to a conversation between his doc...,8
1,A brother and sister with a past incestuous re...,24
2,As the bus empties the students for their fiel...,1
3,To help their unemployed father make ends meet...,8
4,The film's title refers not only to the un-rec...,8
...,...,...
54209,This short-lived NBC live sitcom centered on B...,5
54210,The NEXT Generation of EXPLOITATION. The siste...,13
54211,"Ze bestaan echt, is a stand-up comedy about gr...",7
54212,Walter and Vivian live in the country and have...,5


### **Uzyskanie osadzeń**

In [45]:
# Wybór tokenizatora
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
#tokenizer = CustomBertTokenizer.from_pretrained('bert-base-uncased') # Aby z niego skorzystać zainicjuj najpierw nowy tokenizator 
                                                                      # (jego kod jest na samym dole pliku)
model = BertModel.from_pretrained('bert-base-uncased')

In [47]:
# Funkcja generująca i zapisująca dane do pliku w sposób iteracyjny (oszczędzamy RAM)
# UWAGA: jeśli generujesz osadzenia więcej niż raz zmień nazwę albo usuń stary plik, w przeciwnym razie nie dojdzie do nadpisania tylko do poszerzenia istniejącego pliku.
def get_bert_embeddings(texts, batch_size=10, save_path='bert_embeddings_imdb_noprep.npy'):
    model.eval()
    for i in tqdm(range(0, len(texts), batch_size), desc="Generating embeddings"):
        batch = texts[i:i+batch_size]
        tokens = tokenizer(batch, padding=True, truncation=True, max_length=256, return_tensors="pt")
        with torch.no_grad():
            batch_embeddings = model(**tokens).last_hidden_state[:, 0, :]
        batch_embeddings = batch_embeddings.cpu().numpy()
        with open(save_path, 'ab') as f:
            np.save(f, batch_embeddings)

texts = df['Cleaned_Description'].tolist()
labels = df['Genre_Encoded'].tolist()

In [50]:
# Generuj osadzenia
embeddings = get_bert_embeddings(texts)

Generating embeddings:   0%|          | 0/5422 [00:00<?, ?it/s]

In [52]:
# Funkcja ładująca osadzenia z pliku zapisanego w sposób iteracyjny przy pomocy funkcji powyżej ^
def load_embeddings(file_path):
    embeddings_list = []
    with open(file_path, 'rb') as f:
        while True:
            try:
                embedding = np.load(f, allow_pickle=True)
                if embedding.ndim > 0:  
                    embeddings_list.append(embedding)
            except EOFError:
                break

    if embeddings_list:
        return np.concatenate(embeddings_list, axis=0)
    else:
        return np.array([])  

embeddings = load_embeddings('bert_embeddings_imdb_noprep.npy')
print('Loaded embeddings shape:', embeddings.shape)

Loaded embeddings shape: (54494, 768)


### **Stworzenie modelu**

In [25]:
# Pojedynczy trening (STARE, nie uruchamiaj tego, poniżej właściwe)
X_train, X_test, y_train, y_test = train_test_split(embeddings, labels, test_size=0.9, random_state=42, stratify=labels)

clf = svm.SVC(kernel='linear', class_weight='balanced', C=1.0, gamma='scale')
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.29      0.39      0.33      1183
           1       0.39      0.36      0.38       531
           2       0.19      0.19      0.19       697
           3       0.21      0.19      0.20       448
           4       0.05      0.03      0.04       238
           5       0.46      0.57      0.51      6702
           6       0.16      0.18      0.17       454
           7       0.69      0.76      0.72     11786
           8       0.60      0.51      0.55     12252
           9       0.19      0.16      0.17       706
          10       0.21      0.13      0.16       291
          11       0.62      0.65      0.63       175
          12       0.09      0.03      0.05       219
          13       0.54      0.51      0.53      1984
          14       0.52      0.48      0.50       658
          15       0.23      0.14      0.17       249
          16       0.03      0.01      0.02       287
          17       0.08    

In [56]:
# Będziemy korzystać tylko z 10% danych w przypadku IMDb bo zestaw jest za duży, i normalnie dopasowanie klasyfikatora do jednej kombinacji zajęłoby kilka godzin
X_sample, _, y_sample, _ = train_test_split(
    embeddings, labels, 
    test_size=0.9, 
    stratify=labels,  
    random_state=42 
)

In [57]:
# Walidacja krzyżowa
clf = make_pipeline(
    StandardScaler(),
    svm.SVC(kernel='linear', class_weight='balanced', C=1.0, gamma='scale')
)

cv = StratifiedKFold(n_splits=10)

scores = cross_val_score(clf, X_sample, y_sample, cv=cv, scoring='f1_macro')

print("Cross-validation scores:", scores)
print("Average cross-validation score:", scores.mean())


Cross-validation scores: [0.25401538 0.26637828 0.32582585 0.35851521 0.31246383 0.28247943
 0.31187219 0.28244511 0.2829235  0.28400242]
Average cross-validation score: 0.29609212022639997


In [58]:
# Tu robimy grid search
pipeline = make_pipeline(StandardScaler(), SVC(class_weight='balanced'))

# Parametery jakie będzie sprawdzał grid search
param_grid = {
    'svc__C': [0.1, 1, 10],        # Regularization parameter
    'svc__kernel': ['linear', 'rbf'],  # Type of kernel
    'svc__gamma': ['scale', 'auto']    # Kernel coefficient
}

grid_search = GridSearchCV(pipeline, param_grid, cv=10, scoring='f1_macro', verbose=3)

grid_search.fit(X_sample, y_sample)

print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

best_clf = grid_search.best_estimator_


Fitting 10 folds for each of 12 candidates, totalling 120 fits
[CV 1/10] END svc__C=0.1, svc__gamma=scale, svc__kernel=linear;, score=0.246 total time=   3.7s
[CV 2/10] END svc__C=0.1, svc__gamma=scale, svc__kernel=linear;, score=0.270 total time=   3.7s
[CV 3/10] END svc__C=0.1, svc__gamma=scale, svc__kernel=linear;, score=0.315 total time=   3.7s
[CV 4/10] END svc__C=0.1, svc__gamma=scale, svc__kernel=linear;, score=0.358 total time=   3.7s
[CV 5/10] END svc__C=0.1, svc__gamma=scale, svc__kernel=linear;, score=0.307 total time=   3.7s
[CV 6/10] END svc__C=0.1, svc__gamma=scale, svc__kernel=linear;, score=0.283 total time=   3.7s
[CV 7/10] END svc__C=0.1, svc__gamma=scale, svc__kernel=linear;, score=0.321 total time=   3.6s
[CV 8/10] END svc__C=0.1, svc__gamma=scale, svc__kernel=linear;, score=0.288 total time=   3.7s
[CV 9/10] END svc__C=0.1, svc__gamma=scale, svc__kernel=linear;, score=0.286 total time=   3.7s
[CV 10/10] END svc__C=0.1, svc__gamma=scale, svc__kernel=linear;, score=0

In [59]:
# To samo co 2 komórki wyżej, ale tutaj uruchom sobie z parametrami z grid searcha
clf = make_pipeline(
    StandardScaler(),
    svm.SVC(kernel='linear', class_weight='balanced', C=0.1, gamma='scale')
)

cv = StratifiedKFold(n_splits=10)

scores = cross_val_score(clf, X_sample, y_sample, cv=cv, scoring='f1_macro')

print("Cross-validation scores:", scores)
print("Average cross-validation score:", scores.mean())


Cross-validation scores: [0.24600073 0.27000539 0.31450029 0.35782974 0.3070862  0.28322488
 0.3208149  0.2880291  0.28638605 0.28951194]
Average cross-validation score: 0.29633892235460324


In [None]:
class CustomBertTokenizer(BertTokenizer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
    
    def tokenize(self, text):
        tokens = re.findall(r'\w+|\S', text.lower() if self.do_lower_case else text)
        new_tokens = []
        for token in tokens:
            if token in self.vocab:
                new_tokens.append(token)
            else:
                if token not in self.vocab:
                    self.add_tokens([token])
                    new_tokens.append(token)
                    weights = model.embeddings.word_embeddings.weight.data
                    new_weights = torch.cat((weights, weights[101:102]), 0)
                    new_emb = nn.Embedding.from_pretrained(new_weights, padding_idx=0, freeze=False)
                    model.embeddings.word_embeddings = new_emb
                else:
                    new_tokens.append(token)
        return new_tokens
        