Importing libraries:
- os - for reading and saving files
- pandas - for working with dataset
- scikit-learn - for machine learning
- string - for removing punctuation
- joblib - for saving model and vectorizer

In [1]:
import os 
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import string 
import joblib

Reading files from folders "astronomia" and "inne" and adding labels from folders name.

In [2]:
def read_files(folder_path, label):
    texts = []
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        if os.path.isfile(file_path):
            with open(file_path, 'r', encoding='utf-8') as file:
                texts.append(file.read())
    return texts, [label] * len(texts) 

astronomy_folder = 'dane/astronomia'
others_folder = 'dane/inne'

astronomy_texts, astronomy_labels = read_files(astronomy_folder, 'astronomia')
others_texts, others_labels = read_files(others_folder, 'inne')

Creating DataFrame and .csv file.

In [3]:
texts = astronomy_texts + others_texts
labels = astronomy_labels + others_labels

data = {'text': texts, 'label':labels}
df = pd.DataFrame(data)

df.to_csv('dane_zbiorcze.csv', index=False)
print('Dane zapisane.')

Dane zapisane.


In [4]:
df.sample(10)

Unnamed: 0,text,label
180,Teoria strun (TS) - koncepcja w fizyce teorety...,inne
261,Stade Roland Garros – kompleks sportowy w Pary...,inne
40,"Układ planetarny, system planetarny - planety ...",astronomia
69,V2467 Cygni (również Nova Cygni 2007) - polar ...,astronomia
285,"Wodospad – swobodny, pionowy spadek wody rzecz...",inne
283,"Nysa Kłodzka (cz. Kladská Nisa, niem. Glatzer ...",inne
318,Fredrick Martin MacMurray (ur. 30 sierpnia 190...,inne
327,Cezary Andrzej Pazura (ur. 13 czerwca 1962[1] ...,inne
0,Astronomia - miesięcznik skierowany przede wsz...,astronomia
216,Dawid i Goliat - obraz olejny namalowany w lat...,inne


Cleaning texts from punctuation, polish stopwords and capital letters.

In [5]:
def load_stopwords(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        stopwords = file.read().splitlines()
    return set(stopwords)

stopwords_file = 'polish_stopwords.txt'

polish_stopwords = load_stopwords(stopwords_file)

def preprocess_text(text):
    text = text.lower()
    text = ''.join([char for char in text if char not in string.punctuation])
    words = text.split()
    words = [word for word in words if word not in polish_stopwords]
    return ' '.join(words)

df['text'] = df['text'].apply(preprocess_text)

In [6]:
df.sample(5)

Unnamed: 0,text,label
312,kodeks cywilny skrót kc języku prawniczym kc –...,inne
90,57 mnemosyne planetoida pasa głównego planetoi...,astronomia
7,teon aleksandrii ur ok 335 zm ok 405 matematyk...,astronomia
224,kolory holi stworzony naturalnych składników n...,inne
103,nano saild2 mały satelita eksperymentalny zbud...,astronomia


Splitting data into training and testing.

In [7]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)

In [8]:
print("X_train shape: ", X_train.shape)
print("X_test shape: ", X_test.shape)
print("y_train shape: ", y_train.shape)
print("y_test shape: ", y_test.shape)

X_train shape:  (272,)
X_test shape:  (68,)
y_train shape:  (272,)
y_test shape:  (68,)


Vectorizing the data.

In [9]:
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

Training Naive Bayes model and making prediction. Evaluating the model.

In [10]:
model_NB = MultinomialNB()
model_NB.fit(X_train_vec, y_train)

y_pred_NB = model_NB.predict(X_test_vec)
print('NB Accuracy:', accuracy_score(y_test, y_pred_NB))
print('NB Classification Report:\n', classification_report(y_test, y_pred_NB))

NB Accuracy: 0.8088235294117647
NB Classification Report:
               precision    recall  f1-score   support

  astronomia       0.73      1.00      0.85        36
        inne       1.00      0.59      0.75        32

    accuracy                           0.81        68
   macro avg       0.87      0.80      0.80        68
weighted avg       0.86      0.81      0.80        68



Training Logistic Regression model and making prediction. Evaluating the model.

In [11]:
model_LR = LogisticRegression()
model_LR.fit(X_train_vec, y_train)

y_pred_LR = model_LR.predict(X_test_vec)
print('LR Accuracy:', accuracy_score(y_test, y_pred_LR))
print('LR Classification Report:\n', classification_report(y_test, y_pred_LR))

LR Accuracy: 0.9852941176470589
LR Classification Report:
               precision    recall  f1-score   support

  astronomia       1.00      0.97      0.99        36
        inne       0.97      1.00      0.98        32

    accuracy                           0.99        68
   macro avg       0.98      0.99      0.99        68
weighted avg       0.99      0.99      0.99        68



The Logistic Regression model made predictions better.

In [12]:
y_pred_LR[0:5]

array(['inne', 'astronomia', 'astronomia', 'astronomia', 'astronomia'],
      dtype=object)

In [13]:
X_test.head()

284    natura 2000 – program sieci obszarów objętych ...
116    io jowisz trzeci wielkości księżyc jowisza gru...
113    messier 4 znana m4 ngc 6121 gromada kulista gw...
42     biały karzeł niewielki rzędu rozmiarów ziemi o...
126    gerald kron ur 6 kwietnia 1913 zm 9 kwietnia 2...
Name: text, dtype: object

Saving vectorizer and model.

In [14]:
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')
joblib.dump(model_LR, 'logistic_regression_model.pkl')

print('Pliki zostały zapisane.')

Pliki zostały zapisane.


Validating the model on new data.

In [15]:
val_df = pd.read_csv('dane_walidacyjne.csv', encoding='utf-8', sep=';')
val_df_clean = val_df.copy()
val_df_clean['text'] = val_df['text'].apply(preprocess_text)
val_df_clean.head()

Unnamed: 0,text,label
0,1992 astronomowie podejrzewali posiadali jedno...,astronomia
1,1000 puzzle delle formiche „mrówkowe” obrazek ...,inne
2,c2020 f3 neowise – kometa długookresowa odkryt...,astronomia
3,planety powinny fotografowanie możliwie najwyż...,astronomia
4,niewidoma marielaure mieszka paryżu okolicy mu...,inne


In [16]:
val_df_clean.shape

(20, 2)

In [17]:
X_val = val_df_clean['text']
y_val = val_df_clean['label']
X_val_vec = vectorizer.transform(X_val)

y_val_pred_LR = model_LR.predict(X_val_vec)
print('LR Accuracy:', accuracy_score(y_val, y_val_pred_LR))
print('LR Classification Report:\n', classification_report(y_val, y_val_pred_LR))

LR Accuracy: 0.7
LR Classification Report:
               precision    recall  f1-score   support

  astronomia       1.00      0.40      0.57        10
        inne       0.62      1.00      0.77        10

    accuracy                           0.70        20
   macro avg       0.81      0.70      0.67        20
weighted avg       0.81      0.70      0.67        20



In [18]:
y_val_pred_df = pd.DataFrame(y_val_pred_LR, columns=['predicted_label'])
val_pred_df = pd.concat([val_df['text'], val_df['label'], y_val_pred_df], axis=1)
val_pred_df

Unnamed: 0,text,label,predicted_label
0,"Do roku 1992 astronomowie podejrzewali, ale ni...",astronomia,astronomia
1,1000 Puzzle delle Formiche czyli „mrówkowe” Ob...,inne,inne
2,"C/2020 F3 (NEOWISE) – kometa długookresowa, od...",astronomia,astronomia
3,Planety powinny być fotografowanie możliwie ja...,astronomia,inne
4,Niewidoma Marie-Laure mieszka w Paryżu w okoli...,inne,inne
5,Bieszczady miały być dla Wiktora Forsta azylem...,inne,inne
6,Nowa gazetka już dostępna!\nZ radością informu...,inne,inne
7,Nie każdy może lub chce zabrać swój teleskop i...,astronomia,inne
8,Księżyc w fazie 3 doby i 7 godzin po nowiu. Ba...,astronomia,inne
9,Wentworth 1000 „Soulmates”\nPrzecudowne puzzle...,inne,inne


In [19]:
from sklearn.ensemble import RandomForestClassifier

model_RF = RandomForestClassifier(n_estimators=100, random_state=42)
model_RF.fit(X_train_vec, y_train)
y_pred_RF = model_RF.predict(X_test_vec)

print('RF Accuracy:', accuracy_score(y_test, y_pred_RF))
print('RF Classification Report:\n', classification_report(y_test, y_pred_RF))

RF Accuracy: 0.9411764705882353
RF Classification Report:
               precision    recall  f1-score   support

  astronomia       0.97      0.92      0.94        36
        inne       0.91      0.97      0.94        32

    accuracy                           0.94        68
   macro avg       0.94      0.94      0.94        68
weighted avg       0.94      0.94      0.94        68



In [20]:
y_val_pred_RF = model_RF.predict(X_val_vec)
print('RF Accuracy:', accuracy_score(y_val, y_val_pred_RF))
print('RF Classification Report:\n', classification_report(y_val, y_val_pred_RF))

RF Accuracy: 0.6
RF Classification Report:
               precision    recall  f1-score   support

  astronomia       1.00      0.20      0.33        10
        inne       0.56      1.00      0.71        10

    accuracy                           0.60        20
   macro avg       0.78      0.60      0.52        20
weighted avg       0.78      0.60      0.52        20

