In [2]:
import os 
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords 
import string 

In [3]:
def read_files(folder_path, label):
    texts = []
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        if os.path.isfile(file_path):
            with open(file_path, 'r', encoding='utf-8') as file:
                texts.append(file.read())
    return texts, [label] * len(texts) 

astronomy_folder = 'dane/astronomia'
others_folder = 'dane/inne'

astronomy_texts, astronomy_labels = read_files(astronomy_folder, 'astronomia')
others_texts, others_labels = read_files(others_folder, 'inne')

In [4]:
texts = astronomy_texts + others_texts
labels = astronomy_labels + others_labels

data = {'text': texts, 'label':labels}
df = pd.DataFrame(data)

df.to_csv('dane_zbiorcze.csv', index=False)
print('Dane zapisane.')

Dane zapisane.


In [5]:
df.sample(10)

Unnamed: 0,text,label
222,"Teologia (gr. θεος, theos, „Bóg”, + λογος, log...",inne
239,Symbioza (łac. z gr. sym- 'współ-' i gr. bíos ...,inne
20,Landsat 5 - satelita obserwacji Ziemi będący c...,astronomia
45,"STS-5 (ang. ""Space Transportation System"") - p...",astronomia
75,Stefan L. Piotrowski (ur. 11 kwietnia 1910 w K...,astronomia
203,Marzena Kipiel-Sztuka (ur. 19 października 196...,inne
116,"Io (""Jowisz I"") - trzeci co do wielkości księż...",astronomia
94,(88) Thisbe - planetoida z pasa głównego plane...,astronomia
244,National Collegiate Athletic Association (Naro...,inne
50,Jupiter Icy Moons Orbiter (JIMO) - anulowana m...,astronomia


In [6]:
def load_stopwords(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        stopwords = file.read().splitlines()
    return set(stopwords)

stopwords_file = 'polish_stopwords.txt'

polish_stopwords = load_stopwords(stopwords_file)

def preprocess_text(text):
    text = text.lower()
    text = ''.join([char for char in text if char not in string.punctuation])
    words = text.split()
    words = [word for word in words if word not in polish_stopwords]
    return ' '.join(words)

df['text'] = df['text'].apply(preprocess_text)

In [7]:
df.sample(5)

Unnamed: 0,text,label
114,messier 5 znana m5 ngc 5904 gromada kulista po...,astronomia
191,dzbanecznik nepenthes l – rodzaj roślin mięsoż...,inne
232,beletrystyka fr belleslettres – literatura pię...,inne
120,skylab 4 trzecia ostatnia załoga bazy satelita...,astronomia
208,dramat satyrowy – starożytna grecka odmiana ko...,inne


In [8]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)

In [9]:
print("X_train shape: ", X_train.shape)
print("X_test shape: ", X_test.shape)
print("y_train shape: ", y_train.shape)
print("y_test shape: ", y_test.shape)

X_train shape:  (224,)
X_test shape:  (56,)
y_train shape:  (224,)
y_test shape:  (56,)


In [11]:
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [12]:
model_NB = MultinomialNB()
model_NB.fit(X_train_vec, y_train)

y_pred_NB = model_NB.predict(X_test_vec)
print('NB Accuracy:', accuracy_score(y_test, y_pred_NB))
print('NB Classification Report:\n', classification_report(y_test, y_pred_NB))

NB Accuracy: 0.8214285714285714
NB Classification Report:
               precision    recall  f1-score   support

  astronomia       0.75      0.96      0.84        28
        inne       0.95      0.68      0.79        28

    accuracy                           0.82        56
   macro avg       0.85      0.82      0.82        56
weighted avg       0.85      0.82      0.82        56



In [13]:
model_LR = LogisticRegression()
model_LR.fit(X_train_vec, y_train)

y_pred_LR = model_LR.predict(X_test_vec)
print('LR Accuracy:', accuracy_score(y_test, y_pred_LR))
print('LR Classification Report:\n', classification_report(y_test, y_pred_LR))

LR Accuracy: 0.9464285714285714
LR Classification Report:
               precision    recall  f1-score   support

  astronomia       1.00      0.89      0.94        28
        inne       0.90      1.00      0.95        28

    accuracy                           0.95        56
   macro avg       0.95      0.95      0.95        56
weighted avg       0.95      0.95      0.95        56



In [17]:
y_pred_LR[0:10]

array(['astronomia', 'astronomia', 'inne', 'inne', 'inne', 'astronomia',
       'inne', 'inne', 'inne', 'inne'], dtype=object)

In [20]:
X_test.head()

33     korona południowa łac corona australis dop cor...
108    planetozymal planetezymal małe ciało niebieski...
240    mykoryza12 mikoryza345a mycorrhiza – powszechn...
259    hollywood boulevard – ulica hollywood los ange...
154    liczba stopni swobody df ang degrees of freedo...
Name: text, dtype: object

In [24]:
y_pred_LR_df = pd.DataFrame(y_pred_LR, columns=['label'])

new_df = pd.concat([X_test.reset_index(drop=True), y_pred_LR_df], axis=1)
new_df.head()

Unnamed: 0,text,label
0,korona południowa łac corona australis dop cor...,astronomia
1,planetozymal planetezymal małe ciało niebieski...,astronomia
2,mykoryza12 mikoryza345a mycorrhiza – powszechn...,inne
3,hollywood boulevard – ulica hollywood los ange...,inne
4,liczba stopni swobody df ang degrees of freedo...,inne


In [26]:
pred_astronomy_folder = 'predict/astronomia' 
pred_others_folder = 'predict/inne'

os.makedirs(pred_astronomy_folder, exist_ok=True)
os.makedirs(pred_others_folder, exist_ok=True)

def save_text_to_file(text, label, index):
    if label == 'astronomia':
        folder_path = pred_astronomy_folder
    else:
        folder_path = pred_others_folder

    file_path = os.path.join(folder_path, f'text_{index}.txt')

    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(text)


for index, row in new_df.iterrows():
    save_text_to_file(row['text'], row['label'], index)

print('Dane zapisane do plików.')

Dane zapisane do plików.
