Importing libraries:
- os - for reading and saving files
- joblib - for loading model and vectorizer
- pandas - for working with dataset
- string - for removing punctuation

In [12]:
import os 
import joblib
import pandas as pd
import string 

Reading the text files from the input folder.

In [13]:
def read_files(folder_path):
    """Read every regular file directly inside ``folder_path``.

    Files are visited in sorted filename order. ``os.listdir`` returns names
    in arbitrary order, so sorting keeps the order of the returned texts
    stable between runs and machines.

    Args:
        folder_path: Directory to scan. Subdirectories are skipped; the scan
            is not recursive.

    Returns:
        list[str]: Contents of each file decoded as UTF-8, ordered by
        filename. Empty list when the folder contains no regular files.

    Raises:
        OSError: If ``folder_path`` cannot be listed or a file cannot be read
            (e.g. ``FileNotFoundError``, ``NotADirectoryError``).
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    texts = []
    # sorted(): make the iteration order deterministic.
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)
        # Skip directories and other non-regular entries.
        if os.path.isfile(file_path):
            with open(file_path, 'r', encoding='utf-8') as file:
                texts.append(file.read())
    return texts

# Input folder, relative to the working directory; every regular file in it is read.
folder_path = 'nowe_dane'
# One string per file, in the order returned by read_files.
data_text = read_files(folder_path)

Creating a DataFrame and saving it to a .csv file.

In [14]:
# Pair each raw text with an empty placeholder label; the scalar '' is
# broadcast to every row by the DataFrame constructor.
texts, labels = data_text, ''
data = dict(text=texts, label=labels)
df = pd.DataFrame(data)

# Persist the raw (not yet preprocessed) texts without the index column.
df.to_csv('nowe_dane_zbiorcze.csv', index=False)
print('Dane zapisane.')

Dane zapisane.


In [15]:
# Spot-check one randomly chosen row; no random_state is passed, so the row shown can differ between runs.
df.sample()

Unnamed: 0,text,label
2,Planety powinny być fotografowanie możliwie ja...,


Cleaning the texts: lowercasing and removing punctuation and Polish stopwords.

In [16]:
def load_stopwords(file_path):
    """Load a stopword list from a UTF-8 text file.

    Args:
        file_path: Path to a file with one stopword per line.

    Returns:
        set[str]: The distinct lines of the file, without line terminators.
    """
    with open(file_path, encoding='utf-8') as handle:
        return set(handle.read().splitlines())

# Stopword list file, resolved relative to the working directory.
stopwords_file = 'polish_stopwords.txt'

# Set of stopwords; preprocess_text looks words up in this module-level name.
polish_stopwords = load_stopwords(stopwords_file)

# Translation table that deletes every character listed in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text, stopwords=None):
    """Lowercase ``text``, strip ASCII punctuation and drop stopwords.

    Only the characters in ``string.punctuation`` (ASCII) are removed;
    typographic marks such as the Polish quotes „ ” are left in place.
    NOTE(review): presumably this must mirror the preprocessing used when the
    vectorizer was fitted; confirm before widening the punctuation set.

    Args:
        text: Input string.
        stopwords: Optional collection of lowercase words to remove. When
            omitted, the module-level ``polish_stopwords`` set is used, which
            keeps existing one-argument calls working unchanged.

    Returns:
        str: Remaining words joined by single spaces (empty string when
        nothing is left).
    """
    if stopwords is None:
        # Resolved at call time so the function follows the currently loaded list.
        stopwords = polish_stopwords
    cleaned = text.lower().translate(_PUNCTUATION_TABLE)
    # split() with no argument also collapses runs of whitespace.
    return ' '.join(word for word in cleaned.split() if word not in stopwords)

# Overwrites the 'text' column in place with the cleaned strings; from here on
# df no longer holds the raw texts (they were written to the CSV beforehand).
df['text'] = df['text'].apply(preprocess_text)

In [17]:
# Spot-check one random row after cleaning; unseeded, so the row shown can differ between runs.
df.sample()

Unnamed: 0,text,label
8,wentworth 1000 „soulmates” przecudowne puzzle ...,


Loading model and vectorizer.

In [18]:
# Load the previously saved estimator and vectorizer from the working directory.
# The file names suggest a logistic-regression model and a TF-IDF vectorizer;
# their actual types cannot be confirmed from this notebook.
# NOTE(review): joblib.load unpickles its input, which can execute arbitrary
# code — only load artifacts that come from a trusted source.
model = joblib.load('logistic_regression_model.pkl')
vectorizer = joblib.load('tfidf_vectorizer.pkl')

Vectorizing the data.

In [19]:
# Only transform() is called: the vectorizer is applied as loaded and is not refit on the new texts.
vec_data = vectorizer.transform(df['text'])

Predicting labels.

In [20]:
# Predicted labels for the vectorized texts; the notebook output shows the values 'astronomia' and 'inne'.
predictions = model.predict(vec_data)

Loading the original (unprocessed) texts and adding the predicted labels.

In [21]:
# Reload the raw texts from the CSV, because df['text'] now holds the cleaned
# strings. Default NA parsing is switched off and 'text' is forced to str so
# that an empty text, or one that is literally "NA"/"null"/"nan", stays a
# string instead of becoming float NaN (which cannot be written to a file).
df_pred = pd.read_csv(
    'nowe_dane_zbiorcze.csv',
    encoding='utf-8',
    dtype={'text': str},
    keep_default_na=False,
)
# Assigned by position: relies on the CSV rows being in the same order as the
# rows that were vectorized.
df_pred['label'] = predictions
df_pred.head()

Unnamed: 0,text,label
0,"Do roku 1992 astronomowie podejrzewali, ale ni...",astronomia
1,#Akena 1000 Puzzle delle Formiche czyli „mrów...,inne
2,Planety powinny być fotografowanie możliwie ja...,inne
3,Niewidoma Marie-Laure mieszka w Paryżu w okoli...,inne
4,Bieszczady miały być dla Wiktora Forsta azylem...,inne


Saving each text to a file in the folder that matches its predicted label.

In [22]:
# Output folders, relative to the working directory: one for texts labelled
# 'astronomia', one for everything else.
pred_astronomy_folder, pred_others_folder = 'predict/astronomia', 'predict/inne'

# Create both folders (including the parent) up front; existing folders are left as they are.
for _target_dir in (pred_astronomy_folder, pred_others_folder):
    os.makedirs(_target_dir, exist_ok=True)

def save_text_to_file(text, label, index):
    """Write ``text`` to ``text_<index>.txt`` in the folder chosen by ``label``.

    Args:
        text: String to write (UTF-8).
        label: Predicted label; 'astronomia' selects ``pred_astronomy_folder``,
            any other value selects ``pred_others_folder``.
        index: Value interpolated into the file name.

    An existing file with the same name is overwritten.
    """
    target_dir = pred_astronomy_folder if label == 'astronomia' else pred_others_folder
    destination = os.path.join(target_dir, f'text_{index}.txt')

    with open(destination, 'w', encoding='utf-8') as out:
        out.write(text)


# Write one file per row; the DataFrame index becomes part of the file name.
for index, text, label in zip(df_pred.index, df_pred['text'], df_pred['label']):
    save_text_to_file(text, label, index)

print('Dane zapisane do plików.')

Dane zapisane do plików.
