In [5]:
import spacy
from spacy import displacy

from preprocessing import (
    WordCaser,
    PolishLemmatizer,
    PolishLetterReplacer,
    PunctuationRemover,
    StopWordsRemover,
    CurrencyRemover,
    UnitsRemover,
    NumberReplacer,
    WhitespaceRemover
)

# Load the spaCy pipeline registered under the name 'pl_core_news_lg'.
# NOTE(review): `nlp` is not referenced by any cell visible in this notebook
# section — confirm whether a later cell needs it before removing.
nlp = spacy.load('pl_core_news_lg')

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score,classification_report
import pandas as pd
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier

In [6]:
# Read the review data from the CSV file sitting next to the notebook and
# preview the first rows.
dataset = pd.read_csv('polish_dataset.csv')
dataset.head()

Unnamed: 0,description,length,rate
0,Polecam nie pierwszy i nie ostatni raz!,39.0,1.0
1,Bardzo dobra komunikacja sms i telefoniczna. Z...,121.0,1.0
2,Polecam zakupy w tym sklepie. Są dostępne częś...,87.0,1.0
3,0,0.0,0.0
4,Jestem w pełni zadowolona z przebiegu transakcji,48.0,1.0


In [7]:
# Summarise column dtypes and non-null counts to see where values are missing.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 936883 entries, 0 to 936882
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   description  936254 non-null  object 
 1   length       762836 non-null  float64
 2   rate         936817 non-null  float64
dtypes: float64(2), object(1)
memory usage: 21.4+ MB


In [8]:
# Drop the `length` column and every row that still has a missing value.
# Written as a single non-mutating chain so the cell can be re-run safely:
# `errors='ignore'` makes the drop a no-op once `length` is already gone
# (the previous `del dataset['length']` raised KeyError on a second run).
dataset = dataset.drop(columns=['length'], errors='ignore').dropna()
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 936188 entries, 0 to 936882
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   description  936188 non-null  object 
 1   rate         936188 non-null  float64
dtypes: float64(1), object(1)
memory usage: 21.4+ MB


In [9]:
# Count how many rows carry each distinct `rate` value.
dataset['rate'].value_counts()

 1.0    734250
-1.0    183391
 0.0     18547
Name: rate, dtype: int64

In [10]:
# Show the rows whose `rate` equals 1.
mask_rate_1 = dataset['rate'].eq(1)
dataset.loc[mask_rate_1]

Unnamed: 0,description,rate
0,Polecam nie pierwszy i nie ostatni raz!,1.0
1,Bardzo dobra komunikacja sms i telefoniczna. Z...,1.0
2,Polecam zakupy w tym sklepie. Są dostępne częś...,1.0
4,Jestem w pełni zadowolona z przebiegu transakcji,1.0
5,Transakcja przebiegła błyskawicznie. Pełen pro...,1.0
...,...,...
762863,"Sklep fajny, robię w nim dużo zakupów.",1.0
762864,Bardzo szybka realizacja zamówień. Polecam.,1.0
762865,"Wszystko w porządku, polecam !!",1.0
762866,"Interesująca oferta, czytelna kategoryzacja pr...",1.0


In [11]:
# Show the rows whose `rate` equals 0.
mask_rate_0 = dataset['rate'].eq(0)
dataset.loc[mask_rate_0]

Unnamed: 0,description,rate
3,0,0.0
16,0,0.0
34,0,0.0
76,0,0.0
131,0,0.0
...,...,...
762814,0,0.0
762816,0,0.0
762842,0,0.0
762852,0,0.0


In [12]:
# Show the rows whose `rate` equals -1.
mask_rate_neg1 = dataset['rate'].eq(-1)
dataset.loc[mask_rate_neg1]

Unnamed: 0,description,rate
15,Zamówiony produkt dostałem w częściach do złoż...,-1.0
1548,"począwszy od problemów z komunikacją, poprzez ...",-1.0
1645,Sklep- tragedia \r\r\r\nzakup był zrobiony na ...,-1.0
2314,"Dział reklamacji nie działa. Bardzo niechętny,...",-1.0
3904,"Tragedia, nie można doprosić się o fakturę, mi...",-1.0
...,...,...
936878,Coraz lepiej wygląda,-1.0
936879,JA SRAM NA TEN PIERSCIONEK I NA CIEBIE CHWILE ...,-1.0
936880,Rafatus do Marleny Ty kurwo bez honoru ...,-1.0
936881,matka Marleny prosi o pomoc,-1.0


In [13]:
# Keep only the rows whose `rate` is different from 0, then re-check the frame.
rate_is_nonzero = dataset['rate'].ne(0)
dataset = dataset[rate_is_nonzero]
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 917641 entries, 0 to 936882
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   description  917641 non-null  object 
 1   rate         917641 non-null  float64
dtypes: float64(1), object(1)
memory usage: 21.0+ MB


In [15]:
# Features are the review texts; the target is the `rate` label.
X = dataset['description']
y = dataset['rate']

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# Extra patterns handed to PunctuationRemover on top of whatever it strips
# by default.
additional_punctuation = ['\\.', '\\+', '/', '%']

# Document-frequency bounds passed to TfidfVectorizer.
tfidf_max_df = 1.0
tfidf_min_df = 5

# Text-cleaning steps run in exactly this order, ending with TF-IDF
# vectorisation.
preprocessing_steps = [
    WordCaser(),
    PolishLemmatizer(),
    PolishLetterReplacer(),
    PunctuationRemover(additional_punctuation=additional_punctuation),
    StopWordsRemover(),
    CurrencyRemover(),
    UnitsRemover(),
    NumberReplacer(),
    WhitespaceRemover(),
    TfidfVectorizer(max_df=tfidf_max_df, min_df=tfidf_min_df),
]
preprocessing_pipe = make_pipeline(*preprocessing_steps)

# Fit the pipeline on the training texts only, then apply the fitted
# pipeline to the held-out texts.
X_train_transformed = preprocessing_pipe.fit_transform(X_train.values)
X_test_transformed = preprocessing_pipe.transform(X_test.values)


In [24]:
# Recode the training labels from {-1, 1} to {0, 1}.
# The mapping also sends 0 -> 0 so that re-running this cell is a no-op;
# without that key a second run would turn every already-recoded 0 into NaN.
y_train = y_train.map({-1: 0, 0: 0, 1: 1})

In [26]:
# Recode the held-out labels from {-1, 1} to {0, 1}.
# The mapping also sends 0 -> 0 so that re-running this cell is a no-op;
# without that key a second run would turn every already-recoded 0 into NaN.
y_test = y_test.map({-1: 0, 0: 0, 1: 1})

In [30]:
def _fit_and_report(model, X_fit, y_fit, X_eval, y_eval, accuracy_label, matrix_title):
    """Fit ``model``, predict on the evaluation split and print its metrics.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X_fit, y_fit : training features and labels passed to ``fit``.
    X_eval, y_eval : evaluation features and the true labels to score against.
    accuracy_label : text printed in front of the accuracy value.
    matrix_title : text printed above the confusion matrix.

    Returns
    -------
    tuple
        ``(predictions, accuracy)`` for the evaluation split.
    """
    model.fit(X_fit, y_fit)
    predictions = model.predict(X_eval)
    accuracy = accuracy_score(y_eval, predictions)
    print(accuracy_label, accuracy)
    print(matrix_title)
    print(confusion_matrix(y_eval, predictions))
    return predictions, accuracy


# --- Random forest ---------------------------------------------------------
# Seeded so the reported metrics are reproducible between runs (same seed as
# the train/test split).
model_2 = RandomForestClassifier(random_state=42)
predicted_categories, model_accuracy = _fit_and_report(
    model_2,
    X_train_transformed, y_train.values,
    X_test_transformed, y_test.values,
    'Random Forest accuracy: ',
    'Macierz pomyłek Random forest',
)
print('-----------------------------------------------------')

# --- XGBoost ---------------------------------------------------------------
model_6 = XGBClassifier(eval_metric='error', use_label_encoder=False)
predicted_categories, model_accuracy = _fit_and_report(
    model_6,
    X_train_transformed, y_train.values,
    X_test_transformed, y_test.values,
    'XGB accuracy: ',
    'Macierz pomyłek xg_boost',
)

# Predictions of the last model (XGBoost) aligned with the held-out index,
# kept under the same name the cell exposed before.
predicted_categories_df = pd.Series(
    predicted_categories,
    name=f'{y_test.name}_pred',
    index=y_test.index
)

Random Forest accuracy:  0.9853379212870927
Macierz pomyłek Random forest
[[ 57258   3335]
 [  1105 241124]]
-----------------------------------------------------
XGB accuracy:  0.9776205163429341
Macierz pomyłek xg_boost
[[ 58284   2309]
 [  4468 237761]]
