## Interim Preprocessing

In [1]:
import pandas as pd
from src.preprocessor import Preprocessor
from src.settings.preprocessor import PreprocessorSettings

In [2]:
# Preprocessing configuration: both the normalization and the lemmatization
# flags are switched on.
prep_settings = PreprocessorSettings(
    use_normalization=True,
    use_lemmatization=True,
)

In [3]:
# Build the project preprocessor from the `prep_settings` configuration object.
preprocessor = Preprocessor(prep_settings)

In [8]:
from textblob import TextBlob

In [9]:
# Single-word TextBlob used as scratch input for the lemmatization experiments.
blob = TextBlob("rocks")

In [None]:
# Lemmatize every word of the blob and display the resulting WordList.
# (The previous `blob.lem` was a truncated attribute access that raises
# AttributeError, which breaks Restart & Run All.)
blob.words.lemmatize()

In [None]:

# Lemmatize each word of the blob and stitch the lemmas back together into a
# single space-separated string.
" ".join(token.lemmatize() for token in blob.words)

In [12]:
# Spot-check the project lemmatizer on an irregular plural ("corpora").
preprocessor.lemmatize_text("corpora")

'corpus'

In [4]:
# Load the raw training set; the file is decoded as ISO-8859-1.
raw_df = pd.read_csv(
    "../data/raw/train.csv",
    encoding="ISO-8859-1",
)

In [5]:
# Replace every SentimentText value with its normalized form and trim
# leading/trailing whitespace from the result.
# The column is overwritten in place, so re-running this cell feeds already
# normalized text back through normalize_text.
# NOTE(review): only normalize_text is applied here although prep_settings also
# enables lemmatization — confirm whether that is intended.
raw_df["SentimentText"] = raw_df["SentimentText"].apply(
    lambda text: preprocessor.normalize_text(text).strip()
)

In [13]:
# Persist the current frame as the interim dataset; the row index is not written.
raw_df.to_csv('../data/interim/train.csv', index=False)

## Train/Test Split

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# Switch to short lowercase column names; the split below reads the "text" and
# "sentiment" columns.
raw_df = raw_df.rename(
    columns={
        "ItemID": "id",
        "Sentiment": "sentiment",
        "SentimentText": "text",
    }
)

In [24]:
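# Superseded variant, kept for reference only: it split two-dimensional frames
# (id + text, and sentiment) instead of the plain columns used in the next cell.
# X_train, X_test, y_train, y_test = train_test_split(raw_df[['id', 'text']], raw_df[['sentiment']], test_size=0.33, random_state=42)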

In [8]:
# Hold out 33% of the rows for testing; the fixed random_state keeps the
# shuffle, and therefore the split, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    raw_df["text"],
    raw_df["sentiment"],
    test_size=0.33,
    random_state=42,
)

In [9]:
# Write each split to its own CSV under ../data/processed, without the row
# index. Rows in the x_* and y_* files stay aligned by position only.
for split_series, csv_path in (
    (X_train, '../data/processed/x_train.csv'),
    (X_test, '../data/processed/x_test.csv'),
    (y_train, '../data/processed/y_train.csv'),
    (y_test, '../data/processed/y_test.csv'),
):
    split_series.to_csv(csv_path, index=False)

## Modeling

In [1]:
from src.classifier import Classifier
from src.settings.classifier import ClassifierSettings

In [2]:
# Classifier configuration: n_estimators=100, max_depth=20, plus the mapping
# from numeric class ids to human-readable label names.
model_settings = ClassifierSettings(
    n_estimators=100,
    max_depth=20,
    id2label={0: "negative", 1: "positive"},
)

In [3]:
# Load a classifier from the ../models/exp_2 path with the settings defined above.
# NOTE(review): the evaluation cell further down calls Classifier.load with two
# positional file names instead of a single path — confirm which call shape
# matches the current Classifier API.
classifier = Classifier.load("../models/exp_2", model_settings=model_settings)

In [4]:
# Smoke-test the loaded classifier on a single raw string.
classifier.predict("some")

PredictOutput(predictions='positive')

In [16]:
# Fit the classifier on the first 10 training rows only.
# NOTE(review): this looks like a quick smoke test rather than real training —
# confirm before relying on any metrics produced afterwards. X_train / y_train
# are not created in this section; they presumably come from the split cells
# earlier in the notebook.
classifier.fit(X_train[:10], y_train[:10])

In [29]:
# Evaluate the classifier on the test split, then save it, reload it, and
# evaluate the reloaded copy to check that the save/load round trip works.
# NOTE(review): `pd` is imported here but not used anywhere in this cell.
from sklearn.metrics import classification_report
import pandas as pd

predictions = classifier.predict(X_test)

# Evaluate the results
print(classification_report(y_test, predictions))

# Save under two file names in the current working directory.
# NOTE(review): presumably the first is the model and the second the vectorizer
# artifact — confirm against Classifier.save.
classifier.save("model.joblib", "vectorizer.joblib")

loaded_classifier = Classifier.load("model.joblib", "vectorizer.joblib", model_settings=model_settings)

# Check that the loaded model works
loaded_predictions = loaded_classifier.predict(X_test)
print(classification_report(y_test, loaded_predictions))


              precision    recall  f1-score   support

           0       0.74      0.67      0.70     14470
           1       0.76      0.82      0.79     18527

    accuracy                           0.75     32997
   macro avg       0.75      0.74      0.74     32997
weighted avg       0.75      0.75      0.75     32997

              precision    recall  f1-score   support

           0       0.74      0.67      0.70     14470
           1       0.76      0.82      0.79     18527

    accuracy                           0.75     32997
   macro avg       0.75      0.74      0.74     32997
weighted avg       0.75      0.75      0.75     32997

