In [1]:
import pandas as pd
import numpy as np
import json
from pymorphy2 import MorphAnalyzer
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import pickle
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import openpyxl

In [2]:
morph = MorphAnalyzer()
vectorizer = TfidfVectorizer()
stop_words = set(stopwords.words('russian'))

In [3]:
n = ['id', 'date', 'name', 'text', 'typr', 'rep', 'rtw', 'faw', 'stcount', 'foll', 'frien', 'listcount']
df_positive = pd.read_csv(r"C:/Users/MPls/prod.jupiter/CNN-model/dataset/positive.csv", sep=';', names=n, usecols=['text'])
df_negative = pd.read_csv(r"C:/Users/MPls/prod.jupiter/CNN-model/dataset/negative.csv", sep=';', names=n, usecols=['text'])

In [4]:
toxic_com = pd.read_excel(r"C:\Users\MPls\prod.jupiter\CNN-model\toxic.xlsx", names=['text', 'label'])

In [5]:
toxic_positive = toxic_com['text'].loc[toxic_com['label'] == 0]
toxic_negative = toxic_com['text'].loc[toxic_com['label'] == 1]

In [6]:
tweets = []
with open(r"C:\Users\MPls\prod.jupiter\CNN-model\dataset\train.jsonl", 'r') as file:
    for line in file:
        tweets.append(json.loads(line))
tweets = pd.DataFrame(tweets)

In [7]:
text_positive = tweets['text'].loc[tweets['label'] == 0]
text_positive = text_positive[0:40145]
text_negative = tweets['text'].loc[tweets['label'] == 1]

In [8]:
text_negative

1         понятно что это нарушение правил, писать капсл...
5         правильно! это же тихановская 26 лет растила и...
13                                         на хуй, безликая
16                      дебилов хватает.надо было с головой
30                        умник хуев. у каждого своё мнение
                                ...                        
223433    да таких педофилов надо уничтожать,что другим ...
223437                          ему 84 будет,осенью.дебилы.
223438    гта игра, а если ты не знающий, то рот бы не о...
223444                                кастрировать пидераса
223448    лайкать не стыдно. а вот пишут (лайкать стыдно...
Name: text, Length: 40145, dtype: object

In [9]:
train = pd.read_json(r"C:\Users\MPls\prod.jupiter\CNN-model\dataset\train.json")

In [10]:
train_positive = train['text'].loc[train['sentiment'] == 'positive']
train_negative = train['text'].loc[train['sentiment'] == 'negative']

In [11]:
data_positive = pd.concat([df_positive, text_positive, train_positive, toxic_positive])
data_negative = pd.concat([df_negative, text_negative, train_negative, toxic_negative])

In [12]:
print(f"positive:",{len(data_positive)}, ";negative:",{len(data_negative)})

positive: {167437} ;negative: {158328}


In [13]:
data_positive = data_positive.drop_duplicates()
data_negative = data_negative.drop_duplicates()

In [14]:
print(f"positive:",{len(data_positive)}, ";negative:",{len(data_negative)})

positive: {162922} ;negative: {153447}


In [15]:
data_positive= data_positive[data_positive['text'].str.count(' ') > 0].reset_index(drop=True)
data_positive= data_positive[data_positive['text'].str.count(' ') < 25].reset_index(drop=True)
data_negative= data_negative[data_negative['text'].str.count(' ') > 0].reset_index(drop=True)
data_negative= data_negative[data_negative['text'].str.count(' ') < 25].reset_index(drop=True)

In [16]:
print(f"positive:",{len(data_positive)}, ";negative:",{len(data_negative)})

positive: {151213} ;negative: {145245}


In [17]:
sample_size = min(data_positive.shape[0], data_negative.shape[0])
raw_data = np.concatenate((data_positive['text'].values[:sample_size],
                           data_negative['text'].values[:sample_size]), axis=0)
labels = [1] * sample_size + [0] * sample_size

In [18]:
def preprocess_text(text):
    text = text.lower().replace("ё", "е")
    text = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', ' ', text)
    text = re.sub('@[^\s]+', ' ', text)
    text = re.sub('[^a-zA-Zа-яА-Я1-9]+', ' ', text)
    text = re.sub(' +', ' ', text)
    text = re.sub('rt', '', text)
    return text.strip()

In [19]:
data = [preprocess_text(t) for t in raw_data]

In [20]:
def lemmatize_words(text):
    words = text.split()
    lemmas = []
    for word in words:
        if word not in stop_words:
            lemma = morph.parse(word)[0].normal_form
            lemmas.append(lemma)
    return " ".join(lemmas)

df = []

In [21]:
for row in data:
    df.append(lemmatize_words(row))

In [22]:
with open("C:/Users/MPls/prod.jupiter/CNN-model/train_text.txt", 'w', encoding='utf-8') as output:
    for row in df:
        output.write(str(row) + '\n')

In [23]:
X_train, X_test, y_train, y_test = train_test_split(df, labels, test_size=0.2)

In [24]:
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [25]:
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

In [26]:
y_pred = model.predict(X_test_tfidf)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.7505766119315639
              precision    recall  f1-score   support

           0       0.77      0.71      0.74     29211
           1       0.73      0.79      0.76     28887

    accuracy                           0.75     58098
   macro avg       0.75      0.75      0.75     58098
weighted avg       0.75      0.75      0.75     58098



In [39]:
pickle.dump(vectorizer, open("C:/Users/MPls/prod.jupiter/CNN-model/models/vectorizer.pickle", "wb"))

In [27]:
with open('C:/Users/MPls/prod.jupiter/CNN-model/models/logistic_regression_model.pkl', 'wb') as file:
    pickle.dump(model, file)

In [28]:
# Загрузка модели логистической регрессии из файла
with open('C:/Users/MPls/prod.jupiter/CNN-model/models/logistic_regression_model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

In [29]:
# Использование загруженной модели для предсказания
text = "хватит бесит надоело "
text_vector = vectorizer.transform([text])  # Преобразование текста в вектор с использованием TF-IDF
prediction = loaded_model.predict(text_vector)
sentiment = "Положительный" if prediction[0] == 1 else "Негативный"
print(sentiment)

Положительный


In [30]:
def ton(text):
    if text is not None:
        text = lemmatize_words(text)
        text_vector = vectorizer.transform([text])
        prediction = loaded_model.predict(text_vector)
    else:
        prediction = ''
    return prediction

In [31]:
tg_com = pd.read_table("C:/Users/MPls/prod.jupiter/CNN-model/db_text.txt", sep='/n', engine='python', names=['text'])

In [32]:
tg_com['sentiment'] = [ton(com) for com in tg_com['text']]

In [33]:
tg_com.count()

text         1058622
sentiment    1058622
dtype: int64

In [34]:
tg_com= tg_com[tg_com['text'].str.count(' ') > 0].reset_index(drop=True)
tg_com= tg_com[tg_com['text'].str.count(' ') < 30].reset_index(drop=True)

In [35]:
tg_com.count()

text         721507
sentiment    721507
dtype: int64

In [36]:
tg_com.to_excel(r"C:\Users\MPls\prod.jupiter\CNN-model\tg_com.xlsx")

In [37]:
def use_model(text):
    result = 'положительный' if ton(text) == 1 else 'отрицательный'
    return result

In [38]:
use_model('собака сутулая')

'отрицательный'