# Data preprocessing

In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
import pandas as pd


# 1-2. Read the reviews and discard the placeholder columns in one chain.
# errors='ignore' turns the drop into a no-op when those columns are absent.
df = pd.read_csv('Reviews.csv').drop(
    columns=['column_name1', 'column_name2'],
    errors='ignore',
)

# 3. Build one text field per review: summary + space + body, with NaN -> ''.
summary_part = df['Summary'].fillna('')
body_part = df['Text'].fillna('')
df['combined_text'] = summary_part + " " + body_part

# 4. Fit a tokenizer capped at 10,000 words, then encode every review as
#    a list of integer token ids.
tokenizer = Tokenizer(num_words=10_000)
tokenizer.fit_on_texts(df['combined_text'])
sequences = tokenizer.texts_to_sequences(df['combined_text'])

# 5. Bring every sequence to exactly 500 ids. No padding/truncating arguments
#    are passed, so the pad_sequences defaults apply.
padded_sequences = pad_sequences(
    sequences,
    maxlen=500,
)

# 6. Map star ratings to binary labels: 1 for scores 4-5, 0 for scores 1-3.
#    Any score outside 1-5 maps to NaN.
df['score'] = df['Score'].map({5: 1, 4: 1, 3: 0, 2: 0, 1: 0})

# Keep features and labels aligned: select the rows that received a label
# with ONE mask applied to both. Dropping NaN from the labels only would
# leave `padded_sequences` longer than the label vector.
labeled_mask = df['score'].notna().to_numpy()
X_labeled = padded_sequences[labeled_mask]
y_labeled = df.loc[labeled_mask, 'score'].astype(int).to_numpy()
assert len(X_labeled) == len(y_labeled), "features and labels are misaligned"

# 7. Split BEFORE resampling. Oversampling duplicates minority rows; doing it
#    before the split would put copies of the same review in both the training
#    and the validation set and inflate validation accuracy.
#    `stratify` keeps the original class ratio in both partitions.
X_train_raw, X_val, y_train_raw, y_val_raw = train_test_split(
    X_labeled,
    y_labeled,
    test_size=0.2,
    random_state=42,
    stratify=y_labeled,
)

# 8. Balance the classes in the training partition only. A fixed random_state
#    makes the resampling reproducible across Restart & Run All.
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train_raw, y_train_raw)

# 9. One-hot encode the labels (two columns, one per class).
y_categorical = to_categorical(y_resampled, num_classes=2)
X_train, y_train = X_resampled, y_categorical
y_val = to_categorical(y_val_raw, num_classes=2)

print("Preprocessing complete!")

Preprocessing complete!


[[   0    0    0 ...  278   12 7492]
 [   0    0    0 ...  106 3617  297]
 [   0    0    0 ...   93  282    6]
 ...
 [   0    0    0 ...   90   62  411]
 [   0    0    0 ... 2135   12    6]
 [   0    0    0 ...  722    1  650]]


## Algorithm 2: CNN model for text classification


In [9]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense


In [None]:
from imblearn.under_sampling import RandomUnderSampler  # Import this line

import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# 1-2: Read the reviews and discard the placeholder columns in one chain.
# errors='ignore' turns the drop into a no-op when those columns are absent.
df = pd.read_csv('Reviews.csv').drop(
    columns=['column_name1', 'column_name2'],
    errors='ignore',
)

# 3: Build one text field per review: summary + space + body, with NaN -> ''.
summary_part = df['Summary'].fillna('')
body_part = df['Text'].fillna('')
df['combined_text'] = summary_part + " " + body_part

# 4: Tokenizer settings; both values are reused when the model is built.
vocab_size = 10_000          # vocabulary cap passed to the tokenizer
max_sequence_length = 500    # every review is padded/truncated to this length

# Words outside the vocabulary cap are encoded with the "<OOV>" token.
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(df['combined_text'])
sequences = tokenizer.texts_to_sequences(df['combined_text'])

# Pad and truncate at the END of each sequence.
padded_sequences = pad_sequences(
    sequences,
    maxlen=max_sequence_length,
    padding='post',
    truncating='post',
)

# 5: Map star ratings to binary labels: 1 for scores 4-5, 0 for scores 1-3.
#    Any score outside 1-5 maps to NaN.
df['score'] = df['Score'].map({5: 1, 4: 1, 3: 0, 2: 0, 1: 0})

# Drop rows with missing scores from BOTH the features and the frame.
# `padded_sequences` was built from every row of `df`, so filtering only `df`
# would leave the two with different lengths and make `fit_resample` fail.
labeled_mask = df['score'].notna().to_numpy()
padded_sequences = padded_sequences[labeled_mask]
df = df.dropna(subset=['score'])
assert len(padded_sequences) == len(df), "features and labels are misaligned"

# 6: Handle imbalanced classes by randomly undersampling the majority class.
# NOTE: despite its name, `ros` holds a RandomUnderSampler; the name is kept
# so that any later cell referring to `ros` keeps working.
ros = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(
    padded_sequences,
    df['score'].astype(int),  # the mapping yields floats once NaN is involved
)

# 7: One-hot encode the labels for categorical cross-entropy.
num_classes = 2
y_categorical = to_categorical(y_resampled, num_classes=num_classes)

# 8: Split data.
# NOTE(review): the validation set is drawn from the undersampled (balanced)
# data, so validation accuracy describes a balanced class mix rather than the
# class distribution of the raw file.
X_train, X_val, y_train, y_val = train_test_split(
    X_resampled,
    y_categorical,
    test_size=0.2,
    random_state=42,
)

# 9: Declare the CNN as a single ordered layer list.
#    Embedding -> two Conv1D/MaxPooling1D stages -> Flatten -> Dense head.
model = Sequential([
    # Token ids -> 128-dimensional vectors.
    Embedding(input_dim=vocab_size, output_dim=128, input_length=max_sequence_length),
    # First convolution stage: 128 filters, kernel width 5.
    Conv1D(128, 5, activation='relu'),
    MaxPooling1D(pool_size=2),
    # Second convolution stage: 64 filters, kernel width 3.
    Conv1D(64, 3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    # One softmax unit per class; num_classes must match the one-hot label width.
    Dense(num_classes, activation='softmax'),
])

# Categorical cross-entropy pairs with the one-hot labels and softmax output.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Train for 10 epochs, reporting metrics on the validation split each epoch.
model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=32,
)

# Final score on the validation split; evaluate returns [loss, accuracy].
val_metrics = model.evaluate(X_val, y_val)
loss, acc = val_metrics
print(f"Validation Accuracy: {acc * 100:.2f}%")




Epoch 1/10
[1m5947/6234[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m1:12[0m 253ms/step - accuracy: 0.8571 - loss: 0.3147