# Loading dataset

In [2]:
# File path
file_path = 'imdb_nolabel.txt'


def load_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    return lines

data_lines = load_data(file_path)

# Data Preprocessing

In [3]:
import re
import nltk
from nltk.tokenize import word_tokenize

# Change all words to lowercase
# Remove non-alphabetic characters and numbers
def clean_text(text):
    text = text.lower()
    cleaned_text = re.sub(r'[^a-zA-Z\s]', '', text)
    return cleaned_text

nltk.download('punkt')

# Apply tokenize
def tokenize_text(text):
    tokens = word_tokenize(text)
    return tokens

# Get the sentence after preprocessing
cleaned_data = [clean_text(line) for line in data_lines]
tokenized_data = [tokenize_text(line) for line in cleaned_data]
tokenized_texts = [' '.join(tokens) for tokens in tokenized_data]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\59158\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Feature Extraction

In [4]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Initialize TF-IDF
tfidf_vectorizer = TfidfVectorizer()

# Convert text data to TF-IDF feature matrix
features_tfidf = tfidf_vectorizer.fit_transform([' '.join(tokens) for tokens in tokenized_data])

# Print TF-IDF feature matrix shape
print("TF-IDF shape：", features_tfidf.shape)


def extract_last_digit(input_file):
    try:
        last_digits = []  # Stores a list of extracted end-of-line characters
        with open(input_file, 'r') as f_input:
            for line in f_input:
                line = line.strip()  # Remove newlines and spaces at the end of lines
                if line and (line.endswith('0') or line.endswith('1')):
                    # Extract characters at end of line and convert to integer
                    last_digits.append(int(line[-1]))  

        # Convert extracted character list to NumPy array
        last_digits_array = np.array(last_digits)
        return last_digits_array
    except Exception as e:
        print(f"Error：{e}")
        return None

input_file ='imdb_labelled.txt'
labels = extract_last_digit(input_file)

# Divide the data set into training set and test set
X_train, X_test, y_train, y_test = train_test_split(features_tfidf, labels, test_size=0.1, random_state=42)



TF-IDF shape： (1000, 3113)


# Convolutional Neural Network (CNN)

In [5]:
import tensorflow as tf
from tensorflow.keras.layers import Reshape
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, Dropout
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

X_train_dense = X_train.toarray()  
X_test_dense = X_test.toarray()

model = Sequential()
model.add(Reshape((X_train_dense.shape[1], 1), input_shape=(X_train_dense.shape[1],)))  # Reshape for 1D convolution
model.add(Conv1D(64, kernel_size=5, activation='relu'))
model.add(MaxPooling1D(pool_size=5))
model.add(Flatten())
model.add(Dense(256, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # Or 'softmax'

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train_dense, y_train, epochs=6, batch_size=32)

y_pred = (model.predict(X_test_dense) > 0.5).astype("int32")

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='binary')  # Use binary for binary classification
precision = precision_score(y_test, y_pred, average='binary')
recall = recall_score(y_test, y_pred, average='binary')

# Print the metrics
print("Accuracy: {:.2f}".format(accuracy))
print("F1 Score: {:.2f}".format(f1))
print("Precision: {:.2f}".format(precision))
print("Recall: {:.2f}".format(recall))



Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Accuracy: 0.77
F1 Score: 0.78
Precision: 0.85
Recall: 0.71


Completed!