In [3]:
import pandas as pd
import re
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

In [4]:
# NLTK resources needed by the preprocessing step below:
#   stopwords -> stop-word filtering, wordnet -> lemmatization,
#   punkt / punkt_tab -> nltk.word_tokenize.
# Newer NLTK releases load the tokenizer from 'punkt_tab' instead of 'punkt';
# requesting both keeps the notebook runnable on old and new versions
# (an unknown package name only logs a message, it does not raise).
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('punkt_tab')

# Load the three dataset splits; the files are read as latin-1.
data_train = pd.read_csv('twitter_training.csv', encoding='latin-1')
data_test = pd.read_csv('twitter_test.csv', encoding='latin-1')
data_validation = pd.read_csv('twitter_validation.csv', encoding='latin-1')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
# Matches every character that is neither an ASCII letter nor in the range آ-ی.
_NON_LETTER_PATTERN = re.compile(r'[^a-zA-Zآ-ی]')

# Filled on first use so the stop-word corpus and the lemmatizer are built
# once instead of once per tweet.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the cached ``(stopword_set, lemmatizer)`` pair, creating it on first call."""
    if not _PREPROCESS_RESOURCES:
        # A frozenset gives constant-time membership tests during filtering.
        _PREPROCESS_RESOURCES['stopwords'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES['stopwords'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Normalize one tweet for vectorization.

    For string input the steps are: replace every character matched by
    ``_NON_LETTER_PATTERN`` with a space, lowercase, tokenize with
    ``nltk.word_tokenize``, drop English stop words, lemmatize each remaining
    token with ``WordNetLemmatizer`` and join the tokens with single spaces.

    Parameters
    ----------
    text : Any
        The raw cell value. Only ``str`` values are processed.

    Returns
    -------
    Any
        The cleaned string, or ``text`` itself, unchanged, when it is not a
        string (for example a NaN from a missing CSV cell).
    """
    if not isinstance(text, str):
        # Leave non-string values untouched so the caller can handle them.
        return text

    stopword_set, lemmatizer = _get_preprocess_resources()

    # Remove special characters and numbers, then lowercase.
    cleaned = _NON_LETTER_PATTERN.sub(' ', text).lower()

    # Tokenize, filter stop words and lemmatize in a single pass.
    tokens = [
        lemmatizer.lemmatize(token)
        for token in nltk.word_tokenize(cleaned)
        if token not in stopword_set
    ]

    # Join the tokens back into a single string.
    return ' '.join(tokens)

# Clean the tweet text of every split with the same preprocessing function.
for tweets_frame in (data_train, data_test, data_validation):
    tweets_frame['Tweet content'] = tweets_frame['Tweet content'].apply(preprocess_text)

In [6]:
# Learn the label -> integer mapping on the training split only, then reuse
# that same mapping for the test and validation splits.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(data_train['sentiment'])
y_test, y_validation = (
    label_encoder.transform(split_frame['sentiment'])
    for split_frame in (data_test, data_validation)
)

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Replace NaN values with an empty string.
# The result is assigned back to the column: calling
# `df[col].fillna(..., inplace=True)` acts on an intermediate Series and is not
# guaranteed to update the DataFrame (it warns in pandas 2.x and has no effect
# under Copy-on-Write).
data_train['Tweet content'] = data_train['Tweet content'].fillna('')
data_test['Tweet content'] = data_test['Tweet content'].fillna('')
data_validation['Tweet content'] = data_validation['Tweet content'].fillna('')

# Fit the TF-IDF vocabulary on the training tweets only and reuse it for the
# other splits so that all three matrices share the same feature columns.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(data_train['Tweet content'])
X_test = vectorizer.transform(data_test['Tweet content'])
X_validation = vectorizer.transform(data_validation['Tweet content'])

In [8]:
# Hold out 20% of the training rows for validation; the fixed seed keeps the
# split reproducible.
(X_train, X_val,
 y_train, y_val) = train_test_split(X_train, y_train, random_state=42, test_size=0.2)

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Choose the output head from the number of classes the label encoder learned.
# A single sigmoid unit with binary cross-entropy is only valid for two
# classes; with more classes the integer labels need one softmax unit per
# class and sparse categorical cross-entropy.
num_classes = len(label_encoder.classes_)
if num_classes > 2:
    output_units = num_classes
    output_activation = 'softmax'
    loss_name = 'sparse_categorical_crossentropy'
else:
    output_units = 1
    output_activation = 'sigmoid'
    loss_name = 'binary_crossentropy'

# Feed-forward classifier over the TF-IDF features: two hidden ReLU layers,
# each followed by 20% dropout.
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(output_units, activation=output_activation))

model.compile(loss=loss_name, optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

# Convert the sparse TF-IDF matrices to dense arrays before fitting.
X_train_dense = X_train.toarray()
X_val_dense = X_val.toarray()

# Train for 10 epochs, reporting metrics on the held-out validation rows.
history = model.fit(X_train_dense, y_train, batch_size=64, epochs=10, validation_data=(X_val_dense, y_val))

In [None]:
# Evaluate on the test split. The model was fitted on dense arrays, so the
# sparse TF-IDF test matrix is densified the same way before being passed in.
loss, accuracy = model.evaluate(X_test.toarray(), y_test)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

In [None]:
# `Sequential.predict_classes` was removed in TensorFlow 2.6. This reproduces
# what it did: argmax over a multi-unit output, or a 0.5 threshold on a
# single sigmoid unit. The input is densified to match how the model was trained.
probabilities = model.predict(X_test.toarray())
if probabilities.shape[-1] > 1:
    predictions = probabilities.argmax(axis=-1)
else:
    predictions = (probabilities > 0.5).astype('int32')

In [None]:
# Turn the predicted class ids back into their original sentiment labels,
# attach them to the test frame and write the result to an Excel workbook.
flat_class_ids = predictions.reshape(-1)
predicted_sentiments = label_encoder.inverse_transform(flat_class_ids)
data_test['Predicted Sentiment'] = predicted_sentiments
data_test.to_excel(excel_writer='neural_network_predictions.xlsx', index=False)