In [122]:
import pandas as pd
import numpy as np
import nltk
import tensorflow as tf
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.utils import resample,shuffle  # Import resample function


In [100]:
nltk.download('stopwords')
nltk.download('punkt')
# distinguishes stopwords 
# implements the punkt sentence tokenizer algorithm to break up sentences.

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\maadh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\maadh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [101]:
num_rows_to_load = 1000
df = pd.read_csv("C:\\Users\\maadh\\Downloads\\twitter_training.csv.zip", nrows = num_rows_to_load)


     2401  Borderlands  Positive  \
0    2401  Borderlands  Positive   
1    2401  Borderlands  Positive   
2    2401  Borderlands  Positive   
3    2401  Borderlands  Positive   
4    2401  Borderlands  Positive   
..    ...          ...       ...   
995  2577  Borderlands  Positive   
996  2577  Borderlands  Positive   
997  2577  Borderlands  Positive   
998  2577  Borderlands  Positive   
999  2577  Borderlands  Positive   

    im getting on borderlands and i will murder you all ,  
0    I am coming to the borders and I will kill you...     
1    im getting on borderlands and i will kill you ...     
2    im coming on borderlands and i will murder you...     
3    im getting on borderlands 2 and i will murder ...     
4    im getting into borderlands and i can murder y...     
..                                                 ...     
995              Who's down for some @Borderlands on       
996                    Who's on for some @ Borderlands     
997                        

In [103]:
def preprocess_text(text):
    # Tokenize the text
    tokens = word_tokenize(text)
    # Convert to lower case
    tokens = [word.lower() for word in tokens]
    # Remove stop words
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    # Join tokens back to string
    return ' '.join(tokens)


In [104]:
df.columns

Index(['2401', 'Borderlands', 'Positive',
       'im getting on borderlands and i will murder you all ,'],
      dtype='object')

In [105]:
df.rename(columns={'im getting on borderlands and i will murder you all ,': 'text'}, inplace=True)
df.rename(columns={'Positive': 'sentiment'}, inplace=True)

df['text'] = df['text'].apply(preprocess_text)

In [106]:
#Error was that The first letters of each sentiment weren't capitalized
sentiment_mapping = {'Negative': 0, 'Positive': 1, 'Neutral': 2}
df['sentiment'] = df['sentiment'].map(sentiment_mapping)

In [107]:
print("Unique sentiment values before cleanup:", df['sentiment'].unique())

# Remove rows with invalid labels
valid_labels = [0, 1, 2]
df = df[df['sentiment'].isin(valid_labels)]

print("Unique sentiment values after cleanup:", df['sentiment'].unique())

Unique sentiment values before cleanup: [ 1.  2.  0. nan]
Unique sentiment values after cleanup: [1. 2. 0.]


In [124]:
print(df['sentiment'].value_counts())

# Balance the dataset if necessary
df_negative = df[df['sentiment'] == 0]
df_positive = df[df['sentiment'] == 1]
df_neutral = df[df['sentiment'] == 2]

df_positive_resampled = resample(df_positive, replace=True, n_samples=len(df_negative), random_state=42)
df_neutral_resampled = resample(df_neutral, replace=True, n_samples=len(df_negative), random_state=42)

df_balanced = pd.concat([df_negative, df_positive_resampled, df_neutral_resampled])

sentiment
1.0    423
2.0    279
0.0    192
Name: count, dtype: int64


In [125]:
df_balanced = shuffle(df_balanced, random_state=42)



In [126]:
train_data, test_data, train_labels, test_labels = train_test_split(
    df['text'], df['sentiment'], test_size=0.2, random_state=42
)

In [127]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=10000)
tokenizer.fit_on_texts(train_data)


In [128]:
train_sequences = tokenizer.texts_to_sequences(train_data)
test_sequences = tokenizer.texts_to_sequences(test_data)

In [129]:
train_padded = tf.keras.preprocessing.sequence.pad_sequences(train_sequences, padding='post', maxlen=256)
test_padded = tf.keras.preprocessing.sequence.pad_sequences(test_sequences, padding='post', maxlen=256)


In [130]:
model = tf.keras.Sequential([
    layers.Embedding(input_dim=10000, output_dim=16, input_length=256),
    layers.GlobalAveragePooling1D(),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(16, activation='relu'),
    layers.Dense(3, activation='softmax')
])



In [131]:
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',  # Change loss function for multi-class classification
              metrics=['accuracy'])


In [132]:

history = model.fit(train_padded, train_labels, epochs=10, validation_data=(test_padded, test_labels), batch_size=512, verbose=1)
#input values aren't normalized which is causing error


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [133]:
test_loss, test_acc = model.evaluate(test_padded, test_labels)
print(f'Test Accuracy: {test_acc}')

# Make predictions
predictions = model.predict(test_padded)
predicted_labels = np.argmax(predictions, axis=1)

Test Accuracy: 0.5083798766136169


In [134]:
reverse_sentiment_mapping = {0: 'Negative', 1: 'Positive', 2: 'Neutral'}

# Convert numeric predictions to sentiment strings
predicted_sentiments = [reverse_sentiment_mapping[label] for label in predicted_labels]

# Print or save the results
results_df = pd.DataFrame({
    'Text': test_data,
    'Predicted Sentiment': predicted_sentiments,
    'Actual Sentiment': test_labels.map(reverse_sentiment_mapping)
})

print(results_df.head())

                                                  Text Predicted Sentiment  \
799      really like randy pitchford , 's helped lot .            Positive   
489  tales behind borderlands swaggiedeals . com / ...            Positive   
606  gamespot : borderlands 3 's dlc rejects major ...            Positive   
816  went bed 4am . 5 hours earlier 9pm graveward f...            Positive   
39   man gearbox really needs fix dissapointing dro...            Positive   

    Actual Sentiment  
799          Neutral  
489          Neutral  
606          Neutral  
816         Negative  
39          Negative  
