In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import numpy as np

# Load the raw CSV. The file carries no header row, so the column names are
# supplied explicitly, and it is decoded as latin-1.
data = pd.read_csv(
    "sentiment_text_data.csv",
    names=["polarity", "id", "date", "query", "user", "text"],
    header=None,
    encoding="latin-1",
)
print(data.head())

   polarity          id                          date     query  \
0         0  1467810369  Mon Apr 06 22:19:45 PDT 2009  NO_QUERY   
1         0  1467810672  Mon Apr 06 22:19:49 PDT 2009  NO_QUERY   
2         0  1467810917  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY   
3         0  1467811184  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   
4         0  1467811193  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   

              user                                               text  
0  _TheSpecialOne_  @switchfoot http://twitpic.com/2y1zl - Awww, t...  
1    scotthamilton  is upset that he can't update his Facebook by ...  
2         mattycus  @Kenichan I dived many times for the ball. Man...  
3          ElleCTF    my whole body feels itchy and like its on fire   
4           Karoli  @nationwideclass no, it's not behaving at all....  


In [3]:
# Keep only the label and the text; the other columns are not used below.
# Rebinding `data` (rather than inplace=True) together with errors="ignore"
# makes this cell safe to re-run: a second execution is a no-op instead of a
# KeyError on the already-removed columns.
data = data.drop(columns=["id", "date", "query", "user"], errors="ignore")

In [4]:
# Sanity check: only `polarity` and `text` should remain.
print(data.head(n=5))

   polarity                                               text
0         0  @switchfoot http://twitpic.com/2y1zl - Awww, t...
1         0  is upset that he can't update his Facebook by ...
2         0  @Kenichan I dived many times for the ball. Man...
3         0    my whole body feels itchy and like its on fire 
4         0  @nationwideclass no, it's not behaving at all....


In [5]:
import re

In [6]:
def clean_text(text):
    """Normalize one raw text string.

    Steps, in order: remove URLs, @mentions and #hashtags (each is removed up
    to the next whitespace), drop every character that is not an ASCII letter
    or whitespace, then lowercase the result.

    Args:
        text: The raw text. Non-string values (e.g. None, or the NaN that
            stands in for a missing field) are treated as missing text.

    Returns:
        The cleaned, lowercased string, or "" for missing input. Whitespace
        is not collapsed, so removed tokens can leave runs of spaces behind.
    """
    if not isinstance(text, str):
        # Missing values carry no text; returning "" avoids a TypeError in re.sub.
        return ""

    # Match the URL scheme case-insensitively: "HTTP://..." must be removed
    # here, otherwise the later steps squash it into a single junk token.
    text = re.sub(r"http\S+", "", text, flags=re.IGNORECASE)  # Remove URLs
    text = re.sub(r"@\S+", "", text)                          # Remove mentions
    text = re.sub(r"#\S+", "", text)                          # Remove hashtags
    text = re.sub(r"[^a-zA-Z\s]", "", text)                   # Remove non-alphabetical

    return text.lower()  # Convert to lowercase

In [7]:
# Run every row's text through clean_text and store the result back in place
# of the raw text.
data["text"] = data["text"].apply(clean_text)

In [8]:
import nltk
from nltk.corpus import stopwords
# Make sure the NLTK stop-word corpus is available locally.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /home/sohn31/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
# Build the English stop-word lookup once; a set keeps the per-token
# membership test cheap.
stop_words = set(stopwords.words("english"))

# Split each text on whitespace, discard stop words, and rejoin the remaining
# tokens with single spaces.
data["text"] = data["text"].apply(
    lambda text: " ".join(token for token in text.split() if token not in stop_words)
)

In [10]:
# Inspect the first rows after cleaning and stop-word removal.
print(data.head(n=5))

   polarity                                               text
0         0  awww thats bummer shoulda got david carr third...
1         0  upset cant update facebook texting might cry r...
2         0  dived many times ball managed save rest go bounds
3         0                   whole body feels itchy like fire
4         0                           behaving im mad cant see


In [11]:
# Inspect the last 100 rows of the frame.
print(data.tail(n=100))

         polarity                                               text
1599900         4  goal stocks like mtxx help one person avoided ...
1599901         4           thats im thinking knock water satisfying
1599902         4  yeah remotes car couldnt find mine didnt know ...
1599903         4  post le mans pics didnt really shoot much bit ...
1599904         4                                       traitor love
...           ...                                                ...
1599995         4                      woke school best feeling ever
1599996         4            thewdbcom cool hear old walt interviews
1599997         4                    ready mojo makeover ask details
1599998         4  happy th birthday boo alll time tupac amaru sh...
1599999         4                                              happy

[100 rows x 2 columns]


In [None]:
# Model inputs: the cleaned text and an integer class label per row.
X = data['text']
# Mapping: 1 = positive (polarity 4), 0 = negative (polarity 0), 2 = neutral (anything else)
y = data['polarity'].apply(lambda x: 1 if x == 4 else (0 if x == 0 else 2))

# Split the raw text BEFORE vectorizing so the TF-IDF vocabulary and IDF
# weights are learned from the training portion only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# float32 halves the memory of the dense arrays below compared with the
# default float64. The vectorizer is fitted exactly once, on the training
# split; the earlier extra fit on the whole dataset was discarded by the
# refit anyway and only cost a second full pass over the corpus.
vectorizer = TfidfVectorizer(max_features=5000, dtype=np.float32)

# NOTE(review): .toarray() materializes dense (n_samples, <=5000) matrices.
# With the ~1.6M rows shown by data.tail() above, the training matrix alone is
# roughly 25 GB even in float32. Kept dense because later cells are not
# visible here; consider keeping the sparse matrices and densifying per batch.
X_train_tfidf = vectorizer.fit_transform(X_train).toarray()

# Reuse the train-fitted vocabulary for the held-out split (transform only).
X_test_tfidf = vectorizer.transform(X_test).toarray()

# Plain NumPy label arrays.
y_train = np.array(y_train)
y_test = np.array(y_test)


In [None]:
# Peek at the first few training texts (raw strings, not TF-IDF vectors).
print(X_train.head(n=5))

In [None]:

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical