<a href="https://colab.research.google.com/github/KaranTejwani/deep-learning-practise/blob/main/Sentiment_analysis_with_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
dataset_path = '/content/drive/My Drive/datasets/sentiment140.csv'

In [3]:
import pandas as pd

df = pd.read_csv(dataset_path, encoding='latin-1', header=None)
# df.columns = ['sentiment', 'id', 'date', 'query', 'user', 'text']
display(df.head())

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [4]:
df.head()

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [5]:
labels = df.iloc[:, 0]

In [6]:
labels

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0
...,...
1599995,4
1599996,4
1599997,4
1599998,4


In [7]:
tweets = df.iloc[:, -1]

In [8]:
tweets

Unnamed: 0,5
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,is upset that he can't update his Facebook by ...
2,@Kenichan I dived many times for the ball. Man...
3,my whole body feels itchy and like its on fire
4,"@nationwideclass no, it's not behaving at all...."
...,...
1599995,Just woke up. Having no school is the best fee...
1599996,TheWDB.com - Very cool to hear old Walt interv...
1599997,Are you ready for your MoJo Makeover? Ask me f...
1599998,Happy 38th Birthday to my boo of alll time!!! ...


In [9]:
label_counts = labels.value_counts()
label_counts

Unnamed: 0_level_0,count
0,Unnamed: 1_level_1
0,800000
4,800000


**Cleaning the tweets and removing stop words and punctuation**

In [10]:
import nltk
from nltk.corpus import stopwords

In [11]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
import string

stop_words = set(stopwords.words('english'))

def clean_tweet(tweet):
    # Remove punctuation
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))
    # Convert to lowercase and tokenize by splitting
    words = tweet.lower().split()
    # Filter out stop words
    cleaned_tokens = [word for word in words if word not in stop_words]
    return ' '.join(cleaned_tokens)

In [13]:
cleaned_tweets = tweets.apply(clean_tweet)
display(cleaned_tweets.head())

Unnamed: 0,5
0,switchfoot httptwitpiccom2y1zl awww thats bumm...
1,upset cant update facebook texting might cry r...
2,kenichan dived many times ball managed save 50...
3,whole body feels itchy like fire
4,nationwideclass behaving im mad cant see


In [14]:
from sklearn.model_selection import train_test_split
train_x, test_x, train_y, test_y = train_test_split(cleaned_tweets, labels, test_size=0.3, random_state=42, stratify=labels)

In [15]:
train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size=0.2, random_state=42, stratify=train_y)

In [16]:
from tensorflow.keras.layers import TextVectorization

In [17]:
# 1. Create vectorizer
text_vectorization = TextVectorization(
    max_tokens=10000,
    output_mode="multi_hot",
    ngrams=1
)

In [18]:
text_vectorization.adapt(train_x)

In [19]:
text_vectorization.get_vocabulary()

['[UNK]',
 np.str_('im'),
 np.str_('good'),
 np.str_('day'),
 np.str_('get'),
 np.str_('like'),
 np.str_('go'),
 np.str_('dont'),
 np.str_('today'),
 np.str_('going'),
 np.str_('love'),
 np.str_('work'),
 np.str_('cant'),
 np.str_('got'),
 np.str_('time'),
 np.str_('back'),
 np.str_('lol'),
 np.str_('u'),
 np.str_('one'),
 np.str_('know'),
 np.str_('really'),
 np.str_('see'),
 np.str_('well'),
 np.str_('still'),
 np.str_('want'),
 np.str_('new'),
 np.str_('night'),
 np.str_('think'),
 np.str_('amp'),
 np.str_('home'),
 np.str_('thanks'),
 np.str_('2'),
 np.str_('oh'),
 np.str_('much'),
 np.str_('miss'),
 np.str_('need'),
 np.str_('last'),
 np.str_('morning'),
 np.str_('hope'),
 np.str_('tomorrow'),
 np.str_('great'),
 np.str_('ill'),
 np.str_('twitter'),
 np.str_('thats'),
 np.str_('haha'),
 np.str_('feel'),
 np.str_('sad'),
 np.str_('fun'),
 np.str_('wish'),
 np.str_('right'),
 np.str_('didnt'),
 np.str_('sleep'),
 np.str_('bad'),
 np.str_('would'),
 np.str_('happy'),
 np.str_('sorry'

In [None]:
x_train_vec = text_vectorization(train_x)
x_val_vec   = text_vectorization(val_x)
x_test_vec  = text_vectorization(test_x)