In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("Twitter_Data.csv")

In [3]:
df.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [4]:
df['category'] = df['category'].map({0: 'Neutral', -1: 'Negative', 1: 'Positive'})

checking for null values in each column

In [5]:
print(df.isnull().sum())

clean_text    4
category      7
dtype: int64


droping rows with null values

In [6]:
df.dropna(inplace=True)

In [7]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [8]:
import re
import nltk
import string
from nltk.corpus import stopwords

downloading stopwords

In [9]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mamta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

removing special characters

In [10]:
df['clean_text'] = df['clean_text'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x))

converting to lowercase

In [11]:
df['clean_text'] = df['clean_text'].apply(lambda x: x.lower())

removing punctuation

In [12]:
translator = str.maketrans('', '', string.punctuation)
df['clean_text'] = df['clean_text'].apply(lambda x: x.translate(translator))

removing stopwords

In [13]:
stop_words = set(stopwords.words('english'))
df['clean_text'] = df['clean_text'].apply(lambda x: [word for word in x if word not in stop_words])

In [14]:
df['sentence_length'] = df['clean_text'].apply(lambda x: len(x))

In [15]:
df.head()

Unnamed: 0,clean_text,category,sentence_length
0,"[w, h, e, n, , , p, r, e, , n, u, , g, v, ...",Negative,123
1,"[l, k, , l, l, , h, e, , n, n, e, n, e, , ...",Neutral,42
2,"[w, h, , , j, u, , , v, e, , f, r, , , ...",Positive,67
3,"[k, n, g, , h, , u, p, p, r, e, r, , p, r, ...",Positive,129
4,"[n, w, e, r, , w, h, , n, g, , h, e, e, , ...",Positive,47


Spliting the data into X (features) and y (target)

In [16]:
X = df.drop(['category'], axis=1)
y = df['category']

In [17]:
import tensorflow as tf

building vocabulary and one-hot encoding

In [18]:
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(X['clean_text'])
X_one_hot = tokenizer.texts_to_matrix(X['clean_text'], mode='binary')

pad sequences with zeros at the front

In [19]:
max_length = max([len(seq) for seq in X_one_hot])
X_padded = tf.keras.preprocessing.sequence.pad_sequences(X_one_hot, maxlen=max_length, padding='pre')

building model

In [20]:
model = tf.keras.Sequential()
model.add(tf.keras.layers.LSTM(32, input_shape=(max_length, len(tokenizer.word_index)+1)))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

compiling the model

In [21]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [22]:
y_dummy = pd.get_dummies(y)

In [23]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_dummy, test_size=0.2)

Reshaping input data to have shape (batch_size, timesteps, features)

In [24]:
X_train = tf.expand_dims(X_train, axis=2)
X_test = tf.expand_dims(X_test, axis=2)

In [None]:
history = model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test))

In [None]:
predictions = model.predict(X_test)
predictions = [round(pred) for pred in predictions]

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))