In [4]:
# Standard-library imports
import re
import warnings

# Third-party imports
import numpy as np
import pandas as pd

# Let pandas render up to 1000 rows before it truncates a displayed frame
pd.set_option('display.max_rows', 1000)

# Suppress every warning for the rest of the session
warnings.filterwarnings('ignore')

#### Loading the training and test datasets

In [5]:
# Read the training and test CSV files (both decoded as latin-1) in one go
train, test = (
    pd.read_csv(csv_path, encoding='latin-1')
    for csv_path in ('./Corona_NLP_train.csv', './Corona_NLP_test.csv')
)


In [6]:
# Report (rows, columns) for each dataset; the labels are padded so the colons line up
for label, frame in (
    ('Training dataset rows & col :', train),
    ('Test dataset rows & col     :', test),
):
    print(label, frame.shape)


Training dataset rows & col : (41157, 6)
Test dataset rows & col     : (3798, 6)


In [7]:
# Preview the first rows of the training data
train.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [8]:
# Display the test data; the rendered table is abbreviated with an ellipsis row
test

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,1,44953,NYC,02-03-2020,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative
1,2,44954,"Seattle, WA",02-03-2020,When I couldn't find hand sanitizer at Fred Me...,Positive
2,3,44955,,02-03-2020,Find out how you can protect yourself and love...,Extremely Positive
3,4,44956,Chicagoland,02-03-2020,#Panic buying hits #NewYork City as anxious sh...,Negative
4,5,44957,"Melbourne, Victoria",03-03-2020,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral
...,...,...,...,...,...,...
3793,3794,48746,Israel ??,16-03-2020,Meanwhile In A Supermarket in Israel -- People...,Positive
3794,3795,48747,"Farmington, NM",16-03-2020,Did you panic buy a lot of non-perishable item...,Negative
3795,3796,48748,"Haverford, PA",16-03-2020,Asst Prof of Economics @cconces was on @NBCPhi...,Neutral
3796,3797,48749,,16-03-2020,Gov need to do somethings instead of biar je r...,Extremely Negative


In [9]:
train.isna().sum() # number of missing values in each column of the training dataset

UserName            0
ScreenName          0
Location         8590
TweetAt             0
OriginalTweet       0
Sentiment           0
dtype: int64

In [10]:
test.isna().sum() # number of missing values in each column of the test dataset

UserName           0
ScreenName         0
Location         834
TweetAt            0
OriginalTweet      0
Sentiment          0
dtype: int64

- For sentiment analysis we do not need the `UserName`, `ScreenName`, `Location` and `TweetAt` columns.
- `UserName` is only a row index here, and `ScreenName` is likewise just an incrementing integer.
- `Location` also has too many null values.

In [11]:
# Keep only the tweet text and its sentiment label in both datasets
train, test = (
    frame.drop(['UserName', 'ScreenName', 'Location', 'TweetAt'], axis=1)
    for frame in (train, test)
)

In [12]:
# Distinct sentiment labels present in the training data
train['Sentiment'].unique()

array(['Neutral', 'Positive', 'Extremely Negative', 'Negative',
       'Extremely Positive'], dtype=object)

In [13]:
# Number of training tweets per sentiment label
train['Sentiment'].value_counts()

Sentiment
Positive              11422
Negative               9917
Neutral                7713
Extremely Positive     6624
Extremely Negative     5481
Name: count, dtype: int64

In [14]:
# Distinct sentiment labels present in the test data
test['Sentiment'].unique()

array(['Extremely Negative', 'Positive', 'Extremely Positive', 'Negative',
       'Neutral'], dtype=object)

In [15]:
# Number of test tweets per sentiment label
test['Sentiment'].value_counts()

Sentiment
Negative              1041
Positive               947
Neutral                619
Extremely Positive     599
Extremely Negative     592
Name: count, dtype: int64

In [16]:
# Ordered (pattern, replacement) steps applied to every tweet.
# The order matters: URLs, @mentions and #hashtags have to be removed before
# the generic "special characters" step, which would otherwise strip their
# '://', '@' and '#' markers and leave the rest of the token behind.
_TWEET_CLEANING_STEPS = (
    (re.compile(r'https?://\S+'), ' '),     # web URLs
    (re.compile(r'@\w*'), ' '),             # @mentions
    (re.compile(r'#\w*'), ' '),             # #hashtags (the whole tag is dropped)
    (re.compile(r"'s\b"), ''),              # trailing possessive 's
    (re.compile(r"[^a-zA-Z0-9\s]"), ' '),   # anything that is not a letter, digit or whitespace
    (re.compile(r'[\d]'), ' '),             # digits
)


def _clean_tweet_text(text):
    """Apply every cleaning step to one tweet and return it lower-cased."""
    for pattern, replacement in _TWEET_CLEANING_STEPS:
        text = pattern.sub(replacement, text)
    return text.lower()


def cleaning(data):
    """Return a cleaned copy of a tweets DataFrame.

    All column names are lower-cased, and the text in the ``originaltweet``
    column has URLs, @mentions, #hashtags, possessive ``'s``, special
    characters and digits removed (most are replaced by a single space, so
    runs of spaces can remain) before being lower-cased.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a tweet-text column whose lower-cased name is
        ``originaltweet``. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with lower-cased column names and cleaned text.
        Missing tweets (NaN / None) are left as missing instead of raising.

    Raises
    ------
    KeyError
        If there is no ``originaltweet`` column after lower-casing.
    """
    df = data.copy()
    df.columns = list(map(str.lower, df.columns))

    # One pass over the column; na_action='ignore' skips missing values so a
    # null tweet no longer crashes the regex substitution.
    df['originaltweet'] = df['originaltweet'].map(_clean_tweet_text, na_action='ignore')
    return df

# Run the same cleaning routine over the training and the test frame
cleaned_train, cleaned_test = map(cleaning, (train, test))

In [17]:
# Display both cleaned frames together as a tuple
cleaned_train, cleaned_test

(                                           originaltweet           sentiment
 0                                            and   and               Neutral
 1      advice talk to your neighbours family to excha...            Positive
 2      coronavirus australia  woolworths to give elde...            Positive
 3      my food stock is not the only one which is emp...            Positive
 4      me  ready to go at supermarket during the   ou...  Extremely Negative
 ...                                                  ...                 ...
 41152  airline pilots offering to stock supermarket s...             Neutral
 41153  response to complaint not provided citing covi...  Extremely Negative
 41154  you know it  s getting tough when    is ration...            Positive
 41155  is it wrong that the smell of hand sanitizer i...             Neutral
 41156    well new used rift s are going for         o...            Negative
 
 [41157 rows x 2 columns],
                                   

In [18]:
# Fold the "Extremely ..." labels into their plain counterparts (training data)
cleaned_train['sentiment'] = (
    cleaned_train['sentiment']
    .str.replace('Extremely Positive', 'Positive')
    .str.replace('Extremely Negative', 'Negative')
)

# Same label merge for the test data
cleaned_test['sentiment'] = (
    cleaned_test['sentiment']
    .str.replace('Extremely Positive', 'Positive')
    .str.replace('Extremely Negative', 'Negative')
)

In [19]:
# importing sklearn required libraries
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [20]:
# Model input (cleaned tweet text) and target (sentiment label)
X, y = cleaned_train['originaltweet'], cleaned_train['sentiment']

# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [21]:
X_train.shape, X_val.shape, y_train.shape, y_val.shape # shapes of the training/validation inputs and labels

((32925,), (8232,), (32925,), (8232,))

In [22]:
vectorizer = CountVectorizer() # initializing the bag-of-words vectorizer
X_train_vectorized = vectorizer.fit_transform(X_train) # learning the vocabulary from the training split and encoding it
X_val_vectorized = vectorizer.transform(X_val)  # encoding the validation split with the already-fitted vocabulary (transform only, no refit)

In [23]:
# Fit a multinomial Naive Bayes classifier on the training word counts
# (`fit` returns the estimator itself, so construction and fitting are chained)
model = MultinomialNB().fit(X_train_vectorized, y_train)

# Predict the validation split and report the share of correct predictions
y_pred = model.predict(X_val_vectorized)
accuracy = accuracy_score(y_val, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.6922983479105929


In [24]:
# importing required tensorflow libraries
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

ModuleNotFoundError: No module named 'tensorflow'

In [None]:
# Build the word -> integer-id vocabulary from the training split only
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# Encode both splits as sequences of word ids using that vocabulary
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_val_sequences = tokenizer.texts_to_sequences(X_val)

# Pad / truncate every sequence to one fixed length so they can be batched
max_sequence_length = 100
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_sequence_length)
X_val_padded = pad_sequences(X_val_sequences, maxlen=max_sequence_length)

# Encode the string sentiment labels as consecutive integers
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_val_encoded = label_encoder.transform(y_val)

# Size the output layer from the labels that are actually present instead of
# hard-coding a class count that can drift from the label preprocessing
num_classes = len(label_encoder.classes_)

# Distribution strategy: the model's variables must be created (and the model
# compiled) inside the strategy scope for the strategy to apply to them.
# Building the model first and only aliasing it inside the scope has no effect.
strategy = tf.distribute.MirroredStrategy()

with strategy.scope():
    model = Sequential()
    # +1 because Tokenizer word ids start at 1; id 0 is the padding value
    model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=max_sequence_length))
    model.add(LSTM(128))
    model.add(Dense(num_classes, activation='softmax'))

    # Integer-encoded targets -> sparse categorical cross-entropy
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Kept as a second name for the same model so code using `mirrored_model` still works
mirrored_model = model

# Train the model, monitoring the validation split after every epoch
mirrored_model.fit(X_train_padded, y_train_encoded, validation_data=(X_val_padded, y_val_encoded), epochs=5, batch_size=32)

# Predict class probabilities for the validation split and take the most likely class
y_pred_probs = mirrored_model.predict(X_val_padded)
y_pred_encoded = tf.argmax(y_pred_probs, axis=1).numpy()

# Map the integer predictions back to the original string labels
y_pred_labels = label_encoder.inverse_transform(y_pred_encoded)

# Accuracy of the model on the validation split
accuracy = accuracy_score(y_val, y_pred_labels)
print("Accuracy:", accuracy)



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy: 0.8797376093294461
