In [2]:
import re
from string import punctuation
from nltk.corpus import stopwords
# if stopwords is not downloaded, please uncomment the 2 lines below
#import nltk
#nltk.dowload('stopwords')

# load spacys vocab
import spacy
# if en_core_web_lg is not found then uncomment the line below
# !python -m spacy download en_core_web_lg

nlp = spacy.load('en_core_web_lg')


In [4]:
import pandas as pd

train = pd.read_csv('dataset/Corona_NLP_train.csv', encoding='latin1')
test = pd.read_csv('dataset/Corona_NLP_test.csv', encoding='latin1')


In [5]:
# dropping some unnecessary columns
train.drop(["UserName", "ScreenName", "Location", "TweetAt"], axis=1, inplace=True)

In [32]:
train.head()

Unnamed: 0,OriginalTweet,Sentiment
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,advice Talk to your neighbours family to excha...,Positive
2,Coronavirus Australia: Woolworths to give elde...,Positive
3,My food stock is not the only one which is emp...,Positive
4,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [6]:
def preprocess(text):
    doc = nlp(text)
    result = []
    for token in doc:
        if not token.is_stop and token.is_alpha:
            result.append(token.lemma_)
    return ' '.join([w for w in result])

def process_text(text):
  text = str(text) #Convert string to str
  #Lowers the string
  text = text.lower()
  #Removes the full url
  url_remove = re.compile(r'(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?')
  text = re.sub(url_remove,' ',text)
  #Removes the punctuation
  text = ''.join([string for string in text if string not in punctuation and not string.isdigit()])
  #Removes any more special characters
  special_character = re.compile(r'[^a-zA-Z]')
  text = re.sub(special_character,' ', text)
  text = text.strip() #Strip white spaces
  text = text.split(' ')
  text = ' '.join([string for string in text if string not in stopwords.words('english')])#Removing all stop words
  return text

train['OriginalTweet'].apply(process_text)

0                          menyrbie philgahan chrisitv    
1        advice talk neighbours family exchange phone n...
2        coronavirus australia woolworths give elderly ...
3        food stock one empty      please dont panic en...
4        ready go supermarket covid outbreak      im pa...
                               ...                        
41152    airline pilots offering stock supermarket shel...
41153    response complaint provided citing covid relat...
41154    know  getting tough kameronwilds  rationing to...
41155    wrong smell hand sanitizer starting turn      ...
41156    tartiicat well newused rift going  amazon rn a...
Name: OriginalTweet, Length: 41157, dtype: object

In [16]:
train["Sentiment"].value_counts()

@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/iFz9FAn2Pa and https://t.co/xX6ghGFzCC and https://t.co/I2NlzdxNo8
Pa


In [9]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

pipe = Pipeline([('count_vectorizer', CountVectorizer(binary=False)), ('multi_naive_bayes', MultinomialNB())])
pipe.fit(train['OriginalTweet'], train['Sentiment'])
predicted = pipe.predict(test['OriginalTweet'])
print(classification_report(test['Sentiment'], predicted))

                    precision    recall  f1-score   support

Extremely Negative       0.66      0.17      0.28       592
Extremely Positive       0.75      0.22      0.34       599
          Negative       0.42      0.56      0.48      1041
           Neutral       0.70      0.20      0.31       619
          Positive       0.36      0.72      0.48       947

          accuracy                           0.43      3798
         macro avg       0.58      0.38      0.38      3798
      weighted avg       0.54      0.43      0.40      3798

