In [1]:
# Jovian Commit Essentials
# Please retain and execute this cell without modifying the contents for `jovian.commit` to work
!pip install jovian --upgrade -q
import jovian
jovian.set_project('sentiment-analysis')
jovian.set_colab_id('1bHnlUQNGSGgVwaNMH8jxgluwL5sXcdtt')

# sentiment-analysis

Use the "Run" button to execute the code.

In [2]:
!pip install jovian --upgrade --quiet

In [3]:
import jovian

In [50]:
# Execute this to save new versions of the notebook
jovian.commit(project="sentiment-analysis")

[jovian] Detected Colab notebook...[0m
[jovian] Uploading colab notebook to Jovian...[0m
Committed successfully! https://jovian.ai/infi-09/sentiment-analysis


'https://jovian.ai/infi-09/sentiment-analysis'

In [5]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [6]:
#os.environ['KAGGLE_CONFIG_DIR'] = '/content/gdrive/MyDrive/Datasets'
#%cd /content/gdrive/MyDrive/Datasets
#!kaggle datasets download -d arkhoshghalb/twitter-sentiment-analysis-hatred-speech
#!mkdir sentiment-anlaysis
#!mv twitter-sentiment-analysis-hatred-speech.zip sentiment-anlaysis
#%cd /content/gdrive/MyDrive/Datasets/sentiment-anlaysis
#!unzip twitter-sentiment-analysis-hatred-speech.zip

In [43]:
import re
import os
import nltk
import numpy as np
import pandas as pd
from textblob import TextBlob
from string import punctuation
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
train = pd.read_csv('/content/gdrive/MyDrive/Datasets/sentiment-anlaysis/train.csv')

In [9]:
train.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [10]:
train.drop('id', axis=1, inplace=True)
train['label'].value_counts()

0    29720
1     2242
Name: label, dtype: int64

In [11]:
len(train), len(test)

(31962, 17197)

# Calculating the length of stopwords in each tweets and removing it

In [12]:
stop = stopwords.words('english')

In [13]:
def stopword(data):
  data['stopwords'] = data['tweet'].apply(lambda x: len([x for x in x.split() if x in stop]))
  print(data[['tweet', 'stopwords']].head())

In [14]:
stopword(train)

                                               tweet  stopwords
0   @user when a father is dysfunctional and is s...         10
1  @user @user thanks for #lyft credit i can't us...          5
2                                bihday your majesty          1
3  #model   i love u take with u all the time in ...          5
4             factsguide: society now    #motivation          1


In [15]:
def stopword_removal(data):
  data['tweet'] = data['tweet'].apply(lambda x: ' '.join(x for x in x.split() if x not in stop))
  print(data['tweet'].head())

In [16]:
stopword_removal(train)

0    @user father dysfunctional selfish drags kids ...
1    @user @user thanks #lyft credit can't use caus...
2                                       bihday majesty
3    #model love u take u time urð±!!! ððð...
4                      factsguide: society #motivation
Name: tweet, dtype: object


# Removing Punctuation

In [17]:
def punctuation(data):
  data['tweet'] = data['tweet'].str.replace(r'[^\w\s]', '')
  print(data['tweet'].head())

In [18]:
punctuation(train)

0    user father dysfunctional selfish drags kids d...
1    user user thanks lyft credit cant use cause of...
2                                       bihday majesty
3                model love u take u time urð ðððð ððð
4                        factsguide society motivation
Name: tweet, dtype: object


# Removing Most Frequent Words in the Tweets

In [19]:
most_frequency = pd.Series(' '.join(train['tweet']).split()).value_counts()[:10]
most_frequency = list(most_frequency.index)
most_frequency

['user', 'love', 'ð', 'day', 'â', 'happy', 'amp', 'im', 'u', 'time']

In [22]:
def remove_most_frequent(data):
  data['tweet'] = data['tweet'].apply(lambda x: ' '.join(x for x in x.split() if x not in most_frequency))
  print(data['tweet'].head())

In [23]:
remove_most_frequent(train)

0    father dysfunctional selfish drags kids dysfun...
1    thanks lyft credit cant use cause offer wheelc...
2                                       bihday majesty
3                              model take urð ðððð ððð
4                        factsguide society motivation
Name: tweet, dtype: object


# Removing Rare words in the Tweets

In [24]:
rare_words = pd.Series(' '.join(train['tweet']).split()).value_counts()[-10:]
rare_words = list(rare_words.index)
rare_words

['divided',
 'greedyunempathic',
 'dadshopefully',
 'lovelypeople',
 'thingies',
 'moodðð',
 'mitchell',
 'piervld',
 'accra',
 'bownaker']

In [25]:
def remove_rare_words(data):
  data['tweet'] = data['tweet'].apply(lambda x: ' '.join(x for x in x.split() if x not in rare_words))
  print(data['tweet'].head())

In [26]:
remove_rare_words(train)

0    father dysfunctional selfish drags kids dysfun...
1    thanks lyft credit cant use cause offer wheelc...
2                                       bihday majesty
3                              model take urð ðððð ððð
4                        factsguide society motivation
Name: tweet, dtype: object


# Stemming

In [27]:
stemmer = PorterStemmer()

In [28]:
corpus=[]

for i in range(len(train)):
    review = re.sub("[^a-zA-Z]"," ",str(train["tweet"][i]))
    review = review.lower() 
    review = review.split()
    review = [stemmer.stem(word) for word in review ]
    review = " ".join(review)
    corpus.append(review)

In [29]:
corpus[:5]

['father dysfunct selfish drag kid dysfunct run',
 'thank lyft credit cant use caus offer wheelchair van pdx disapoint getthank',
 'bihday majesti',
 'model take ur',
 'factsguid societi motiv']

In [33]:
len(corpus), len(train['label'])

(31962, 31962)

# TFIDF Vectorizer

In [34]:
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(corpus[:10000]).toarray()
y = train['label'][:10000]

# Split our data in order to train the model

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size = 0.3,
                                                    random_state=99)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((7000, 15699), (3000, 15699), (7000,), (3000,))

# Naive Bayes Model

In [41]:
naive = MultinomialNB()
naive.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [42]:
y_preds = naive.predict(X_test)

In [47]:
confusion_matrix(y_preds, y_test)

array([[2790,  202],
       [   0,    8]])

In [48]:
acc_naive = accuracy_score(y_preds, y_test)
acc_naive

0.9326666666666666

# TextBlob

In [49]:
polarity = 0
positive = 0
negative = 0
neutral = 0

for review in corpus:
  analysis = TextBlob(review)
  tweet_polarity = analysis.polarity
  if tweet_polarity > 0:
    positive +=1
  elif tweet_polarity < 0:
    negative +=1
  else:
    neutral +=1  
  polarity += tweet_polarity

print('The Amount of Positive Tweets: ', positive) 
print('The Amount Of Negative Tweets: ',negative) 
print('The Amount Of Neutral Tweets: ',neutral)
print('Polarity: ', polarity)

The Amount of Positive Tweets:  10251
The Amount Of Negative Tweets:  4093
The Amount Of Neutral Tweets:  17618
Polarity:  2602.815369745995
