# Sentiment Analysis

The goal of sentiment analysis is to identify whether a text is positive or negative.

In [None]:
!pip install spacy --upgrade

In [2]:
!python -m spacy download en_core_web_sm

In [None]:
!pip install langdetect

## Imports

In [None]:
import spacy
import en_core_web_sm
import pandas as pd
import seaborn as sns
import numpy as np
import re
import random
import csv
import nltk

from langdetect import detect

from google.colab import drive

nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier

# drive.mount('/content/drive')

# Get and process Twitter Data

In [86]:
# Load the tweets CSV from the Colab Drive path. Column names are supplied via
# `names`; rows the parser cannot handle are skipped instead of raising.
train_data = pd.read_csv(
    '/content/drive/MyDrive/train.csv',
    names=['sentiment', 'id', 'date', 'query', 'user', 'text'],
    sep=',',
    quotechar='"',
    encoding='latin1',
    engine="python",
    on_bad_lines='skip',
)


In [None]:
train_data

In [None]:
# class balance: the distinct sentiment labels and how many rows carry each one
np.unique(train_data['sentiment'], return_counts=True)

In [None]:
# bar chart of how many tweets carry each sentiment label
sns.countplot(data=train_data, x='sentiment');


In [None]:
# Keep only the label and the tweet text. Selecting the wanted columns (instead
# of dropping the unwanted ones) keeps this cell safe to re-run: a second
# `drop` would raise KeyError because the columns are already gone.
train_data = train_data[['sentiment', 'text']]
train_data.head()

## Train and Test

The goal here is to:
* separate the tweet text column from the sentiment column
* keep only 3% of the data to save processing time
* split that subsample into training data and testing data (80:20)

In [None]:
# Features are the raw tweet text; labels are the sentiment codes
# (0 = negative, 4 = positive). Columns are selected by name so the result does
# not depend on how many columns `train_data` currently has.
X = train_data['text'].values

Y = train_data['sentiment'].values

print(X)

print(Y)

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so the subsample and the train/test split are identical on every
# run; without it each execution trains and evaluates on different tweets.
SPLIT_SEED = 42

# keep only 3% of the data (the remaining 97% is discarded) to save processing time
x, _, y, _ = train_test_split(X, Y, test_size=0.97, random_state=SPLIT_SEED)

print('tweets',x.shape)
print('sentiment',y.shape)

# split the subsample for ML: 80% training data and 20% testing data
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=SPLIT_SEED
)

print('tweets training',x_train.shape)
print('tweets testing',x_test.shape)
print('sentiment training',y_train.shape)
print('sentiment testing',y_test.shape)

# Preprocess Data set

The goal here is to:
* preprocess the data by removing stop words, punctuation, numbers, whitespace and single-letter tokens
* remove '@username' mentions
* remove links from the data set

In [None]:
# English spaCy pipeline, loaded once and used by the preprocessing helpers.
nlp = spacy.load('en_core_web_sm')

def preprocessing(sentence: str) -> str:
  """Normalise a raw tweet into a space-separated string of kept tokens.

  Steps: lower-case the text, blank out ``@user`` mentions and http(s) links,
  delete full stops, tokenise with the module-level spaCy pipeline ``nlp`` and
  drop stop words, number-like tokens, punctuation, whitespace tokens and
  single-character tokens.

  Args:
    sentence: raw tweet text.

  Returns:
    The surviving token texts joined by single spaces ('' when none survive).
  """
  sentence = sentence.lower()
  # usernames may contain underscores, so '_' has to be part of the mention
  sentence = re.sub(r"@[A-Za-z0-9_]+", ' ', sentence)
  # a link runs up to the next whitespace; a narrower character class would
  # leave fragments behind for URLs containing '-', '_', '?', '=' etc.
  sentence = re.sub(r"https?://\S+", ' ', sentence)
  sentence = sentence.replace('.', '')
  return ' '.join(
      token.text
      for token in nlp(sentence)
      if not (token.is_stop or token.like_num or token.is_punct
              or token.is_space or len(token) == 1)
  )


preprocessing("@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  2 You shoulda got David Carr of Third Day to do it. ;D")

In [None]:
# run every training tweet through `preprocessing`, keeping the original order
x_train_cleaned = [preprocessing(tweet) for tweet in x_train]

print('\n',len(x_train_cleaned))

In [None]:
# run every testing tweet through `preprocessing`, keeping the original order
x_test_cleaned = [preprocessing(tweet) for tweet in x_test]

print('\n',len(x_test_cleaned))

# Word Cloud

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# concatenate all cleaned training tweets into one string (each preceded by a space)
texts = ''.join(' ' + text for text in x_train_cleaned)

# build the cloud from the combined text and show it without axes
cloud = WordCloud().generate(texts)
plt.figure(figsize=(15,15))
plt.imshow(cloud)
plt.axis('off');

# Detecting languages

* The goal here is to detect the languages present in the data set
* Knowing the language helps with removing the right stop words and cleaning the data properly

In [None]:
!pip install langdetect

In [None]:
from langdetect import detect

detect("This is english text")

In [None]:
from langdetect import DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

# langdetect is non-deterministic by default; fix the seed so re-runs agree
DetectorFactory.seed = 0

languages = []

for text in x_test_cleaned:
  # nothing to detect in an empty (fully cleaned-away) tweet
  if not text.strip():
    continue
  try:
    languages.append(detect(text))
  except LangDetectException:
    # raised when the text has no usable features (e.g. only symbols);
    # count it instead of aborting the whole loop
    languages.append('unknown')

np.unique(languages, return_counts=True)

# Sentiment analysis with NLTK

With NLTK we don't have to train our own algorithm.

Advantage:
* very simple to use, and it works out of the box

In [None]:
import nltk
nltk.download('vader_lexicon')


In [124]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [None]:
nltk_classifier = SentimentIntensityAnalyzer()

# score three sample phrases to see the shape of the polarity output
for phrase in ("I love this food", "I hate this food", "I have this food"):
  s = nltk_classifier.polarity_scores(phrase)
  print(s)

In [None]:
print(x_train_cleaned[0])
print("\n")
nltk_classifier.polarity_scores(x_train_cleaned[0])

In [None]:
# Show the scores for a handful of raw test tweets only; printing one line per
# tweet for the whole test set floods the notebook output.
PREVIEW_SIZE = 10

for sentence in x_test[:PREVIEW_SIZE]:
  print(nltk_classifier.polarity_scores(sentence), ' - ', sentence)

# Sentiment analysis with TF-IDF

consider the following banking data -

training data -

|Credit History|Debts|Properties|Annual Income| Risk |
|--------------|-----|----------|------------|------|
|Bad|High|No|<15000|High|
|Good|Low|Yes|>=15000 and <=40000|Low|

testing data -

|Credit History|Debts|Properties|Annual Income|
|--------------|-----|----------|------------|
|Bad|High|Yes|<15000|
|Moderate|Low|No|>=15000 and <=40000|

Classification -
* the data is cleaned
* the data is split into training data and testing data
* the algorithm is trained using the training data
* when the testing data is provided, the trained algorithm should produce similar results

Decision tree -
* The goal is to analyse the training data and create a decision tree
* The attributes are the nodes of the tree, e.g. Credit History, Debts
* The leaf nodes indicate the classes, e.g. low, moderate or high risk
* The algorithm needs to perform mathematical calculations to generate this tree


In [None]:
# checking data

print(x_train_cleaned[0:5])
print("\n")
print(y_train)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# vectorize the training data set
vectorizer = TfidfVectorizer()
x_train_tfidf = vectorizer.fit_transform(x_train_cleaned)

# (number of tweets, vocabulary size), read straight off the sparse matrix:
# calling .toarray() first would allocate the full dense matrix just to get
# its shape, which can exhaust memory on Colab
print(x_train_tfidf.shape)

# the vocabulary itself is too long to print; show only its size
print(len(vectorizer.get_feature_names_out()))

### Lemmatization of training data
* apply lemmatization to reduce the number of columns in the vector
* the columns of the TF-IDF vector are the words/tokens found in the sentences

In [None]:
# lemmetize
def preprocessing_lemma(s: str):
  """Return `s` with each token replaced by its spaCy lemma, joined by spaces."""
  return ' '.join(token.lemma_ for token in nlp(s))

# check function
preprocessing_lemma('learn learned learning')

In [160]:
# lemmatize the cleaned training tweets
x_train_cleaned_lemma = [preprocessing_lemma(tweet) for tweet in x_train_cleaned]


In [None]:
# vectorize the trained data
vectorizer = TfidfVectorizer()
x_train_tfidf = vectorizer.fit_transform(x_train_cleaned_lemma)

#  cloumn is reduced
print(x_train_tfidf.shape)

In [150]:
# lemmatize the cleaned testing tweets
x_test_cleaned_lemma = [preprocessing_lemma(tweet) for tweet in x_test_cleaned]
# NOTE: no new TfidfVectorizer is fitted on the test set here; the test tweets
# are transformed in the next cell with the vectorizer fitted on the training
# data, so both matrices share the same columns.

In [None]:
x_test_tfidf = vectorizer.transform(x_test_cleaned_lemma)
#  cloumn is reduced
print(x_test_tfidf.shape)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# fixed random_state so the fitted tree (and its scores) is the same on every run
classifier = DecisionTreeClassifier(random_state=42)
# train the classifier on the TF-IDF features of the training tweets
classifier.fit(x_train_tfidf, y_train)

In [165]:
predictions = classifier.predict(x_test_tfidf)

In [None]:
predictions


In [None]:
y_test

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

print(accuracy_score(y_test, predictions))
print('\n')
cm = confusion_matrix(y_test, predictions)
print(cm)
# Rows are the true labels and columns the predicted labels, both ordered 0 then 4
# (assuming only labels 0 and 4 occur):
# cm[0][0] = negative tweets predicted negative (correct)
# cm[0][1] = negative tweets predicted positive (wrong)
# cm[1][0] = positive tweets predicted negative (wrong)
# cm[1][1] = positive tweets predicted positive (correct)

In [None]:
# accuracy = correctly classified tweets (the diagonal of the confusion matrix)
# divided by all tweets; computed from `cm` so it always matches the current run
# instead of counts pasted from an earlier one
np.trace(cm) / cm.sum()

In [173]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.67      0.70      0.68      4796
           4       0.69      0.65      0.67      4804

    accuracy                           0.68      9600
   macro avg       0.68      0.68      0.68      9600
weighted avg       0.68      0.68      0.68      9600



# TF-IDF Sentiment analyzer Class

Consolidation of the TF-IDF steps above into a reusable Python class.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

class DataProcessor():
  """Cleans, lemmatizes and TF-IDF-vectorizes tweets.

  The instance owns one ``TfidfVectorizer`` so the vocabulary learned from the
  training data can be reused to transform the testing data.
  """

  def __init__(self, nlp_pipeline=None):
    """Create the processor.

    Args:
      nlp_pipeline: callable that turns a string into spaCy-like tokens.
        Defaults to the module-level ``nlp`` pipeline.
    """
    self.vectorizer = TfidfVectorizer()
    self.nlp = nlp_pipeline if nlp_pipeline is not None else nlp

  def preprocessing(self, sentence: str) -> str:
    """Normalise one raw tweet into a space-separated string of kept tokens.

    Lower-cases the text, blanks out ``@user`` mentions and http(s) links,
    deletes full stops, then drops stop words, number-like tokens,
    punctuation, whitespace tokens and single-character tokens.
    """
    sentence = sentence.lower()
    # usernames may contain underscores, so '_' has to be part of the mention
    sentence = re.sub(r"@[A-Za-z0-9_]+", ' ', sentence)
    # a link runs up to the next whitespace ('-', '?', '=' ... included)
    sentence = re.sub(r"https?://\S+", ' ', sentence)
    sentence = sentence.replace('.', '')
    return ' '.join(
        token.text
        for token in self.nlp(sentence)
        if not (token.is_stop or token.like_num or token.is_punct
                or token.is_space or len(token) == 1)
    )

  def preprocessing_lemma(self, s: str) -> str:
    """Return `s` with each token replaced by its lemma, joined by spaces."""
    return ' '.join(token.lemma_ for token in self.nlp(s))

  def clean(self, data: list) -> list:
    """
      only cleans provided data

      Returns one cleaned string per input tweet, in the same order.
    """
    cleaned = [self.preprocessing(tweet) for tweet in data]

    print('pre cleaned {} \n'.format(len(data)))
    print('cleaned',len(cleaned))

    return cleaned

  def clean_lemmetize(self, data: list) -> list:
    """
      cleans and lemmetize the data

      Returns one cleaned, lemmatized string per input tweet, in the same order.
    """
    return [self.preprocessing_lemma(tweet) for tweet in self.clean(data)]

  def vectorize(self, cleand_data: list, fit: bool = True):
    """Turn cleaned texts into a TF-IDF matrix.

    Args:
      cleand_data: cleaned (and usually lemmatized) texts.
      fit: when True (default) learn the vocabulary from `cleand_data` before
        transforming it -- use this for the training data. Pass False to reuse
        the already learned vocabulary, which is what testing data needs so
        its columns line up with the training matrix.

    Returns:
      The matrix produced by the underlying vectorizer.
    """
    if fit:
      return self.vectorizer.fit_transform(cleand_data)
    return self.vectorizer.transform(cleand_data)



In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# classify and process data
processor = DataProcessor()

# Reuse the existing x_train / x_test / y_train / y_test split rather than
# drawing a new random one here: a fresh split would overwrite y_train / y_test
# while x_train_cleaned_lemma / x_test_cleaned_lemma (used by the spaCy section
# below) still belong to the earlier split, leaving texts and labels misaligned.
x_train_cleaned_lemmatized = processor.clean_lemmetize(x_train)
x_test_cleaned_lemmatized = processor.clean_lemmetize(x_test)

# Learn the vocabulary from the training tweets only ...
x_train_tfidf = processor.vectorize(x_train_cleaned_lemmatized)
# ... and reuse that fitted vectorizer for the test tweets; re-fitting on the
# test data would give the two matrices different columns.
x_test_tfidf = processor.vectorizer.transform(x_test_cleaned_lemmatized)

# make sure both train and test has same column count
print(x_train_tfidf.shape)
print('\n')
print(x_test_tfidf.shape)

classifier = DecisionTreeClassifier()
# train the classifier
classifier.fit(x_train_tfidf, y_train)
# predict
predictions = classifier.predict(x_test_tfidf)

# get classification report
print(classification_report(y_test, predictions))

# Sentiment analysis with spaCy

- Documentation: https://spacy.io/usage/training

In [None]:
# Format spaCy's text categorizer is trained on here:
# [0][0] = text
# [0][1] = categories
example = [["this is a positive text", {"POSITIVE": True, "NEGATIVE": False}],
           ["this is a negative text", {"POSITIVE": False, "NEGATIVE": True}]]

# sentiment code -> category flags (4 = positive tweet, 0 = negative tweet)
SENTIMENT_TO_CATS = {
    4: {'POSITIVE': True, 'NEGATIVE': False},
    0: {'POSITIVE': False, 'NEGATIVE': True},
}

x_train_spacy = []
for text, sentiment in zip(x_train_cleaned_lemma, y_train):
  # fail loudly on an unexpected label rather than reusing the flags of the
  # previous tweet (or hitting an undefined variable on the first one)
  if sentiment not in SENTIMENT_TO_CATS:
    raise ValueError('unexpected sentiment label: {!r}'.format(sentiment))
  # copy so every training item owns its own dict
  x_train_spacy.append([text, dict(SENTIMENT_TO_CATS[sentiment])])

x_train_spacy[0:5]

In [None]:
nlp.pipe_names

In [None]:
# start from an empty English pipeline and attach a text categorizer to it
classifier_spacy = spacy.blank('en')
textcat = classifier_spacy.add_pipe('textcat')

# register the two categories the model has to choose between
for label in ('POSITIVE', 'NEGATIVE'):
  textcat.add_label(label)

textcat.label_data

In [None]:
from spacy.training import Example

N_EPOCHS = 10
# per the original note the training set has 38400 tweets, i.e. 38 batches per epoch
BATCH_SIZE = 1024

# set up the pipeline's model weights (spaCy v3 name for the deprecated
# `begin_training`)
classifier_spacy.initialize()

# several passes (epochs) over the data are needed to train the network
for epoch in range(N_EPOCHS):
  # reshuffle every epoch so the batches differ from pass to pass
  random.shuffle(x_train_spacy)

  losses = {}
  for batch in spacy.util.minibatch(x_train_spacy, BATCH_SIZE):
    # each item is [text, categories]; wrap it as a spaCy training Example
    examples = [
        Example.from_dict(classifier_spacy.make_doc(text), {'cats': cats})
        for text, cats in batch
    ]

    classifier_spacy.update(examples, losses=losses)

  # accumulated loss for this epoch
  print(losses)


In [None]:
# persist the trained pipeline to the local 'classifier_spacy' directory
classifier_spacy.to_disk('classifier_spacy')

# load it back from disk; the reloaded pipeline is what the cells below use
classifier_spacy_loaded = spacy.load('classifier_spacy')
classifier_spacy_loaded

In [None]:
classifier_spacy_loaded('i hate this food').cats

In [None]:
# create predictions in the form {'POSITIVE': score, 'NEGATIVE': score};
# `pipe` processes the texts in batches instead of one call per tweet
predictions = [doc.cats for doc in classifier_spacy_loaded.pipe(x_test_cleaned_lemma)]

# show only a few predictions; printing all of them floods the output
print(predictions[:5])

In [None]:
print(predictions)

In [None]:
# collapse each score pair to a label: 4 when POSITIVE scores higher, else 0
predictions2 = np.array([
    4 if prediction['POSITIVE'] > prediction['NEGATIVE'] else 0
    for prediction in predictions
])

print(predictions2)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# share of test tweets whose predicted label matches the true label
print(accuracy_score(y_test, predictions2))
print('\n')
# rows are the true labels, columns the predicted labels
cm = confusion_matrix(y_test, predictions2)
print(cm)

In [191]:
print(classification_report(y_test, predictions2))

              precision    recall  f1-score   support

           0       0.70      0.70      0.70      4796
           4       0.70      0.70      0.70      4804

    accuracy                           0.70      9600
   macro avg       0.70      0.70      0.70      9600
weighted avg       0.70      0.70      0.70      9600

