In [None]:
# Install/upgrade spaCy, download its small English model, and install langdetect.
# NOTE(review): versions are not pinned, so a fresh run may pull different releases.
!pip install spacy --upgrade
!python -m spacy download en_core_web_sm
!pip install langdetect

# Import Dependencies

In [None]:
import spacy
import en_core_web_sm
import numpy as np
import pandas as pd
import re
import tensorflow as tf

from langdetect import detect
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from google.colab import drive

In [None]:
# Report the TensorFlow version available in this runtime.
print('TensorFlow version: ', tf.__version__)

In [None]:
# Mount Google Drive at /content/drive; the dataset CSV is read from that mount.
drive.mount('/content/drive')

In [None]:
# Column names for the header-less CSV, listed in file order.
cols = ["sentiment", "id", "date", "query", "user", "text"]

# The file has no header row, so the names above are supplied explicitly.
data = pd.read_csv(
    '/content/drive/MyDrive/training.1600000.processed.noemoticon.csv',
    names=cols,
    header=None,
    encoding='latin1',
    engine='python',
)

In [None]:
# Preview the first five rows of the loaded frame.
data[:5]

In [None]:
# Keep only the label and the tweet text. Selecting the wanted columns (rather
# than dropping the others in place) leaves the same two-column frame but makes
# the cell idempotent: re-running it no longer raises KeyError because the
# dropped columns are already gone.
data = data[["sentiment", "text"]]
data[:5]

In [None]:
# Distinct sentiment codes and the number of rows carrying each one.
np.unique(data['sentiment'], return_counts=True)

In [None]:
# Pull the column at position 1 and the sentiment labels out of the frame for a
# quick look at their contents.
# NOTE(review): x and y are reassigned from `data` in the following cell, so
# these values only feed the two prints here.
x = data.iloc[:,1].values
y = data['sentiment'].values

print(x)

print(y)

In [None]:
# Seed for both splits so the subsample and the train/test partition are the
# same on every Restart & Run All (the original calls were unseeded).
SPLIT_SEED = 1

x = data['text']
y = data['sentiment']

# Keep a 3% subsample of the corpus (the other 97% is discarded) to keep the
# spaCy preprocessing and model training times manageable.
x, _, y, _ = train_test_split(x, y, test_size=0.97, random_state=SPLIT_SEED)

# 80/20 train/test split of the subsample.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=SPLIT_SEED
)

In [None]:
# Report the size of every split produced above.
for split_name, split in (
    ('tweets training', x_train),
    ('tweets testing', x_test),
    ('sentiment training', y_train),
    ('sentiment testing', y_test),
):
  print(split_name, split.shape)

# Preprocess

In [None]:
# Load spaCy's small English pipeline once; preprocess() and lemmatize() both
# call this shared `nlp` object.
nlp = spacy.load('en_core_web_sm')

# Twitter handles may contain underscores; without "_" in the class a handle
# such as "@user_name" left a stray "_name" token behind.
_MENTION_RE = re.compile(r"@[A-Za-z0-9_]+")
# Match the URL up to the next whitespace so paths/queries containing
# "-", "_", "?", "=", "&", ... are removed completely.
_URL_RE = re.compile(r"https?://\S+")


def preprocess(sentence: str) -> str:
  """Normalize a tweet and return its informative tokens as one string.

  Steps: lowercase, replace @mentions and http(s) URLs with a space, delete
  dots, tokenize with the module-level spaCy pipeline `nlp`, and drop stop
  words, number-like tokens, punctuation, whitespace tokens and
  single-character tokens.

  Args:
    sentence: Raw tweet text.

  Returns:
    The kept token texts joined by single spaces (empty string if nothing
    survives the filters).
  """
  sentence = sentence.lower()
  sentence = _MENTION_RE.sub(' ', sentence)
  sentence = _URL_RE.sub(' ', sentence)
  # Dots are deleted (not replaced by a space), after URL removal so that the
  # URL pattern still sees the dots inside host names.
  sentence = sentence.replace('.', '')

  kept = [
      token.text
      for token in nlp(sentence)
      if not (token.is_stop or token.like_num or token.is_punct
              or token.is_space or len(token) == 1)
  ]
  return ' '.join(kept)


In [None]:
# Clean every training tweet, keeping the order of x_train.
x_train_cleaned = [preprocess(tweet) for tweet in x_train]

print('\n', len(x_train_cleaned))


 38400


In [None]:
# Clean every test tweet with the same function used for the training set.
x_test_cleaned = [preprocess(tweet) for tweet in x_test]

print('\n', len(x_test_cleaned))


 9600


# Lemmatize

In [None]:
def lemmatize(s: str) -> str:
  """Replace every token of `s` with its lemma.

  The text is run through the module-level spaCy pipeline `nlp`.

  Args:
    s: Text to lemmatize (here: an already cleaned tweet).

  Returns:
    The lemmas of all tokens joined by single spaces.
  """
  return ' '.join(token.lemma_ for token in nlp(s))

In [None]:
# Lemmatize each cleaned training tweet, preserving order.
x_train_lemma = list(map(lemmatize, x_train_cleaned))

print('\n', len(x_train_lemma))


 38400


In [None]:
# Lemmatize each cleaned test tweet, preserving order.
x_test_lemma = list(map(lemmatize, x_test_cleaned))

print('\n', len(x_test_lemma))



 9600


In [None]:
# Fit the TF-IDF vectorizer on the lemmatized training tweets only and
# transform them in the same step.
vectorizer = TfidfVectorizer()

x_train_tfidf = vectorizer.fit_transform(x_train_lemma)

In [None]:
# Shape of the training matrix: one row per training tweet, one column per term.
print(x_train_tfidf.shape)

(38400, 30012)


In [None]:
# Vectorize the test tweets with the vectorizer already fitted on the training
# data (transform only, no re-fitting).
x_test_tfidf = vectorizer.transform(x_test_lemma)
print(x_test_tfidf.shape)

(9600, 30012)


# scikit-learn Multi-layer Perceptron Classifier

https://scikit-learn.org/stable/auto_examples/neural_networks/plot_mlp_alpha.html#:~:text=Alpha%20is%20a%20parameter%20for,that%20appears%20with%20lesser%20curvatures.



- **alpha**: the strength of the L2 regularization (penalty) term, which combats overfitting by constraining the size of the weights. Increasing alpha may fix high variance (a sign of overfitting) by encouraging smaller weights, resulting in a decision boundary with less curvature.
- **hidden_layer_sizes**: sets the number of hidden layers and the number of nodes in each of them. The i-th element of the tuple is the number of nodes in the i-th hidden layer, so the length of the tuple is the total number of hidden layers in the network.
- **max_iter**: the maximum number of iterations; for stochastic solvers such as `adam` this is the number of epochs.
- **activation**: the activation function for the hidden layers.
- **solver**: the algorithm used to optimize the weights.
- **random_state**: sets the seed so that the same results can be reproduced.

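A common rule of thumb for an upper bound on the number of hidden neurons:

$$N_h = \frac{N_s}{\alpha \cdot (N_i + N_o)}$$

- $N_h$ = number of hidden neurons.
- $N_i$ = number of input neurons.
- $N_o$ = number of output neurons.
- $N_s$ = number of samples in the training data set.
- $\alpha$ = an arbitrary scaling factor, usually 2-10 (not the same as the `alpha` regularization parameter above).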

In [None]:
from sklearn.neural_network import MLPClassifier

# Hyper-parameters of the multi-layer perceptron (described in the notes above).
mlp_params = dict(
    alpha=1e-5,
    hidden_layer_sizes=(7, 3),
    max_iter=200,
    activation='relu',
    solver='adam',
    random_state=1,
)
classifier = MLPClassifier(**mlp_params)

# Train on the TF-IDF features of the training tweets.
classifier.fit(x_train_tfidf, y_train)

In [None]:
import matplotlib.pyplot as plt

# Training loss recorded by the MLP after each iteration. Title and axis labels
# are set so the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(classifier.loss_curve_)
ax.set(title='MLP training loss', xlabel='Iteration', ylabel='Loss')
plt.show()

In [None]:
# One-element slice of the test tweets, used as a spot check.
x_test[17:18]

In [None]:
# MLP prediction for the same 17:18 slice of the test TF-IDF matrix.
classifier.predict(x_test_tfidf[17:18])

# Confusion Matrix

In [None]:
# Predict a sentiment label for every test tweet with the trained MLP.
predictions = classifier.predict(x_test_tfidf)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Score the MLP predictions against the true test labels.
mlp_accuracy = accuracy_score(y_test, predictions)
cm = confusion_matrix(y_test, predictions)

print(mlp_accuracy)
print('\n')
print(cm)

0.6816666666666666


[[3382 1459]
 [1597 3162]]


In [None]:

# Per-class precision, recall and F1 for the MLP predictions.
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.68      0.70      0.69      4841
           4       0.68      0.66      0.67      4759

    accuracy                           0.68      9600
   macro avg       0.68      0.68      0.68      9600
weighted avg       0.68      0.68      0.68      9600



# spaCy

In [None]:
# List the components of the loaded English pipeline.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [None]:
# Illustrative sample of the structure built below: [text, {category: bool}].
example = [["this is a positive text", {"POSITIVE": True, "NEGATIVE": False}],
           ["this is a negative text", {"POSITIVE": False, "NEGATIVE": True}]]

# Dataset sentiment codes mapped to text-categorizer categories
# (4 -> POSITIVE, 0 -> NEGATIVE).
SENTIMENT_TO_CATS = {
    4: {'POSITIVE': True, 'NEGATIVE': False},
    0: {'POSITIVE': False, 'NEGATIVE': True},
}

x_train_spacy = []
for text, sentiment in zip(x_train_lemma, y_train):
  if sentiment not in SENTIMENT_TO_CATS:
    # The if/elif chain used before silently reused the previous row's
    # categories for any other label (or raised NameError on the first row).
    raise ValueError(f'Unexpected sentiment label: {sentiment!r}')
  # Copy so that every row owns an independent dict.
  x_train_spacy.append([text, dict(SENTIMENT_TO_CATS[sentiment])])

x_train_spacy[0:5]

In [None]:
# Start from an empty English pipeline and attach a text categorizer to it.
classifier_spacy = spacy.blank('en')
textcat = classifier_spacy.add_pipe('textcat')

# Register the two categories the model has to predict.
for label in ('POSITIVE', 'NEGATIVE'):
  textcat.add_label(label)

textcat.label_data

('POSITIVE', 'NEGATIVE')

In [None]:
from spacy.training import Example
import random

N_EPOCHS = 20      # passes over the training data
BATCH_SIZE = 1024  # examples per weight update

# Language.begin_training() is deprecated in spaCy v3; initialize() is its
# replacement. It sets up the model weights and returns the optimizer, which
# is passed to update() explicitly below.
optimizer = classifier_spacy.initialize()

for epoch in range(N_EPOCHS):
  # Shuffled in place so each epoch sees the examples in a different order.
  random.shuffle(x_train_spacy)

  # Loss accumulated over all batches of this epoch.
  losses = {}
  for batch in spacy.util.minibatch(x_train_spacy, BATCH_SIZE):
    # Each item is [text, cats]; wrap it as an Example whose reference
    # annotation carries the category dict under the 'cats' key.
    examples = [
        Example.from_dict(classifier_spacy.make_doc(text), {'cats': cats})
        for text, cats in batch
    ]

    classifier_spacy.update(examples, sgd=optimizer, losses=losses)

  print(losses)

In [None]:
# Score the test tweets as a batched stream with pipe() instead of calling the
# pipeline once per text; each entry is the {category: score} dict of one tweet.
predictions = [doc.cats for doc in classifier_spacy.pipe(x_test_lemma)]

# Show only a small sample; printing every prediction floods the output.
print(predictions[:5])

In [None]:
# Collapse each category-score dict into the dataset's label codes:
# 4 when POSITIVE scores strictly higher than NEGATIVE, otherwise 0.
predictions2 = np.array([
    4 if cats['POSITIVE'] > cats['NEGATIVE'] else 0
    for cats in predictions
])

print(predictions2)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Score the spaCy predictions against the true test labels.
spacy_accuracy = accuracy_score(y_test, predictions2)
cm = confusion_matrix(y_test, predictions2)

print(spacy_accuracy)
print('\n')
print(cm)


In [None]:
# Per-class precision, recall and F1 for the spaCy text categorizer.
print(classification_report(y_test, predictions2))

              precision    recall  f1-score   support

           0       0.70      0.69      0.69      4841
           4       0.69      0.69      0.69      4759

    accuracy                           0.69      9600
   macro avg       0.69      0.69      0.69      9600
weighted avg       0.69      0.69      0.69      9600

