<a href="https://colab.research.google.com/github/JustinPark845/Natural-Language-Processing-Final-Project/blob/main/All.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; later cells read and write project
# files under /content/drive/MyDrive/nlp_project.
from google.colab import drive
drive.mount('/content/drive')

# **SPLIT DATA**

In [None]:
import ast
# Parse the saved Python literal into text_to_label; later cells slice it and
# read the "text" and "label" keys of each entry.
# The context manager closes the file even if reading or parsing raises.
with open("/content/drive/MyDrive/nlp_project/text_to_label.txt", "r") as f:
  text_to_label = ast.literal_eval(f.read())

In [None]:
# Positional 80/20 split: the first 80% of the records (in stored order) are
# the training set, the remainder is the test set. Texts and labels are kept
# as parallel lists.
_cut = round(len(text_to_label)*.80)
_train_records, _test_records = text_to_label[:_cut], text_to_label[_cut:]

text_data_train = [rec["text"] for rec in _train_records]
text_label_train = [rec["label"] for rec in _train_records]

text_data_test = [rec["text"] for rec in _test_records]
text_label_test = [rec["label"] for rec in _test_records]

In [None]:
# Create csv
import csv

header = ['text','label']


def _write_split_csv(path, records):
    """Write ``records`` to ``path`` as a (text, label) CSV with a header row.

    Args:
        path: Destination file path.
        records: Iterable of dicts with "text" and "label" keys.

    Negative labels are written as 0; every other label is written unchanged.
    """
    # newline='' is what the csv module asks for: it lets the writer control
    # line endings, so newlines embedded in the text are quoted correctly and
    # no extra '\r' is inserted on any platform.
    with open(path, 'w', encoding='UTF8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(
            [rec["text"], 0 if rec["label"] < 0 else rec["label"]]
            for rec in records
        )


# First 80% of the records go to the train file, the rest to the test file.
split_at = round(len(text_to_label)*.80)
_write_split_csv('/content/drive/MyDrive/nlp_project/text_to_label_train.csv', text_to_label[:split_at])
_write_split_csv('/content/drive/MyDrive/nlp_project/text_to_label_test.csv', text_to_label[split_at:])

# **CLASSIFIERS**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Build bag-of-words counts over the training texts followed by the test
# texts, so rows follow the same order as the records in text_to_label.
# NOTE(review): the vocabulary is therefore fit on the test texts as well —
# confirm this is intended, since it lets test data influence the features.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(text_data_train + text_data_test)

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
# Re-weight the raw counts with TF-IDF. Despite the "train" in the name, the
# result covers train and test rows, because X_train_counts was built from both.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [None]:
import numpy as np

# for later testing
# Densify the TF-IDF matrix once and slice it, rather than calling .toarray()
# a second time (each call materialises the full dense matrix).
# nptest keeps every row (train + test) so it can be indexed by a record's
# position in text_to_label; npdata is the leading 80% used for training.
nptest = X_train_tfidf.toarray()
npdata = nptest[:round(len(text_to_label)*.80)]
nptarget = text_label_train

In [None]:
from sklearn.model_selection import cross_validate

# These are the scoring metrics we will consider
# NOTE(review): nothing in the visible cells passes this list (or
# cross_validate) to anything — confirm it is still needed.
scoring_metrics = ['accuracy', 'precision', 'recall', 'f1']

In [None]:
# Naive Bayes
from sklearn.naive_bayes import GaussianNB

# Fit Gaussian Naive Bayes on the dense TF-IDF rows of the training split
# (npdata) against the training labels (nptarget).
gnb = GaussianNB()
gnb.fit(npdata,nptarget)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Three more baseline classifiers, each fit on the dense TF-IDF training rows
# (npdata) and training labels (nptarget).
knn = KNeighborsClassifier(n_neighbors=5)
# Seed the tree the same way as LinearSVC below, so re-running the notebook
# produces the same fitted tree.
dtree = tree.DecisionTreeClassifier(random_state=0)
# Standardise the features before the linear SVM.
lsvc = make_pipeline(StandardScaler(), LinearSVC(random_state=0, tol=1e-5))

knn.fit(npdata,nptarget)
dtree.fit(npdata,nptarget)
lsvc.fit(npdata,nptarget)

# **BERT**

In [None]:
!pip install transformers

In [None]:
from transformers import BertConfig, BertModel, AutoTokenizer

# Load the BERT weights stored in the project folder on Drive.
# NOTE(review): BertModel is the bare encoder class, but later code calls
# model(input_ids, labels) and unpacks (loss, logits), which looks like the
# BertForSequenceClassification interface — confirm which class the saved
# checkpoint is meant to be loaded with.
model = BertModel.from_pretrained('/content/drive/MyDrive/nlp_project')

In [None]:
import torch 
import numpy as np

# NOTE(review): this is the DistilBERT tokenizer, while the model is loaded
# through BertModel from a local directory — confirm the two share a vocabulary.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# **SENTIMENT**

In [None]:
# Install the sentiment libraries (Flair, spaCy + spaCyTextBlob, VADER, pysentiment2)
!pip install flair
!pip install spacy==3.4
!pip install spacytextblob
!pip install vaderSentiment
!pip install pysentiment2

In [None]:
import flair
import spacy
from flair.models import TextClassifier
from flair.data import Sentence
from spacytextblob.spacytextblob import SpacyTextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pysentiment2 as ps

#Load classifier for English
# These objects are created once here and reused by combined_FSV on every call.
flair_sentiment = TextClassifier.load('en-sentiment')
# spaCy pipeline with the 'spacytextblob' component added; combined_FSV reads
# the resulting doc._.polarity value.
spacy_sentiment = spacy.load('en_core_web_sm')
spacy_sentiment.add_pipe('spacytextblob')
# pysentiment2 HIV4 analyzer; combined_FSV uses its tokenize() and get_score().
hiv4 = ps.HIV4()

#Combined function: returns results in the order [Flair, Spacy, Vader, Pysentiment2]

# Shared VADER analyzer, created on first use by combined_FSV so it is not
# rebuilt for every text.
_vader_analyzer = None

def combined_FSV(n):
  """Score the text ``n`` with four sentiment tools.

  Args:
    n: The text to analyse.

  Returns:
    A list ``[flair, spacy, vader, pysentiment2]``; each entry is 1 when the
    tool rates the text positive and -1 otherwise.

  Raises:
    AssertionError: If Flair returns a label other than POSITIVE or NEGATIVE.
  """
  global _vader_analyzer

  #Flair
  s = flair.data.Sentence(n)
  flair_sentiment.predict(s)
  total_sentiment = s.labels[0]
  assert total_sentiment.value in ['POSITIVE', 'NEGATIVE']
  val_f = 1 if total_sentiment.value == 'POSITIVE' else -1

  #Spacy: a polarity of exactly 0 counts as -1
  val_s = 1 if spacy_sentiment(n)._.polarity > 0 else -1

  #Vader: build the analyzer once and reuse it on later calls
  if _vader_analyzer is None:
    _vader_analyzer = SentimentIntensityAnalyzer()
  # Compound scores at or below 0.05 count as -1
  val_v = 1 if _vader_analyzer.polarity_scores(n)['compound'] > 0.05 else -1

  #Pysentiment2: a tie between the Positive and Negative scores counts as 1
  score = hiv4.get_score(hiv4.tokenize(n))
  val_py = 1 if score['Positive'] >= score['Negative'] else -1

  return [val_f, val_s, val_v, val_py]

# **PUT IT TOGETHER**

In [None]:
# One entry per test record: (feature_list, gold_label), where feature_list is
# [flair, spacy, vader, pysentiment2, bert, gnb, knn, dtree, lsvc].
# NOTE: `all` shadows the builtin; the name is kept because the cell that
# writes all.txt reads it.
all = []

test_start = round(len(text_to_label)*.80)
# Run up to len(text_to_label) so the last test record is included; stopping
# at len(text_to_label)-1 silently dropped it.
for i in range(test_start, len(text_to_label)):
  text = text_to_label[i]["text"]

  # Sentiment
  features = list(combined_FSV(text))

  # Bert
  input_ids = torch.tensor(tokenizer.encode(text, add_special_tokens=True, truncation = True)).unsqueeze(0)  # Batch size 1
  labels = torch.tensor([1]).unsqueeze(0)
  # Inference only, so skip autograd bookkeeping.
  # NOTE(review): labels is passed positionally as the second argument —
  # confirm that is the parameter the loaded model expects in that position.
  with torch.no_grad():
    outputs = model(input_ids,labels)
  loss, logits = outputs[:2]
  predictions = np.argmax(logits.detach().numpy(), axis=-1)
  features.append(predictions[0])

  # Classifiers: nptest holds train + test rows, so it is indexed by the
  # record's position in text_to_label.
  row = [nptest[i]]
  features.extend(clf.predict(row)[0] for clf in (gnb, knn, dtree, lsvc))

  all.append((features,text_to_label[i]["label"]))

In [None]:
# Persist the (features, label) list as its Python repr. The context manager
# closes the file even if the write raises.
with open("/content/drive/MyDrive/nlp_project/all.txt", "w") as f:
  f.write(str(all))