In [1]:
# Mount Google Drive into the Colab filesystem; the dataset CSVs read below
# live under /content/drive/MyDrive/dataset/.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score
)

from keras.models import Sequential
from keras import layers
from sklearn.model_selection import train_test_split

In [3]:
# Training split. `names` supplies the column labels used by every later cell.
# NOTE(review): this assumes the CSV has no header row of its own; if it does,
# the header would be read as a data row -- confirm against the file.
df = pd.read_csv('/content/drive/MyDrive/dataset/agr_en_train.csv', names=['comment_text', 'label'])

In [4]:
# Drop rows labelled 'CAG' so only the two classes listed in `data_classes`
# remain. `.copy()` makes the result an independent frame: the label column is
# overwritten in a later cell, and assigning into a filtered slice can raise
# SettingWithCopyWarning.
df = df[df.label != 'CAG'].copy()

In [5]:
# Held-out (dev) split, loaded with the same two column names as the training frame.
# NOTE(review): like the training file, this assumes no header row -- confirm.
df_test = pd.read_csv('/content/drive/MyDrive/dataset/agr_en_dev.csv', names=['comment_text', 'label'])

In [6]:
# Same filtering as the training frame: remove 'CAG' rows. `.copy()` detaches
# the result from the original frame so the later in-place label encoding does
# not trigger SettingWithCopyWarning.
df_test = df_test[df_test.label != 'CAG'].copy()

In [7]:
# Label encoding: a label's position in this list is its integer class id
# ("NAG" -> 0, "OAG" -> 1).
data_classes = [ "NAG", "OAG"]
# list.index raises ValueError for any value not in `data_classes`. That also
# means this cell fails if re-run, because the column then holds ints rather
# than the original strings.
df['label'] = df['label'].apply(data_classes.index)

In [8]:
# Encode the dev labels with the same `data_classes` ordering as the training
# labels. Raises ValueError on any label missing from `data_classes`, so this
# cell is not re-runnable once the column has been converted to ints.
df_test['label'] = df_test['label'].apply(data_classes.index)

In [None]:
!pip install --quiet spacy

In [9]:
import spacy
import tqdm
from typing import List, Tuple

# Blank English spaCy pipeline; only its `.tokenizer` is used (see tokenize_spacy).
spacy_nlp = spacy.blank("en")


def tokenize_spacy(text: str) -> List[str]:
  """Split ``text`` with the SpaCy tokenizer and return the tokens as plain strings."""
  return list(map(str, spacy_nlp.tokenizer(text)))


def tokenize(text: str) -> List[str]:
  """Tokenize ``text`` with the notebook's tokenizer (delegates to ``tokenize_spacy``)."""
  tokens = tokenize_spacy(text)
  return tokens

In [10]:
!pip install pystemmer

Collecting pystemmer
  Downloading PyStemmer-2.2.0.1.tar.gz (303 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m303.0/303.0 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pystemmer
  Building wheel for pystemmer (setup.py) ... [?25l[?25hdone
  Created wheel for pystemmer: filename=PyStemmer-2.2.0.1-cp310-cp310-linux_x86_64.whl size=579735 sha256=f06e14e9054c09badac690c703f4712524e3c196cf83100f6fec0c4ffc7ee2b6
  Stored in directory: /root/.cache/pip/wheels/45/7d/2c/a7ebb8319e01acc5306fa1f8558bf24063d6cec2c02de330c9
Successfully built pystemmer
Installing collected packages: pystemmer
Successfully installed pystemmer-2.2.0.1


In [11]:
import Stemmer
from typing import List

def stem(tokens: List[str]) -> List[str]:
  """Lowercase every token, then reduce each one to its English stem.

  Args:
    tokens: token strings, e.g. the output of ``tokenize``.

  Returns:
    A list of stems, one per input token, in the same order.
  """
  english_stemmer = Stemmer.Stemmer("english")
  return english_stemmer.stemWords([word.lower() for word in tokens])

In [12]:
from tqdm import tqdm

# Flat lists of every token in the training comments: raw and stemmed.
all_tokens = []
all_tokens_stemmed = []
for doc in tqdm(df['comment_text']):
  doc_tokens = tokenize(doc)
  # Tokenize once, then feed the same tokens to both accumulators.
  all_tokens.extend(doc_tokens)
  all_tokens_stemmed.extend(stem(doc_tokens))

#print("Original unique tokens: {:,}".format(len(set(all_tokens))))
#print("Stemmed  unique tokens: {:,}".format(len(set(all_tokens_stemmed))))

100%|██████████| 7759/7759 [00:05<00:00, 1457.17it/s]


In [13]:
def preprocess(text: str) -> List[str]:
  """Turn raw ``text`` into model-ready tokens: tokenize, then lowercase and stem."""
  return stem(tokenize(text))

In [14]:
from collections import Counter
import matplotlib.pyplot as plt

# Fail fast if the token-collection cell was skipped or produced nothing.
assert all_tokens
assert all_tokens_stemmed

# Frequency table over the stemmed tokens, plus its 50 most frequent entries.
token_counts = Counter()
token_counts.update(all_tokens_stemmed)
most_common = token_counts.most_common(50)

In [15]:
# Vocabulary limits: at most VOCAB_SIZE entries, each occurring strictly more
# than MIN_COUNT times.
VOCAB_SIZE = 5000
MIN_COUNT = 5

# most_common() is ordered by descending frequency, so the slice keeps the
# most frequent tokens that pass the count filter.
vocab_list = [
    token
    for token, count in token_counts.most_common()
    if MIN_COUNT < count
][:VOCAB_SIZE]

In [16]:
class Vocabulary:
  """Two-way mapping between token strings and integer indices.

  Index 0 is reserved for ``unk_token``; the supplied tokens follow in the
  order given (index 1, 2, ...). Any token that is not in the vocabulary maps
  to ``unk_index``.
  """

  def __init__(self, tokens, unk_token="<unk>"):
    """Build the vocabulary.

    Args:
      tokens: iterable of token strings (list, tuple, generator, ...).
      unk_token: placeholder stored at index 0 for out-of-vocabulary tokens.
    """
    self.unk_token = unk_token
    self.unk_index = 0
    # Unpacking accepts any iterable, not only lists.
    self._itos = [unk_token, *tokens]
    self._stoi = {}
    for index, token in enumerate(self._itos):
      # Keep the FIRST index of a repeated token so that unk_token always
      # resolves to unk_index, even if it also appears inside `tokens`.
      self._stoi.setdefault(token, index)

  def stoi(self, token: str) -> int:
    """Return the index of ``token``, or ``unk_index`` if it is unknown."""
    return self._stoi.get(token, self.unk_index)

  def itos(self, index: int) -> str:
    """Return the token stored at ``index`` (IndexError if out of range)."""
    return self._itos[index]

  @property
  def tokens(self):
    """The index-ordered token list, including ``unk_token`` at position 0."""
    return self._itos

  def __len__(self) -> int:
    """Number of entries, counting the ``unk_token`` slot."""
    return len(self._itos)

In [17]:
# Unigram vocabulary over the filtered stemmed tokens (index 0 is "<unk>").
# NOTE(review): not referenced again in the visible cells; the n-gram
# vocabulary `ngrams_vocab` is what the features use -- confirm it is needed.
vocab = Vocabulary(vocab_list)

In [18]:
def extract_ngrams(tokens: List[str], max_n: int, min_n: int = 1) -> List[str]:
    """Return every n-gram of ``tokens`` with ``min_n <= n <= max_n`` as a list.

    Eager counterpart of ``extract_ngrams_iter``; each n-gram is its tokens
    joined by single spaces.
    """
    return [ngram for ngram in extract_ngrams_iter(tokens, max_n, min_n)]


from typing import Iterator


def extract_ngrams_iter(tokens: List[str], max_n: int, min_n: int = 1) -> Iterator[str]:
    """Lazily yield every n-gram of ``tokens`` with ``min_n <= n <= max_n``.

    N-grams are produced grouped by start position (all lengths starting at
    token 0, then token 1, ...), shortest first within each position. Each
    n-gram is its tokens joined by single spaces.

    Args:
        tokens: the token sequence to window over.
        max_n: largest n-gram length to emit.
        min_n: smallest n-gram length to emit (default 1).

    Yields:
        One space-joined n-gram string at a time.
    """
    total = len(tokens)
    for start in range(total):
        # Cap the length so the window never runs past the end of `tokens`;
        # this replaces a per-length bounds check inside the inner loop.
        longest = min(max_n, total - start)
        for n in range(min_n, longest + 1):
            yield " ".join(tokens[start : start + n])

In [19]:
# Count every 1- to 3-gram of the preprocessed training comments.
counts = Counter()
for doc in tqdm(df['comment_text']):
  tokens = preprocess(doc)
  ngrams = extract_ngrams(tokens, 3)
  counts.update(ngrams)

# Keep n-grams seen more than MIN_COUNT times, most frequent first, capped at
# VOCAB_SIZE entries, and wrap them in a Vocabulary (index 0 is the unk slot).
ngrams_vocab = Vocabulary(
    [token for token, count in counts.most_common() if MIN_COUNT < count][:VOCAB_SIZE]
)

100%|██████████| 7759/7759 [00:02<00:00, 3879.36it/s]


In [20]:
def bag_of_ngrams(tokens: List[str], vocab: Vocabulary, max_n: int = 3) -> List[int]:
  """Build a count vector of the n-grams in ``tokens`` over ``vocab``.

  Args:
    tokens: preprocessed tokens of one document.
    vocab: n-gram vocabulary; its length fixes the vector size.
    max_n: longest n-gram to extract. It should match the order used when
      ``vocab`` was built; the default of 3 keeps the previous behaviour.

  Returns:
    A list of ``len(vocab)`` ints where position ``i`` is the number of
    extracted n-grams mapping to index ``i``. N-grams absent from ``vocab``
    are all counted at the index ``vocab.stoi`` returns for unknown tokens.
  """
  result = [0] * len(vocab)
  for ngram in extract_ngrams(tokens, max_n):
    result[vocab.stoi(ngram)] += 1

  return result

In [25]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

def evaluate(model, X=None, y=None):
  """Score a fitted classifier and return its headline metrics.

  Args:
    model: fitted estimator exposing ``predict``.
    X: feature matrix to score on; defaults to the notebook-level ``X_test``.
    y: true labels for ``X``; defaults to the notebook-level ``y_test``.

  Returns:
    Dict with ``accuracy`` plus support-weighted ``precision``, ``recall``
    and ``f1``.
  """
  if X is None:
    X = X_test
  if y is None:
    y = y_test

  y_hat = model.predict(X)
  accuracy = (y_hat == y).mean()

  # No pos_label here: labels are the ints 0/1, and scikit-learn ignores
  # pos_label (with a warning) whenever average != 'binary'.
  return {
      "accuracy": accuracy,
      "precision": precision_score(y, y_hat, average='weighted'),
      "recall": recall_score(y, y_hat, average='weighted'),
      "f1": f1_score(y, y_hat, average='weighted'),
  }


In [21]:
import numpy as np


# Training features: one bag-of-n-grams count row per comment.
X_list = [
    bag_of_ngrams(preprocess(doc), ngrams_vocab)
    for doc in df['comment_text']
]
# Training targets: the integer-encoded labels, in row order.
y_list = list(df['label'])
X = np.array(X_list)
y = np.array(y_list)


In [22]:
# Dev-split features, built with the vocabulary fitted on the training split.
X_list = [
    bag_of_ngrams(preprocess(doc), ngrams_vocab)
    for doc in df_test['comment_text']
]
# Dev-split targets, in row order.
y_list = list(df_test['label'])
X_test = np.array(X_list)
y_test = np.array(y_list)

In [23]:
from sklearn.linear_model import LogisticRegression

# L1-regularised logistic regression on the n-gram counts.
# random_state is pinned because the liblinear solver shuffles the data, so
# an unseeded fit is not exactly repeatable across runs.
model = LogisticRegression(solver='liblinear', C=0.5, penalty="l1", random_state=42)
model.fit(X, y)

In [26]:
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

# Entropy-based decision tree on the n-gram counts.
# random_state is pinned because scikit-learn permutes the candidate features
# at every split, so ties between equally good splits would otherwise be
# broken differently from run to run.
dtree = DecisionTreeClassifier(criterion='entropy', random_state=42)
dtree = dtree.fit(X, y)

In [27]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings, fitted on the same dense
# n-gram count matrix as the other two models.
model1 = GaussianNB()
model1.fit(X, y)

In [28]:
def avarege_model(model1, model2, model3, X=None, weights=(77, 70, 75)):
  """Combine three fitted binary classifiers by weighted voting.

  Each model predicts a 0/1 label per row of ``X``. For every row, the
  weighted mean of the three votes is computed and the ensemble predicts 1
  when that mean exceeds 0.5, else 0.

  Args:
    model1, model2, model3: fitted estimators exposing ``predict``.
    X: feature matrix to predict on; defaults to the notebook-level ``X_test``.
    weights: one vote weight per model, in argument order. The default
      reproduces the original hard-coded 77/70/75 weighting.
      NOTE(review): these look like per-model accuracy percentages -- confirm.

  Returns:
    A list of 0/1 ints, one per row of ``X``.

  Raises:
    ValueError: if ``weights`` does not hold exactly three values.
  """
  if X is None:
    X = X_test

  models = (model1, model2, model3)
  if len(weights) != len(models):
    raise ValueError("weights must contain exactly one value per model")

  per_model = [m.predict(X) for m in models]
  total = sum(weights)

  y_pred = []
  # zip(*per_model) walks the three prediction vectors row by row.
  for votes in zip(*per_model):
    score = sum(w * v for w, v in zip(weights, votes)) / total
    y_pred.append(1 if score > 0.5 else 0)

  return y_pred


In [29]:
# Ensemble predictions on the dev split (a plain list of 0/1 ints).
y_pred = avarege_model(model, dtree, model1)

# Prints accuracy, then support-weighted precision, recall and F1.
# pos_label is omitted: labels are the ints 0/1, and scikit-learn ignores
# pos_label (with a warning) whenever average != 'binary'.
# np.asarray makes the element-wise comparison with y_test explicit instead of
# relying on list-vs-array broadcasting.
print((np.asarray(y_pred) == y_test).mean(),
      precision_score(y_test, y_pred, average='weighted'),
      recall_score(y_test, y_pred, average='weighted'),
      f1_score(y_test, y_pred, average='weighted'),)

0.7854938271604939 0.7822742543428384 0.7854938271604939 0.7804336768347322


