In [137]:
class BagOfWordsClassifier():
    def __init__(self):
        self.counts = {}
        self.counts_PCL = {}
        self.counts_not_PCL = {}
        self.PCL_word_count = 0
        self.no_PCL_word_count = 0
        self.PCL_document_count = 0
        self.no_PCL_document_count = 0

    def clean_text_tokenize(self, text):
        stop_words = ['this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there',
                      'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'don', "don't", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'shan', "shan't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]

        text.lower()
        # removing " at start of sentences
        text = text.strip("\"")
        # replacing repetitions of punctations
        text = re.sub(r'\"+', '', text)

        # Tokenize links
        text = re.sub(r'https? : \S+', '[WEBSITE]', text)
        # removing referencing on usernames with @
        text = re.sub(r'@\S+', '', text)
        # removing smileys with : (like :),:D,:( etc)
        text = re.sub(r':\S+', '', text)
        # Remove punctation
        text = re.sub(r"[!.,;:?\'\"\´]", "", text)
        text = re.sub('(?<![\w])20[0-5][0-9]-?[0-9]*',
                    '[YEAR]', text)              # Year token
        text = re.sub('(?<![\w])1[0-9]{3}-?[0-9]*',
                    '[YEAR]', text)                 # Year token
        # replacing numbers with [NUM] tag  eg 1,000, 1.32, 5-7. Assert these numbers are not inside words (i.e. H1, )
        text = re.sub('(?<![\w])[0-9]+[.,]?[0-9]*(?![\w])', '[NUM]', text)
        text = re.sub('\[NUM\]-\[NUM\]', '[NUM]', text)
        # Again to delete account numbers lol 12-5223-231
        text = re.sub('\[NUM\]-\[NUM\]', '[NUM]', text)
        text = re.sub('(?<=\[NUM\])-(?=[a-zA-Z])', ' ', text)
        text = re.sub('[ ]*', ' ', text)
        text = re.sub('<h>', '.', text)

        porter = PorterStemmer()
        words = text.split()
        for i, word in enumerate(words):
            if word in stop_words:
                words.pop(i)
            else:
                words[i] = porter.stem(word)
        return words

    def train(self, train_DF):
        for i, row in train_DF.iterrows():
            text = row["text"]
            label = row["binary_label"]

            if label == 0:
                self.no_PCL_document_count += 1
            else:
                self.PCL_document_count += 1

            words = self.clean_text_tokenize(text)
            for word in words:
                self.counts[word] = 1 + \
                    (self.counts[word] if word in self.counts.keys() else 0)

                if label == 0:
                    self.no_PCL_word_count += 1
                    self.counts_not_PCL[word] = 1 + \
                        (self.counts_not_PCL[word]
                         if word in self.counts_not_PCL.keys() else 0)
                else:
                    self.PCL_word_count += 1
                    self.counts_PCL[word] = 1 + \
                        (self.counts_PCL[word]
                         if word in self.counts_PCL.keys() else 0)
                    
    def predict(self, sentences):

        prior = self.PCL_document_count / \
            (self.PCL_document_count + self.no_PCL_document_count)
        epsilon = 1  # epsilon smoothing
        if type(sentences) is str:
            sentences = [sentences]
        if type(sentences) is pd.DataFrame:
            i, sentences = sentences.iterrows()

        predictions = []
        for sentence in sentences:

            likelihood = 1
            for word in sentence:
                class_count = self.counts_PCL[word] if word in self.counts_PCL.keys(
                ) else 0
                likelihood *= (class_count+epsilon) / \
                    (len(self.counts) + self.PCL_word_count)

            prob_PCL = prior*likelihood

            likelihood = 1
            for word in sentence:
                class_count = self.counts_not_PCL[word] if word in self.counts_not_PCL.keys(
                ) else 0
                likelihood *= (class_count+epsilon) / \
                    (len(self.counts) + self.no_PCL_word_count)

            prob_not_PCL = (1-prior)*likelihood

            predictions.append(1 if prob_PCL > prob_not_PCL else 0)

        return predictions

In [149]:
import pandas as pd
import numpy as np
import os

from config import DATA_FOLDER, DATA_PCL_NAME, DATA_CATEGORIES_NAME
from utils import Utils

df = pd.read_csv(os.path.join(
            os.path.dirname(os.getcwd()),
            DATA_FOLDER,
            DATA_PCL_NAME
        ))

df = df.dropna()

train_id = pd.read_csv(os.path.join(
            os.path.dirname(os.getcwd()),
            DATA_FOLDER,
            "train_semeval_parids-labels.csv"
        ))
dev_id = pd.read_csv(os.path.join(
            os.path.dirname(os.getcwd()),
            DATA_FOLDER,
            "dev_semeval_parids-labels.csv"
        ))

data_pcl_train = df[df["par_id"].isin(train_id["par_id"].tolist())]
data_pcl_dev = df[df["par_id"].isin(dev_id["par_id"].tolist())]

In [None]:
import gensim

# Load Google's pre-trained Word2Vec model.
model = gensim.models.KeyedVectors.load_word2vec_format(os.path.join(os.path.dirname(os.getcwd()),"models", "GoogleNews-vectors-negative300.bin.gz"), binary=True)

In [150]:
def word_emedding_average(text):
    words = text.split()
    embeddings = []
    for word in words:
        if word in model.key_to_index:
            vector = model[word]
            embeddings.append(vector)
    return np.mean(np.array(embeddings), axis=0)

In [151]:
data_pcl_train["length_text"] = data_pcl_train["text"].apply(lambda x: len(x))
data_pcl_train["word_embedding"] = data_pcl_train["text"].apply(lambda x: word_emedding_average(x))
# apply lambda function to create new columns
data_pcl_train[[i for i in range(300)]] = data_pcl_train['word_embedding'].apply(lambda x: pd.Series(x))
# drop the original list column
data_pcl_train.drop(['word_embedding','par_id','art_id','text','label'], axis=1, inplace=True)
data_pcl_train["country_code"] = pd.Categorical(data_pcl_train["country_code"], categories=data_pcl_train["country_code"].unique()).codes
data_pcl_train["keyword"] = pd.Categorical(data_pcl_train["keyword"], categories=data_pcl_train["keyword"].unique()).codes
data_pcl_train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_pcl_train["length_text"] = data_pcl_train["text"].apply(lambda x: len(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_pcl_train["word_embedding"] = data_pcl_train["text"].apply(lambda x: word_emedding_average(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_pcl_train[[i for i in

Unnamed: 0,keyword,country_code,binary_label,length_text,0,1,2,3,4,5,...,290,291,292,293,294,295,296,297,298,299
0,0,0,0.0,620,0.067801,0.035783,0.017348,0.100465,-0.077928,-0.024976,...,-0.059775,0.005240,-0.082189,0.013069,-0.040588,-0.008144,-0.028159,-0.020061,0.077180,-0.023778
1,1,1,0.0,237,0.010394,0.056516,0.035743,0.092903,-0.074452,-0.060100,...,-0.077651,-0.030709,-0.053521,-0.041894,-0.004974,0.063197,-0.021971,-0.014058,0.038138,-0.009066
2,2,2,0.0,162,-0.043460,0.044146,0.052855,0.025080,-0.100702,-0.071572,...,-0.053907,0.004824,0.041070,0.000416,-0.020813,0.019182,0.069169,-0.086346,0.100897,-0.042547
3,3,3,0.0,162,0.010649,0.022886,0.078101,0.084868,-0.089789,-0.084727,...,-0.091961,-0.005387,-0.085330,-0.036522,0.085322,0.037345,0.004543,-0.085313,0.046343,-0.080673
4,4,4,0.0,277,0.038921,0.070032,0.036560,0.089887,-0.029526,-0.045057,...,-0.116035,0.008618,-0.032478,0.083207,0.062838,0.071537,0.017735,-0.027410,0.098935,0.010262
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10407,6,9,1.0,137,0.020680,0.042316,-0.009072,0.079590,-0.054831,0.013768,...,-0.084430,0.078445,-0.118653,-0.000064,-0.081624,0.062847,0.018596,0.007382,0.031353,-0.046667
10423,8,13,1.0,137,0.024166,0.019599,0.047072,0.119496,-0.041599,-0.006340,...,-0.045100,0.062841,-0.084494,0.002842,-0.061343,0.004737,0.051113,-0.055402,0.062909,-0.025461
10444,4,7,1.0,112,0.083260,0.134733,-0.026842,0.070793,-0.009792,0.029056,...,-0.108350,-0.032273,-0.134884,0.025968,0.013818,-0.069849,-0.060660,0.014223,0.094438,-0.034882
10453,7,2,1.0,244,0.001408,0.050710,0.006924,0.079784,-0.098354,-0.007317,...,-0.058295,0.007331,-0.068108,-0.023816,-0.039317,0.004857,-0.002560,0.018707,0.045690,-0.016291


In [152]:
from sklearn.linear_model import LogisticRegression

Y_train = data_pcl_train["binary_label"].astype("float").to_numpy()
X_train = data_pcl_train.drop(['binary_label'], axis=1, inplace=False).astype("float").to_numpy()

logistic_regression = LogisticRegression(random_state=0).fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [153]:
data_pcl_dev["length_text"] = data_pcl_dev["text"].apply(lambda x: len(x))
data_pcl_dev["word_embedding"] = data_pcl_dev["text"].apply(lambda x: word_emedding_average(x))
# apply lambda function to create new columns
data_pcl_dev[[i for i in range(300)]] = data_pcl_dev['word_embedding'].apply(lambda x: pd.Series(x))
# drop the original list column
data_pcl_dev.drop(['word_embedding','par_id','art_id','text','label'], axis=1, inplace=True)
data_pcl_dev["country_code"] = pd.Categorical(data_pcl_dev["country_code"], categories=data_pcl_dev["country_code"].unique()).codes
data_pcl_dev["keyword"] = pd.Categorical(data_pcl_dev["keyword"], categories=data_pcl_dev["keyword"].unique()).codes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_pcl_dev["length_text"] = data_pcl_dev["text"].apply(lambda x: len(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_pcl_dev["word_embedding"] = data_pcl_dev["text"].apply(lambda x: word_emedding_average(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_pcl_dev[[i for i in range(300

In [154]:
from sklearn.metrics import f1_score

y_pred = logistic_regression.predict(X_train)
print(f"F1-score on train:{f1_score(Y_train, y_pred, average='macro')}")

Y_test = data_pcl_dev["binary_label"].astype("float").to_numpy()
X_test = data_pcl_dev.drop(['binary_label'], axis=1, inplace=False).astype("float").to_numpy()

y_pred = logistic_regression.predict(X_test)
print(f"F1-score on dev:{f1_score(Y_test, y_pred, average='macro')}")

F1-score on train:0.5271660410477846
F1-score on dev:0.5338360560358542


In [147]:
import pandas as pd
import numpy as np
import os
from nltk.stem import PorterStemmer
import re 

from config import DATA_FOLDER, DATA_PCL_NAME, DATA_CATEGORIES_NAME
from utils import Utils

df = pd.read_csv(os.path.join(
            os.path.dirname(os.getcwd()),
            DATA_FOLDER,
            DATA_PCL_NAME
        ))

df = df.dropna()

train_id = pd.read_csv(os.path.join(
            os.path.dirname(os.getcwd()),
            DATA_FOLDER,
            "train_semeval_parids-labels.csv"
        ))
dev_id = pd.read_csv(os.path.join(
            os.path.dirname(os.getcwd()),
            DATA_FOLDER,
            "dev_semeval_parids-labels.csv"
        ))

data_pcl_train = df[df["par_id"].isin(train_id["par_id"].tolist())]
data_pcl_dev = df[df["par_id"].isin(dev_id["par_id"].tolist())]

classifier_bow = BagOfWordsClassifier()
classifier_bow.train(data_pcl_train)
y_pred = classifier_bow.predict(data_pcl_train['text'])
print(f"F1-score on train:{f1_score(Y_train, y_pred, average='macro')}")

y_pred = classifier_bow.predict(data_pcl_dev['text'])
print(f"F1-score on dev:{f1_score(Y_test, y_pred, average='macro')}")

F1-score on train:0.4864682403042344
F1-score on dev:0.4710545826540191
