In [2]:
"""Build Baseline Location Classifier
Approaches:
- instance representation in form of 'bag' of words 
    - features: word frequencies, metadata 
    - exclude rare words in data set (used by fewer than 3 users)
- grammatical structure with NLP
- model instances in terms of authors instead of documents 


- Baseline Classifier: Naive Bayes Model ()

"""


"Build Baseline Location Classifier\nApproaches:\n- instance representation in form of 'bag' of words \n    - features: word frequencies, metadata \n    - exclude rare words in data set (used by fewer than 3 users)\n- grammatical structure with NLP\n- model instances in terms of authors instead of documents \n\n\n- Baseline Classifier: Naive Bayes Model ()\n\n"

In [4]:
import pandas as pd
import numpy as np
import re, string
from os import path, getcwd
from collections import defaultdict
from datetime import datetime

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import chi2
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import chi2

from sklearn import model_selection, naive_bayes, svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.semi_supervised import LabelSpreading
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

import validators
import nltk
from info_gain.info_gain import info_gain_ratio
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords

# nltk.download('english')
# nltk.download('punkt')

In [5]:
# Directory holding the raw tab-separated project data, relative to the
# current working directory.
fdir = path.join(getcwd(), "2019S1-proj2-datah")

# File names of the three data splits.
train_data, test_data, dev_data = "train-raw.tsv", "test-raw.tsv", "dev-raw.tsv"

# Full paths of the three data splits.
train_fpath, test_fpath, dev_fpath = [
    path.join(fdir, fname) for fname in (train_data, test_data, dev_data)
]

# Every split is read the same way: UTF-8, tab-delimited, indexed by Instance_ID.
_read_opts = dict(encoding="utf_8", delimiter="\t", index_col="Instance_ID")
train, test, dev = [
    pd.read_csv(fpath, **_read_opts)
    for fpath in (train_fpath, test_fpath, dev_fpath)
]

In [6]:
# Column holding the raw text (model input) and column holding the label.
inputs, output = 'Text', 'Location'

# Split each data frame into its text series and its label series.
x_train, y_train = train[inputs], train[output]
x_test, y_test = test[inputs], test[output]
x_dev, y_dev = dev[inputs], dev[output]

In [9]:
class GeoTagger:
    """Location classifier over short texts using binary word-presence features.

    Workflow: ``train`` cleans the texts, selects a vocabulary with the
    configured feature-selection method, turns each text into a 0/1 feature
    row and fits every configured classifier.  ``predict`` applies the same
    cleaning and vocabulary and returns one prediction column per classifier.
    ``evaluate`` scores such a prediction frame against the true labels.

    Attributes set by ``train``:
        classes:  unique labels found in the training target.
        features: set of vocabulary words chosen by feature selection.
    """

    _FEATURE_SELECTION = ["baseline_10", "baseline_50", "baseline_100", "info_gain_ratio", "word_locality_heuristic", "tf_idf"]
    _VOTING_STRATEGY = ["simple_voting", "bagging", "stacking", "random_forest", "boosting"]
    _CLASSIFIERS = ["Zero-R", "One-R", "Decision-Tree", "MultinomialNB", "LinearSVM", "SemiSupervised"]
    _EVALUATION_METRIC = ["accuracy", "precision_recall_f-score_with_macro", "precision_recall_f-score_with_micro"]

    def __init__(self, inputs, target, classifier_set=("MultinomialNB",), voting_strategy="simple_voting",
                 feature_selection_method="baseline_100", seed=500, combine_classifiers=False, n_features=400,
                 data_dir=None):
        """
        Args:
            inputs: name of the input (text) column; stored only.
            target: name of the label column; stored only.
            classifier_set: iterable of classifier names from ``_CLASSIFIERS``.
                Unknown names are reported and skipped.
            voting_strategy: stored only; not used by any method of this class.
            feature_selection_method: one of ``_FEATURE_SELECTION``.
            seed: seeds numpy's global RNG and the seedable classifiers.
            combine_classifiers: stored only; not used by any method of this class.
            n_features: vocabulary size budget for the ``tf_idf`` and
                ``word_locality_heuristic`` selection methods.
            data_dir: directory containing ``train-top<N>.csv`` for the
                baseline methods.  When ``None`` the module-level ``fdir``
                is used, as before.
        """
        self.inputs = inputs
        self.target = target
        self.voting_strategy = voting_strategy
        self.feature_selection_method = feature_selection_method
        self.seed = seed
        np.random.seed(seed)
        self.stemmer = SnowballStemmer('english')
        self.stop_words = set(stopwords.words('english'))
        self.combine_classifiers = combine_classifiers
        # Maps classifier name -> unfitted estimator instance.
        self.classifier_set = self._combine_classifier_set(classifier_set)
        self.n_features = n_features
        self.data_dir = data_dir

    def train(self, X, y):
        """
        trains every configured classifier given the training texts and
        their corresponding class labels
        """
        self.classes = y.unique()
        X = self.preprocess(X, y, train=True)

        for classifier in self.classifier_set.values():
            print(type(classifier))
            classifier.fit(X, y)

    def predict(self, X):
        """
        predicts labels for the given texts with every configured classifier

        Returns a DataFrame with one column per classifier name.
        """
        X = self.preprocess(X)
        predictions = pd.DataFrame()

        for name, classifier in self.classifier_set.items():
            predictions[name] = classifier.predict(X)

        return predictions

    def evaluate(self, ybar, y, metric):
        """
        evaluates each classifier's predictions given the correct class
        labels and an evaluation metric

        Args:
            ybar: DataFrame of predictions, one column per classifier.
            y: true labels, in the same row order as ``ybar``.
            metric: one of ``_EVALUATION_METRIC``.

        Returns:
            dict mapping classifier name -> score, or ``None`` (after
            printing a message) when ``metric`` is not recognised.
        """
        if metric not in GeoTagger._EVALUATION_METRIC:
            print("Invalid Evaluation Metric: {}. Choose one of ({})".format(
                metric, ", ".join(GeoTagger._EVALUATION_METRIC)))
            return None

        score_set = {}
        for classifier in list(ybar):
            # sklearn metrics take (y_true, y_pred) in that order.
            if metric == "accuracy":
                score_set[classifier] = accuracy_score(y, ybar[classifier])
            elif metric == "precision_recall_f-score_with_macro":
                score_set[classifier] = precision_recall_fscore_support(y, ybar[classifier], average='macro')
            elif metric == "precision_recall_f-score_with_micro":
                score_set[classifier] = precision_recall_fscore_support(y, ybar[classifier], average='micro')

        return score_set

    def preprocess(self, X, y=None, train=False):
        """
        Cleans the texts and converts them to word-presence features.

        When ``train`` is true the vocabulary (``self.features``) is
        (re)selected from the cleaned texts and ``y`` first.
        """
        X = self.filter(X)

        if train:
            self.feature_selection(X, y)

        X = self.bag_of_words(X)

        return X

    def bag_of_words(self, X):
        """
        Builds a uint8 DataFrame with one row per text and one column per
        selected feature, 1 where the feature string occurs in the text.

        Columns are in sorted feature order so that the training and
        prediction matrices always line up.
        """
        columns = sorted(self.features)
        # NOTE(review): `word in text` is a substring test on the cleaned
        # text, so a feature also matches inside longer words. Kept as in
        # the original; confirm whether whole-token matching is intended.
        rows = [[(word in text) for word in columns] for text in X.values]
        # reshape keeps the matrix 2-D even with zero rows or zero features.
        matrix = np.array(rows, dtype=np.uint8).reshape(len(rows), len(columns))
        return pd.DataFrame(matrix, index=X.index, columns=columns)

    def _combine_classifier_set(self, classifiers):
        """
        Instantiates the estimator for each requested classifier name.

        Unknown names are reported with ``print`` and skipped.  Returns a
        dict mapping name -> estimator, in the order requested.
        """
        classifier_set = {}

        for classifier in classifiers:
            if classifier not in GeoTagger._CLASSIFIERS:
                print("Invalid Classifier: {}. Choose one of ({})".format(
                    classifier, ", ".join(GeoTagger._CLASSIFIERS)))
                continue

            if classifier == "Zero-R":
                classifier_set[classifier] = DummyClassifier(strategy='most_frequent', random_state=self.seed)
            elif classifier == "One-R":
                # A depth-1 tree stands in for One-R.
                classifier_set[classifier] = DecisionTreeClassifier(max_depth=1, criterion="entropy", random_state=self.seed)
            elif classifier == "Decision-Tree":
                classifier_set[classifier] = DecisionTreeClassifier(max_depth=None, criterion="entropy", random_state=self.seed)
            elif classifier == "MultinomialNB":
                classifier_set[classifier] = MultinomialNB()
            elif classifier == "LinearSVM":
                classifier_set[classifier] = svm.LinearSVC(random_state=self.seed)
            elif classifier == "SemiSupervised":
                classifier_set[classifier] = LabelSpreading(kernel="knn", n_neighbors=7, alpha=0.2)
        return classifier_set

    def feature_selection(self, X, y):
        """
        Selects the vocabulary (``self.features``) with the configured method.

        (1) Information Gain Ratio (IGR) - across all states S, is
            defined as the ratio between its information gain value IG,
            which measures the decrease in class entropy H that w brings,
            and its intrinsic entropy IV, which measures the entropy of
            the presence versus the absence of that word
            (not implemented yet)

        (2) Word Locality Heuristic (WLH) - promotes words primarily
            associated with one location. measure the probability of
            a word occurring in a state, divided by its probability to
            appear in any state. Then, for a given word w, we define the
            WLH as the maximum such probability across all the states S

        Raises:
            ValueError: the configured method is not in ``_FEATURE_SELECTION``.
            NotImplementedError: the configured method is ``info_gain_ratio``.
        """
        method = self.feature_selection_method

        # Fail here rather than later with an AttributeError on the
        # never-assigned `self.features`.
        if method not in GeoTagger._FEATURE_SELECTION:
            raise ValueError("Invalid Feature Selection method: {}. Choose one of ({})".format(
                method, ", ".join(GeoTagger._FEATURE_SELECTION)))

        if method.startswith("baseline_"):
            # "baseline_50" -> reads train-top50.csv
            self.baseline_heuristic(X, y, method.split("_", 1)[1])
        elif method == "info_gain_ratio":
            raise NotImplementedError("Feature selection method 'info_gain_ratio' is not implemented")
        elif method == "word_locality_heuristic":
            self.word_locality_weight(X, y)
        elif method == "tf_idf":
            self.tf_idf(X, y)

    def baseline_heuristic(self, X, y, top_n):
        """
        Uses the column names of ``train-top<top_n>.csv`` as the vocabulary.

        Only the header line of the file is read; the ``Instance_ID`` and
        ``Location`` columns are dropped.  ``X`` and ``y`` are unused.

        Raises:
            FileNotFoundError: the feature file does not exist.
        """
        # Fall back to the notebook-level data directory when none was given.
        base_dir = self.data_dir if self.data_dir is not None else fdir
        feature_fpath = path.join(base_dir, "train-top" + str(top_n) + ".csv")

        if not path.exists(feature_fpath):
            raise FileNotFoundError("Baseline Heuristic path {} does not exist".format(feature_fpath))

        with open(feature_fpath, encoding="utf_8") as handle:
            header = handle.readline()

        # strip() removes the trailing newline (and any '\r') from the last column.
        columns = [name.strip() for name in header.split(",")]
        self.features = {name for name in columns if name and name not in ("Instance_ID", "Location")}

    def word_locality_weight(self, X, y):
        """
        Selects, for every class, the words most specific to that class.

        For each (class, word) pair the weight is
        P(word | class) / P(word), both estimated from token counts of the
        whitespace-split texts.  The ``int(n_features / n_classes)``
        highest-weighted words of every class form the vocabulary.
        """
        n_classes = len(self.classes)
        if n_classes == 0:
            self.features = set()
            return

        # Token counts per class, per class total, per word overall, and overall.
        word_counts = {label: defaultdict(int) for label in self.classes}
        label_totals = defaultdict(int)
        overall_counts = defaultdict(int)
        overall_total = 0

        for text, label in zip(X.values, y.values):
            for word in text.split():
                word_counts[label][word] += 1
                label_totals[label] += 1
                overall_counts[word] += 1
                overall_total += 1

        n_location_features = int(self.n_features / n_classes)
        selected = set()
        for label in self.classes:
            weights = {}
            for word, count in word_counts[label].items():
                cond_word_prob = count / label_totals[label]
                word_prob = overall_counts[word] / overall_total
                weights[word] = cond_word_prob / word_prob
            # Highest weight first; ties broken alphabetically for determinism.
            ranked = sorted(weights.items(), key=lambda kv: (-kv[1], kv[0]))
            selected.update(word for word, _ in ranked[:n_location_features])

        self.features = selected

    def information_gain_ratio(self, x):
        """Placeholder for information-gain-ratio selection; does nothing."""
        pass

    def tf_idf(self, X, y):
        """
        Fits a TfidfVectorizer on one concatenated document per class and
        uses its vocabulary (at most ``n_features`` terms, English stop
        words removed) as the feature set.
        """
        # NOTE(review): per the scikit-learn docs, `max_features` keeps the
        # terms with the highest corpus term frequency, not the highest
        # tf-idf weight - confirm this is the intended ranking.
        # stop_words is passed as a list: newer scikit-learn rejects a set.
        vectorizer = TfidfVectorizer(stop_words=sorted(self.stop_words), max_features=self.n_features)

        texts_by_label = {label: [] for label in self.classes}
        for text, label in zip(X.values, y.values):
            texts_by_label[label].append(text)

        corpus = [" ".join(texts) for texts in texts_by_label.values()]
        vectorizer.fit(corpus)

        # get_feature_names() was removed in newer scikit-learn releases.
        if hasattr(vectorizer, "get_feature_names_out"):
            names = vectorizer.get_feature_names_out()
        else:
            names = vectorizer.get_feature_names()
        self.features = set(str(name) for name in names)

    def filter(self, x):
        """Applies ``filter_text`` to every element of the Series ``x``."""
        return x.apply(self.filter_text)

    def filter_text(self, text):
        """
        Cleans one text: every whitespace-separated token is passed through
        ``filter_word`` and the results are re-joined with single spaces.
        Non-string input (e.g. a missing value) yields an empty string.
        """
        if not isinstance(text, str):
            return ''
        return ' '.join(self.filter_word(w) for w in text.split())

    def filter_word(self, word):
        """
        Normalises a single token.

        - hyperlinks are dropped (empty string)
        - hashtags and mentions lose their leading '#' / '@'
        - any other token has literal \\uXXXX escapes removed, is stemmed,
          and then has each non-word character replaced by a space
        """
        word = word.lower()
        if self._is_hyperlink(word):
            return ''
        elif self._is_hashtag(word):
            # keep the hashtag's keyword
            word = self._process_hashtag(word)
        elif self._is_mention(word):
            # potentially cross-reference individuals mentioned? or discard
            word = self._process_mention(word)
        else:
            word = self._ascii_to_unicode(word)
            # NOTE(review): stemming runs before punctuation is removed, so
            # a token such as "running!" reaches the stemmer with the "!"
            # still attached. Order kept as in the original.
            word = self._word_stem(word)
            word = re.sub(r'[^\w\s]', ' ', word)
        return word

    def _is_hashtag(self, word):
        """True when the token starts with '#'."""
        return word.startswith("#")

    def _is_mention(self, word):
        """True when the token starts with '@'."""
        return word.startswith("@")

    def _is_hyperlink(self, word):
        """True when ``validators.url`` accepts the token as a URL."""
        # validators.url returns a falsy failure object for non-URLs.
        return bool(validators.url(word))

    def _process_hashtag(self, word):
        """Strips the leading '#'."""
        return word[1:]

    def _process_mention(self, word):
        """Strips the leading '@'."""
        return word[1:]

    def _ascii_to_unicode(self, word):
        """
        Removes literal ``\\uXXXX`` escape sequences (backslash, 'u', four
        lowercase hex digits) from the token; despite the name, nothing is
        decoded.
        """
        return re.sub(r'\\u[0-9a-f]{4}', '', word)

    def _word_stem(self, word):
        """Stems the token with the instance's Snowball stemmer."""
        return self.stemmer.stem(word)

In [None]:
# Train one GeoTagger per feature-selection method, timing each training run.
# All four taggers share the same classifier set, voting strategy and seed.
classifier_set = ["SemiSupervised"]
voting_strategy = "simple_voting"

_trained_taggers = []
for _method in ("tf_idf", "baseline_10", "baseline_50", "baseline_100"):
    _tagger = GeoTagger(
        inputs=inputs,
        target=output,
        classifier_set=classifier_set,
        voting_strategy=voting_strategy,
        feature_selection_method=_method,
        seed=500,
        combine_classifiers=False,
    )

    start = datetime.now()
    _tagger.train(x_train, y_train)
    end = datetime.now()
    print("Time taken: {}".format(end - start))

    _trained_taggers.append(_tagger)

# gt: tf_idf, gt2: baseline_10, gt3: baseline_50, gt4: baseline_100
gt, gt2, gt3, gt4 = _trained_taggers

In [103]:
# Predict class labels for the development set with each trained tagger.
ybars, ybars2, ybars3, ybars4 = [
    tagger.predict(x_dev) for tagger in (gt, gt2, gt3, gt4)
]

KeyboardInterrupt: 

In [None]:
# Evaluate classifier performance on the development set: accuracy and
# micro-averaged precision/recall/F-score for each trained tagger.
accScores, otherScores = (
    gt.evaluate(ybars, y_dev, "accuracy"),
    gt.evaluate(ybars, y_dev, "precision_recall_f-score_with_micro"),
)
accScores2, otherScores2 = (
    gt2.evaluate(ybars2, y_dev, "accuracy"),
    gt2.evaluate(ybars2, y_dev, "precision_recall_f-score_with_micro"),
)
accScores3, otherScores3 = (
    gt3.evaluate(ybars3, y_dev, "accuracy"),
    gt3.evaluate(ybars3, y_dev, "precision_recall_f-score_with_micro"),
)
accScores4, otherScores4 = (
    gt4.evaluate(ybars4, y_dev, "accuracy"),
    gt4.evaluate(ybars4, y_dev, "precision_recall_f-score_with_micro"),
)

In [None]:
# Display the accuracy scores of gt3 (the "baseline_50" tagger) on the dev set.
accScores3

In [None]:
# Display the micro-averaged precision/recall/F-score of gt (the "tf_idf" tagger) on the dev set.
otherScores

### Some Resources
- https://medium.com/@bedigunjit/simple-guide-to-text-classification-nlp-using-svm-and-naive-bayes-with-python-421db3a72d34

In [20]:
# Scratch cell: prints the length of a four-element list literal.
print(len([1,2,3,4]))

4
