In [41]:
import pandas as pd
import numpy as np
import re, string
from os import path, getcwd
from collections import defaultdict
from datetime import datetime

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import chi2
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import chi2

from sklearn import model_selection, naive_bayes, svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier, GradientBoostingClassifier, BaggingClassifier, RandomForestClassifier

from sklearn.semi_supervised import LabelSpreading
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

import validators
import nltk
from info_gain.info_gain import info_gain_ratio
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords

In [42]:
fdir = path.join(getcwd(), "2019S1-proj2-datah")

train_data = "train-raw.tsv"
test_data = "test-raw.tsv"
dev_data = "dev-raw.tsv"

train_fpath = path.join(fdir, train_data)
test_fpath = path.join(fdir, test_data)
dev_fpath = path.join(fdir, dev_data)

train = pd.read_csv(train_fpath, encoding="utf_8", delimiter="\t", index_col="Instance_ID")
test = pd.read_csv(test_fpath, encoding="utf_8", delimiter="\t" , index_col="Instance_ID")
dev = pd.read_csv(dev_fpath, encoding="utf_8", delimiter="\t" , index_col="Instance_ID")

In [43]:
inputs = 'Text'
output = 'Location'
x_train = train[inputs]
y_train = train[output]
x_test = test[inputs]
y_test = test[output]
x_dev = dev[inputs]
y_dev = dev[output]

In [53]:
class GeoTagger:
    _FEATURE_SELECTION = ["baseline_10", "baseline_50", "baseline_100", "info_gain_ratio", "word_locality_heuristic", "tf_idf"]
    _ENSEMBLE_STRATEGY = ["simple_voting", "meta_classification", "bagging", "random_forest", "boosting"]
    _CLASSIFIERS = ["Zero-R", "One-R", "Decision-Tree", "MultinomialNB", "LinearSVM", "SemiSupervised"]
    _EVALUATION_METRIC = ["accuracy", "precision_recall_f-score_with_macro", "precision_recall_f-score_with_micro"]
    
    def __init__(self, inputs, target, classifier_set=["MultinomialNB"], ensemble_strategy="simple_voting", feature_selection_method="baseline_100", seed=500, combine_classifiers=False, n_features=400):
        self.inputs = inputs
        self.target = target
        self.classifier_set = classifier_set
        self.ensemble_strategy = ensemble_strategy
        self.feature_selection_method = feature_selection_method
        self.seed = seed
        np.random.seed(seed)
        self.stemmer = SnowballStemmer('english')
        self.stop_words = set(stopwords.words('english'))
        self.combine_classifiers = combine_classifiers
        self.classifier_set = self._generate_classifier_set(classifier_set)
        self.combined_classifier = None if not self.combine_classifiers else self.generate_ensemble_classifier()
        self.n_features = n_features

    def train(self, X, y):
        """
        trains a classifier given the training data and their corresponding class labels
        """
        self.classes = y.unique()
        X = self.preprocess(X, y, train=True)
        
        if self.combine_classifiers:
            self.combined_classifier.fit(X, y)
        else:
            for classifier in self.classifier_set.values():
                classifier.fit(X, y)

    def predict(self, X):
        """
        predicts a set of classifiers given some development data
        """
        X = self.preprocess(X)
        
        if self.combine_classifiers:
            return self.combined_classifier.predict(X)
        else:
            y_set = pd.DataFrame()

            for name, classifier in self.classifier_set.items():
                classifier_prediction = classifier.predict(X)
                y_set[name] = classifier_prediction
                
        return y_set
        
    
    def evaluate(self, ybar, y):
        """
        evaluates a class' predictions given the correct class labels and an evaluation metric
        """
#         if not metric in GeoTagger._EVALUATION_METRIC:
#             print("Invalid Evaluation Metric: {}. Choose one of \
#             ({})".format(metric, ", ".join(GeoTagger._EVALUATION_METRIC)))
#             return
            
        score_set = defaultdict()
        
        if self.combine_classifiers:
            classifiers = [self.ensemble_strategy, ]
            ybar = pd.DataFrame(ybar, columns=classifiers, index = y.index)
        else:
            classifiers = self.classifier_set

        for name, y_pred in ybar.items():
            accuracy = accuracy_score(y, y_pred)
            score_set[name] = accuracy 
#             report = classification_report(ybar, y, self.target)
#             confusion = metrics.confusion_matrix(y_test, y_pred_class)

        return score_set
    
    def cross_validation(self, X, y, metric):
        score_set = defaultdict()
        
        X = self.preprocess(X)

        if self.combine_classifiers:
            classifiers = [self.ensemble_strategy, ]
            ybar = pd.DataFrame(ybar, columns=classifiers, index = y.index)
        else:
            classifiers = self.classifier_set

        for classifier in classifiers:
            score_set[metric] = cross_validate(classifier, X, y, cv=10)
        return score_set
    
    
    def preprocess(self, X, y=None, train=False):
        """
         - Filter rare words (urls, typos rare names, punctuation symbols)
         - calculate word frequencies 
         - metadata
        """
        X = self.filter(X)

        if train:
            self.feature_selection(X, y)
        
        X = self.bag_of_words(X)
            
        return X 
    
    def bag_of_words(self, X):
        _x = pd.DataFrame(
            [[(word in text) for word in sorted(list(self.features))] for text in X.values],
            index=X.index, 
            columns=self.features,
            dtype=np.uint8
        )        
        return _x
    
    def generate_ensemble_classifier(self):
        if not self.combine_classifiers:
            return None
        
        if not self.ensemble_strategy in GeoTagger._ENSEMBLE_STRATEGY:
            print("Invalid Ensemble Strategy Metric: {}. Choose one of \
            ({})".format(metric, ", ".join(GeoTagger._ENSEMBLE_STRATEGY)))
            return None
        
        if self.ensemble_strategy == "simple_voting":
            combined_classifier = VotingClassifier(self.classifier_set.items(), 'hard')
        elif self.ensemble_strategy == "meta_classification":
            combined_classifier = MetaClassifier(self.classifier_set.items(), self.seed)
        elif self.ensemble_strategy == "bagging":
            base_classifier = DecisionTreeClassifier(max_features=None, max_leaf_nodes=999)
            combined_classifier = BaggingClassifier(base_estimator=base_classifier, max_features=self.n_features, random_state=self.seed)
        elif self.ensemble_strategy == "random_forest":
            combined_classifier = RandomForestClassifier()
        elif self.ensemble_strategy == "boosting":
            combined_classifier = GradientBoostingClassifier()
        
        return combined_classifier
        
    
    def _generate_classifier_set(self, classifiers):
        classifier_set = defaultdict()
        
        for classifier in classifiers:
            if not classifier in GeoTagger._CLASSIFIERS:
                print("Invalid Classifier: {}. Choose one of \
                ({})".format(classifier, ", ".join(GeoTagger._CLASSIFIERS)))
                continue
                
            if classifier == "Zero-R":
                classifier_set[classifier] = DummyClassifier(strategy='most_frequent', random_state=self.seed)
            elif classifier == "One-R":
                classifier_set[classifier] = DecisionTreeClassifier(max_depth=1, criterion="entropy", random_state=self.seed)
            elif classifier == "Decision-Tree":
                classifier_set[classifier] = DecisionTreeClassifier(max_depth=None, criterion="entropy", random_state=self.seed)
            elif classifier == "MultinomialNB":
                classifier_set[classifier] = MultinomialNB()
            elif classifier == "LinearSVM":
                classifier_set[classifier] = svm.LinearSVC(random_state=self.seed)
            elif classifier == "SemiSupervised":
                classifier_set[classifier] = LabelSpreading(kernel="knn", n_neighbors=7, alpha=0.2)
        return classifier_set
                
    def feature_selection(self, X, y):
        """
        (1) Information Gain Ratio (IGR) - across all states S, is 
            defined as the ratio between its information gain value IG, 
            which measures the decrease in class entropy H that w brings,
            and its intrinsic entropy IV, which measures the entropy of 
            the presence versus the absence of that word
            
        (2) Word Locality Heuristic (WLH) - promotes words primarily 
            associated with one location. measure the probability of 
            a word occurring in a state, divided by its probability to 
            appear in any state. Then, for a given word w, we define the 
            WLH as the maximum such probability across all the states S
        """
        if self.feature_selection_method not in GeoTagger._FEATURE_SELECTION:
            print("Invalid Feature Selection method: {}. Choose one of \
            ({})".format(self.feature_selection_method, ", ".join(GeoTagger._FEATURE_SELECTION)))
            return 
        
        if self.feature_selection_method == "baseline_10":
            self.baseline_heuristic(X, y, "10")
        elif self.feature_selection_method == "baseline_50":
            self.baseline_heuristic(X, y, "50")
        elif self.feature_selection_method == "baseline_100":
            self.baseline_heuristic(X, y, "100")
        elif self.feature_selection_method == "info_gain_ratio":
#             self.information_gain_ratio(x)
            return
        elif self.feature_selection_method == "word_locality_heuristic":
            self.word_locality_weight(X, y)
        elif self.feature_selection_method == "tf_idf":
            self.term_frequency_inverse_city_frequency(X, y)

    def baseline_heuristic(self, X, y, top_n):
        feature_fpath = path.join(fdir, "train-top" + top_n + ".csv")
        
        if not path.exists(feature_fpath):
            print("Baseline Heuristic path {} does not exist".format(feature_fpath))
            return
        
        features = open(feature_fpath).readline()
        features = features.split(",")
        features.remove("Instance_ID")
        features.remove("Location\n")
        self.features = set(features)

    def word_locality_weight(self, X, y):
        """
        calculate frequencies of data 
        Measure frequency and divide by sum of freqencies of all words
        """
        locations = self.classes + ['Total',]
        word_locality_features = {label: defaultdict() for label in locations}
        word_locality_weight = {label: defaultdict() for label in self.classes}
        
        for x_i, y_i in zip(X.index, y.index):
            text = X.loc[x_i].split()
            for word in text:
                word_locality_features[y.loc[y_i]][word] += 1
                word_locality_features[y.loc[y_i]]['Total'] += 1
                word_locality_features['Total'][word] += 1
                word_locality_features['Total']['Total'] += 1
        
        for label in self.classes:
            for word in word_locality_features[label].keys():
                cond_word_prob = word_locality_features[label][word] / word_locality_features[label]['Total']
                word_prob = word_locality_features['Total'][word] / word_locality_features['Total']['Total']
                word_locality_weight[label][word] = cond_word_prob / word_prob
        
        features = []
        for location in self.classes:
            n_location_features = int(self.n_features / len(self.classes))
            features.append(sorted(word_locality_weight[location].items(), key=lambda kv: kv[1], reverse=True)[:n_location_features]) 
        
        self.features = set([feature[0] for feature in features])
                
    
    def information_gain_ratio(self, x):
#         return info_gain_ratio
        pass
    
    def term_frequency_inverse_city_frequency(self, X, y):
        vectorizer = TfidfVectorizer(stop_words=self.stop_words, max_features=self.n_features)

        location_word_list = {label: '' for label in self.classes}

        for x_i, y_i in zip(X.index, y.index):
            location_word_list[y.loc[y_i]] += X.loc[x_i] + " "
        
        labels = location_word_list.keys()
        corpus = location_word_list.values()
        vectorizer.fit(corpus, labels)
        self.features = set(vectorizer.get_feature_names())
        
    def filter(self, x):
        return x.apply(self.filter_text)
    
    def filter_text(self, text):
        return ' '.join(self.filter_word(w) for w in text.split())
        
    def filter_word(self, word):
        word = word.lower()
        # extract keywords from hashtag 

        if self._is_hyperlink(word):
            return ''
        elif self._is_hashtag(word):
            word = self._process_hashtag(word)
        # potentially cross-reference individuals mentioned? or discard
        elif self._is_mention(word):
            word = self._process_mention(word)
        # remove ascii characters 
        else:
            word = self._ascii_to_unicode(word)
            
            word = re.sub(r'[^\w\s]',' ', word)
#             word = self._word_stem(word)
            if word in self.stop_words:
                return ''
        return word
                
    def _is_hashtag(self, word):
        if len(word) == 0:
            return False
        return word[0] == "#"
    
    def _is_mention(self, word):
        if len(word) == 0:
            return False
        return word[0] == "@"
    
    def _is_hyperlink(self, word):
        return validators.url(word)
    
    def _process_hashtag(self, word):
        return word[1:]
    
    def _process_mention(self, word):
        return word[1:]
    
    def _ascii_to_unicode(self, word):
        for uescape in re.findall(r'(\\u[0-9a-f]{4})', word):
            try:
                word = word.replace(uescape, '')  
            except (UnicodeDecodeError, Exception):
                print("Failed to decode: {}".format(uescape))
        return word
    
    def _word_stem(self, word):
        return self.stemmer.stem(word)

class MetaClassifier:
    def __init__(self, estimators, random_state):
        self.estimators = estimators
        self.encoder = OneHotEncoder()
        self.base_classifier = svm.LinearSVC(random_state=random_state)
    
    def fit(self, X, y):
        y_set = pd.DataFrame(index=X.index, columns=[item[0] for item in self.estimators])
        
        for name, classifier in self.estimators:
            classifier.fit(X, y)
            y_bar = classifier.predict(X)
            y_set[name] = self.encoder.fit_transform(y_bar.reshape(-1, 1)).toarray()
            
        self.base_classifier.fit(y_set, y)
    
    def predict(self, X):
        y_set = pd.DataFrame(index=X.index, columns=[item[0] for item in self.estimators])

        for name, classifier in self.estimators:
            y_bar = classifier.predict(X)
            y_set[name] = self.encoder.transform(y_bar.reshape(-1, 1)).toarray()
                
        return  self.base_classifier.predict(y_set)

In [54]:
# classifier_set = ["One-R"]
classifier_set = ["LinearSVM", "MultinomialNB"]
print(classifier_set)
voting_strategy = "simple_voting"
combine_classifiers = False

gt = GeoTagger(
    inputs = inputs,
    target = output,
    classifier_set = classifier_set,
    ensemble_strategy = voting_strategy,
    feature_selection_method = "tf_idf",
    seed = 500,
    combine_classifiers = combine_classifiers
)

start = datetime.now()
gt.train(x_train, y_train)
end = datetime.now()
print("Time taken: {}".format(end - start))

gt2 = GeoTagger(
    inputs = inputs,
    target = output,
    classifier_set = classifier_set,
    ensemble_strategy = voting_strategy,
    feature_selection_method = "baseline_10",
    seed = 500,
    combine_classifiers = combine_classifiers
)

start = datetime.now()
gt2.train(x_train, y_train)
end = datetime.now()
print("Time taken: {}".format(end - start))

gt3 = GeoTagger(
    inputs = inputs,
    target = output,
    classifier_set = classifier_set,
    ensemble_strategy = voting_strategy,
    feature_selection_method = "baseline_50",
    seed = 500,
    combine_classifiers = combine_classifiers
)

start = datetime.now()
gt3.train(x_train, y_train)
end = datetime.now()
print("Time taken: {}".format(end - start))

gt4 = GeoTagger(
    inputs = inputs,
    target = output,
    classifier_set = classifier_set,
    ensemble_strategy = voting_strategy,
    feature_selection_method = "baseline_100",
    seed = 500,
    combine_classifiers = combine_classifiers
)

start = datetime.now()
gt4.train(x_train, y_train)
end = datetime.now()
print("Time taken: {}".format(end - start))

['One-R', 'Decision-Tree', 'MultinomialNB', 'LinearSVM']
Time taken: 0:08:24.903216
Time taken: 0:02:37.557461
Time taken: 0:03:34.017542
Time taken: 0:04:44.567360


In [55]:
#predict the class labels of a set of test data
ybars = gt.predict(x_dev)
ybars2 = gt2.predict(x_dev)
ybars3 = gt3.predict(x_dev)
ybars4 = gt4.predict(x_dev)

In [56]:
# models = [gt, gt2, gt3, gt4]
# predictions = [ybars, ybars2, ybars3, ybars4]
# evals = ["accuracy", "precision_recall_f-score_with_micro"]

# for model, prediction in zip(models, predictions):
#     for method in evals:
#         report, confusion = model.evaluate(prediction, y_dev, method)
#         print("{}: {}".format(method, report, ))
accScores = gt.evaluate(ybars, y_dev)
# otherScores = gt.evaluate(ybars, y_dev, "precision_recall_f-score_with_micro")
accScores2 = gt2.evaluate(ybars2, y_dev)
# otherScores2 = gt2.evaluate(ybars2, y_dev, "precision_recall_f-score_with_micro")
accScores3 = gt3.evaluate(ybars3, y_dev)
# otherScores3 = gt3.evaluate(ybars3, y_dev, "precision_recall_f-score_with_micro")
accScores4 = gt4.evaluate(ybars4, y_dev)
# otherScores4 = gt4.evaluate(ybars4, y_dev, "precision_recall_f-score_with_micro")

In [58]:
print(accScores, accScores2, accScores3, accScores4)

defaultdict(None, {'One-R': 0.2693029490616622, 'Decision-Tree': 0.3017426273458445, 'MultinomialNB': 0.31847184986595173, 'LinearSVM': 0.3174530831099196}) defaultdict(None, {'One-R': 0.2693029490616622, 'Decision-Tree': 0.3152278820375335, 'MultinomialNB': 0.3106166219839142, 'LinearSVM': 0.31549597855227884}) defaultdict(None, {'One-R': 0.2693029490616622, 'Decision-Tree': 0.31056300268096515, 'MultinomialNB': 0.3155495978552279, 'LinearSVM': 0.3202412868632708}) defaultdict(None, {'One-R': 0.2693029490616622, 'Decision-Tree': 0.3124128686327078, 'MultinomialNB': 0.326970509383378, 'LinearSVM': 0.3225469168900804})


In [57]:
print(np.unique(ybars, return_counts=True))
print(np.unique(ybars2, return_counts=True))
print(np.unique(ybars3, return_counts=True))
print(np.unique(ybars4, return_counts=True))

(array(['Brisbane', 'Melbourne', 'Perth', 'Sydney'], dtype=object), array([62648, 27532, 26591, 32429], dtype=int64))
(array(['Brisbane', 'Melbourne', 'Perth', 'Sydney'], dtype=object), array([71147, 57048, 15850,  5155], dtype=int64))
(array(['Brisbane', 'Melbourne', 'Perth', 'Sydney'], dtype=object), array([64150, 22852, 11545, 50653], dtype=int64))
(array(['Brisbane', 'Melbourne', 'Perth', 'Sydney'], dtype=object), array([69120, 18512, 19236, 42332], dtype=int64))


Unnamed: 0,One-R,Decision-Tree,MultinomialNB,LinearSVM
0,Brisbane,Perth,Brisbane,Sydney
1,Brisbane,Sydney,Perth,Sydney
2,Brisbane,Brisbane,Brisbane,Brisbane
3,Brisbane,Perth,Sydney,Perth
4,Brisbane,Brisbane,Brisbane,Brisbane
5,Brisbane,Brisbane,Melbourne,Perth
6,Brisbane,Melbourne,Melbourne,Melbourne
7,Brisbane,Perth,Brisbane,Sydney
8,Brisbane,Perth,Brisbane,Brisbane
9,Brisbane,Brisbane,Sydney,Melbourne


### Some Resources
- https://medium.com/@bedigunjit/simple-guide-to-text-classification-nlp-using-svm-and-naive-bayes-with-python-421db3a72d34