In [3]:
"""Build Baseline Location Classifier
Approaches:
- instance representation in form of 'bag' of words 
    - features: word frequencies, metadata 
    - exclude rare words in data set (used by less than 3 users )
- gramatical structure with NLP
- model instances in terms of authors instead of documents 


- Baseline Classifier: Naive Bayes Model ()

"""


"Build Baseline Location Classifier\nApproaches:\n- instance representation in form of 'bag' of words \n    - features: word frequencies, metadata \n    - exclude rare words in data set (used by less than 3 users )\n- gramatical structure with NLP\n- model instances in terms of authors instead of documents \n\n\n- Baseline Classifier: Naive Bayes Model ()\n\n"

In [1]:
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import chi2
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.preprocessing import OneHotEncoder

from os import path, getcwd
from collections import defaultdict
import nltk
import re, string
import validators
from info_gain.info_gain import info_gain_ratio
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import chi2
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
# nltk.download('english')
# nltk.download('punkt')

In [2]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\farha\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [3]:
"""
Import data into pandas format from original csv/tsv file formats for 
the top 10/50/100 train/test/dev data 
"""

fdir = path.join(getcwd(), "2019S1-proj2-datah")

# train_data = "train-top10.csv"
# test_data = "test-top10.csv"
# dev_data = "dev-top10.csv"

train_data = "train-raw.tsv"
test_data = "test-raw.tsv"
dev_data = "dev-raw.tsv"

train_fpath = path.join(fdir, train_data)
test_fpath = path.join(fdir, test_data)
dev_fpath = path.join(fdir, dev_data)

train = pd.read_csv(train_fpath, sep="\t", index_col="Instance_ID", encoding="utf_8")

In [8]:
inputs = ['Text']
output = 'Location'
x_train = train[inputs]
y_train = train[output]

In [9]:
class GeoTagger:
    _FEATURE_SELECTION = ["info_gain_ratio", "word_locality_heuristic", 'tf_idf']
    _VOTING_STRATEGY = ["simple_voting", "bagging", "stacking", "random_forest", "boosting"]
    _CLASSIFIERS = ["Zero-R", "One-R", "Decision-Tree", "MultinomialNB", "SVM", "SemiSupervised", "KNN"]
    _EVALUATION_METRIC = ["accuracy", "precision_recall_f-score"]
#     _EVALUATION METHOD = ["Cross-Validation"]
                          
    def __init__(self, inputs, target, classifier_set=["MultinomialNB"], voting_strategy="simple_voting", feature_selection="tf_idf", seed=500):
        self.inputs = inputs
        self.target = target
        self.exclude = set()
        self.classifier_set = classifier_set
        self.voting_strategy = voting_strategy
        self.feature_selection = feature_selection
        self.classifier_set = self._combine_classifier_set(classifier_set)
        self.stemmer = SnowballStemmer('english')
        self.vectorizer = TfidfVectorizer(stop_words=set(stopwords.words('english')))
        np.random.seed(seed)
                
    def _combine_classifier_set(self, classifiers):
        classifier_set = defaultdict()
        
        for classifier in classifiers:
            if not classifier in GeoTagger._CLASSIFIERS:
                print("Invalid Classifier: {}. Choose one of \
                ({})".format(classifier, ", ".join(GeoTagger._CLASSIFIERS)))
                continue
                
            if GeoTagger._CLASSIFIERS.index(classifier) == 0:
                classifier_set[classifier] = DummyClassifier(strategy='most_frequent')
            elif GeoTagger._CLASSIFIERS.index(classifier) == 1:
                classifier_set[classifier] = DecisionTreeClassifier(max_depth=1, criterion="entropy")
            elif GeoTagger._CLASSIFIERS.index(classifier)==2:
                classifier_set[classifier] = DecisionTreeClassifier(max_depth=None, criterion="entropy")
            elif GeoTagger._CLASSIFIERS.index(classifier)==3:
                classifier_set[classifier] = MultinomialNB()
        return classifier_set
        

    def train(self, X, y):
        """
        trains a classifier given the training data and their corresponding class labels
        """
        self.classes = y.unique()
        x = self.preprocess(X, y, train=True)
        
        for classifier in self.classifier_set.values():
            classifier.fit(X, y)
          
    def predict(self, X):
        x = self.preprocess(X)
        predictions = pd.DataFrame()
                
        for name, classifier in self.classifier_set.items():
            classifier_prediction = classifier.predict(x, y)
            predictions[name] = classifier_predictions
            
        return predictions
    
    def evaluate(self, ybar, y, metric):
        #TODO: eval method
        if not metric in GeoTagger._EVALUATION_METRIC:
                print("Invalid Evaluation Metric: {}. Choose one of \
                ({})".format(metric, ", ".join(GeoTagger._EVALUATION_METRIC))) 
                return
            
        if metric == "accuracy":
            score = accuracy_score(ybar, y)
        if metric == "precision_recall_f-score_with_macro":
            score = accuracy_score(ybar, y)
        if metric == "recall":
            score = accuracy_score(ybar, y)            
        
        return score
    
    def preprocess(self, x, y=None, train=False):
        """
         - Filter rare words (urls, typos rare names, punctuation symbols)
         - calculate word frequencies 
         - metadata
        """
        if train:
            x = self.filter(x)
            x = self.feature_selection(x, y)
        else:
            features = self.vectoriser.transform(x[self.inputs])
        return x     
        
    def feature_selection(self, x):
        """
        (1) Information Gain Ratio (IGR) - across all states S, is 
            defined as the ratio between its information gain value IG, 
            which measures the decrease in class entropy H that w brings,
            and its intrinsic entropy IV, which measures the entropy of 
            the presence versus the absence of that word
            
        (2) Word Locality Heuristic (WLH) - promotes words primarily 
            associated with one location. measure the probability of 
            a word occurring in a state, divided by its probability to 
            appear in any state. Then, for a given word w, we define the 
            WLH as the maximum such probability across all the states S
        """
        if self.feature_selection == "IGR":
#             return information_gain_ratio(x)
             return
        elif self.feature_selection == "WLH":
#             return word_locality_weight(x)
            return
        else:
            print("Invalid Feature Selection method: {}. Choose one of \
            ({})".format(self.feature_selection, ", ".join(GeoTagger._FEATURE_SELECTION)))
        
    def word_locality_weight(self, x):
        """
        calculate frequencies of data 
        Measure frequency and divide by sum of freqencies of all words
        """
        pass
    
    def information_gain_ratio(self, x):
#         return info_gain_ratio
        pass
    
    def tf_idf(self, x, y):
        location_word_list = {label: '' for label in self.classes}

        print(x.index[-1], y.index[-1])
        for index in x.index:
            location_word_list[y.iloc[index]] += x.iloc[index] + " "
        print(location_word_list.values())
        features = self.vectorizer.fit_transform(location_word_list.values)
        return x
        
    def filter(self, text):
        return ' '.join(self.filter_word(w) for w in text.split())
        
    def filter_word(self, word):
        word = word.lower()
        # extract keywords from hashtag 
        if self._is_hyperlink(word):
            return ''
        elif self._is_hashtag(word):
            word = self._process_hashtag(word)
        # potentially cross-reference individuals mentioned? or discard
        elif self._is_mention(word):
            word = self._process_mention(word)
        # remove ascii characters 
        else:
            word = self._ascii_to_unicode(word)
            word = self._word_stem(word)
            # .decode("unicode_escape").encode('utf-8')
            word = re.sub(r'[^\w\s]',' ', word)
#         print(word)
        return word
                
    def _is_hashtag(self, word):
        if len(word) == 0:
            return False
        return word[0] == "#"
    
    def _is_mention(self, word):
        if len(word) == 0:
            return False
        return word[0] == "@"
    
    def _is_hyperlink(self, word):
        return validators.url(word)
    
    def _process_hashtag(self, word):
        return word[1:]
    
    def _process_mention(self, word):
        return word[1:]
    
    def _ascii_to_unicode(self, word):
        for uescape in re.findall(r'(\\u[0-9a-z]{4})', word):
            try:
#                 print(uescape.encode('utf-8').decode('unicode-escape'), type(uescape.encode('utf-8').decode('unicode-escape')))
#                 word = re.sub(uescape, uescape.encode('utf-8'), word)
#                 print(word)
#                 print(uescape, type(uescape))
                word = re.sub(uescape, '', word)  
            except (UnicodeDecodeError, Exception):
                print("Failed to decode: {}".format(uescape))
        return word
    
    def _word_stem(self, word):
        return self.stemmer.stem(word)
    

In [11]:
x_train

Unnamed: 0_level_0,Text
Instance_ID,Unnamed: 1_level_1
1,"\ud83c\udf17 @ Melbourne, Victoria, Australia ..."
2,"@theage Of course it costs more, minimum stand..."
3,Hope people make just as much noise as they di...
4,Pouring the perfect Prosecco \ud83e\udd42\ud83...
5,"$LNY losing traction at 0.014, see this retrac..."
6,\u0e44\u0e21\u0e48\u0e44\u0e2b\u0e27\u0e41\u0e...
7,@ashleighjayy_ Me @ this bitch https://t.co/8J...
8,@AnaOLFan I \u2764\ufe0f you - I could never b...
9,@ihatejoelkim Welcome to Australia! Hoping you...
10,Mea evolve conference @markbouris session. I L...


In [10]:
gt = GeoTagger(inputs[0], output)

gt = GeoTagger(
    inputs = inputs[0],
    target = output,
    classifier_set = ["MultinomialNB"], 
    voting_strategy = "simple_voting", 
    seed=500
)

x_features = gt.train(x_train, y_train)

AttributeError: 'DataFrame' object has no attribute 'split'

In [None]:
#predict the class labels of a set of test data
# ybar = self.predict(clf, x_dev)

In [None]:
# evaluate classifier performance
# score = self.evaluate(ybar, dev_y)

### Some Resources
- https://medium.com/@bedigunjit/simple-guide-to-text-classification-nlp-using-svm-and-naive-bayes-with-python-421db3a72d34

In [197]:
clf = MultinomialNB()
print(clf)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
