In [1]:
"""Build Baseline Location Classifier
Approaches:
- instance representation in form of 'bag' of words 
    - features: word frequencies, metadata 
    - exclude rare words in data set (used by less than 3 users )
- gramatical structure with NLP
- model instances in terms of authors instead of documents 


- Baseline Classifier: Naive Bayes Model ()

"""


"Build Baseline Location Classifier\nApproaches:\n- instance representation in form of 'bag' of words \n    - features: word frequencies, metadata \n    - exclude rare words in data set (used by less than 3 users )\n- gramatical structure with NLP\n- model instances in terms of authors instead of documents \n\n\n- Baseline Classifier: Naive Bayes Model ()\n\n"

In [166]:
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from os import path, getcwd
from collections import defaultdict
import preprocessor as p
import nltk
from nltk.tokenize import word_tokenize 
import re, string
from validators import url
from info_gain.info_gain import info_gain_ratio
# nltk.download('english')
# nltk.download('punkt')

In [167]:
fdir = path.join(getcwd(), "2019S1-proj2-datah")

train_data = "train-raw.tsv"
test_data = "test-raw.tsv"
dev_data = "dev-raw.tsv"

train_fpath = path.join(fdir, train_data)

train = pd.read_csv(train_fpath, sep="\t", index_col="Instance_ID")
train.head(50)

Unnamed: 0_level_0,Location,Text
Instance_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Melbourne,"\ud83c\udf17 @ Melbourne, Victoria, Australia ..."
2,Melbourne,"@theage Of course it costs more, minimum stand..."
3,Brisbane,Hope people make just as much noise as they di...
4,Perth,Pouring the perfect Prosecco \ud83e\udd42\ud83...
5,Perth,"$LNY losing traction at 0.014, see this retrac..."
6,Melbourne,\u0e44\u0e21\u0e48\u0e44\u0e2b\u0e27\u0e41\u0e...
7,Melbourne,@ashleighjayy_ Me @ this bitch https://t.co/8J...
8,Melbourne,@AnaOLFan I \u2764\ufe0f you - I could never b...
9,Sydney,@ihatejoelkim Welcome to Australia! Hoping you...
10,Brisbane,Mea evolve conference @markbouris session. I L...


In [163]:
inputs = ['Text']
output = 'Location'
x = train[inputs]
y = train[output]

In [164]:
class GeoTagger:
    def __init__(self, classifier="MultinomialNB", feature_selection="IGR"):
        self.exclude = set()
        self.classifier = classifier # MultinomialNB, SVM
        self.stopwords = set(stopwords.words('english'))
        
    def train(self, x, y):
        self.classes = y.unique()
        x = self.preprocess(x, train=True)
        return x
    
    def test(self, x, y):
#         self.preprocess(x, y)
        pass
    
    def predict(self, x):
#         self.preprocess(x, y)
        pass
    
    def evaluate(self, x, y):
#         y_pred = self.predict(y)
#         cross validation
        pass
    
    def preprocess(self, x, train=False):
        """
         - Filter rare words (urls, typos rare names, punctuation symbols)
         - calculate word frequencies 
         - metadata
        """
#         if train:
#             feature_weight(x)
#             feature_selection(x)
#       self.class_feature_sets = {label: defaultdict() for label in self.classes}
        x = self.filter(x)
        return x
    
    def filter(self, x):
        """
        https://towardsdatascience.com/extracting-twitter-data-pre-processing-and-sentiment-analysis-using-python-3-0-7192bd8b47cf
        """
        x['Text'] = x['Text'].apply(self.filter_tweet)
        return x        
        
    def feature_selection(self, x):
        """
        (1) Information Gain Ratio (IGR) - across all states S, is 
            defined as the ratio between its information gain value IG, 
            which measures the decrease in class entropy H that w brings,
            and its intrinsic entropy IV, which measures the entropy of 
            the presence versus the absence of that word
            
        (2) Word Locality Heuristic (WLH) - promotes words primarily 
            associated with one location. measure the probability of 
            a word occurring in a state, divided by its probability to 
            appear in any state. Then, for a given word w, we define the 
            WLH as the maximum such probability across all the states S
        """
        if self.feature_selection == "IGR":
#             return information_gain_ratio(x)
        elif self.feature_selection == "WLH":
            return word_locality_weight(x)
        else:
            print("Invalid Feature Selection method: {}. Choose one of ('IGR', 'WLH')".format(self.feature_selection))
        
    def word_locality_weight(self, x):
        """
        calculate frequencies of data 
        Measure frequency and divide by sum of freqencies of all words
        """
    
    def information_gain_ratio(self, x):
#         return info_gain_ratio
        pass
        
    def filter_tweet(self, tweet):
        filtered = [self.filter_word(w) for w in tweet.split() if w not in self.stopwords]
        return ' '.join(filtered)
        
    def filter_word(self, word):
        # extract keywords from hashtag 
        if self._is_hyperlink(word):
            return ''
        elif self._is_hashtag(word):
            word = self._process_hashtag(word)
        # potentially cross-reference individuals mentioned? or discard
        elif self._is_mention(word):
            word = self._process_mention(word)
        # remove ascii characters 
        else:
            word = re.sub(r'[^\w\s]','', word)
            
        return word.lower()
                
    def _is_hashtag(self, word):
        if len(word) == 0:
            return False
        return word[0] == "#"
    
    def _is_mention(self, word):
        if len(word) == 0:
            return False
        return word[0] == "@"
    
    def _is_hyperlink(self, word):
        return url(word)
    
    def _process_hashtag(self, word):
        return word
    
    def _process_mention(self, word):
        return word
    

In [165]:
gt = GeoTagger()
x = gt.train(x, y)
print(x['Text'])

Instance_ID
1                ud83cudf17 @ melbourne victoria australia 
2         @theage of course costs more minimum standards...
3         hope people make much noise tpj shot  thats de...
4         pouring perfect prosecco ud83eudd42ud83cudf7e ...
5         lny losing traction 0014 see retracing back 00...
6         u0e44u0e21u0e48u0e44u0e2bu0e27u0e41u0e25u0e49u...
7                                @ashleighjayy_ me @ bitch 
8         @anaolfan i u2764ufe0f  i could never offended xx
9         @ihatejoelkim welcome australia hoping wonderf...
10        mea evolve conference @markbouris session i le...
11                               first taste winter autumn 
12        u0643u064au0641 u064au0634u0648u0641 u0627u063...
13        #selfiesunday #lost #tabby amp whitecaramel #c...
14                 @artsdesire give credit looks incredible
15        @pringster78 @chadderbox2018 i kind opposite b...
16                @gramercypark itu2019s great isnu2019t it
17                          

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
