In [380]:
import pandas as pd
import numpy as np
import re, string
from os import path, getcwd
from collections import defaultdict
from datetime import datetime

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import chi2
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import chi2

from sklearn import model_selection, naive_bayes, svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier, GradientBoostingClassifier, BaggingClassifier, RandomForestClassifier

from sklearn.semi_supervised import LabelSpreading
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold, KFold, cross_val_predict

import validators
import nltk
from info_gain.info_gain import info_gain_ratio
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression


In [200]:
import csv
fdir = path.join(getcwd(), "2019S1-proj2-datah")

train_data = "train-raw.tsv"
test_data = "test-raw.tsv"
dev_data = "dev-raw.tsv"

train_fpath = path.join(fdir, train_data)
test_fpath = path.join(fdir, test_data)
dev_fpath = path.join(fdir, dev_data)

train = pd.read_csv(train_fpath, encoding="utf_8", delimiter="\t", index_col="Instance_ID",  quoting=csv.QUOTE_NONE, error_bad_lines=False)
test = pd.read_csv(test_fpath, encoding="utf_8", delimiter="\t" , index_col="Instance_ID", quoting=csv.QUOTE_NONE, error_bad_lines=False)
dev = pd.read_csv(dev_fpath, encoding="utf_8", delimiter="\t" , index_col="Instance_ID")

In [201]:
inputs = 'Text'
output = 'Location'
x_train = train[inputs]
y_train = train[output]
x_test = test[inputs]
y_test = test[output]
x_dev = dev[inputs]
y_dev = dev[output]

In [251]:
class GeoTagger:
    _FEATURE_SELECTION = ["baseline_10", "baseline_50", "baseline_100", "info_gain_ratio", "word_locality_heuristic", "tf_idf"]
    _ENSEMBLE_STRATEGY = ["simple_voting", "meta_classification", "bagging", "random_forest", "boosting"]
    _CLASSIFIERS = ["Zero-R", "One-R", "Decision-Tree", "MultinomialNB", "LinearSVM", "SemiSupervised"]
    _EVALUATION_METRIC = ["accuracy", "precision_recall_f-score_with_macro", "precision_recall_f-score_with_micro"]
    
    def __init__(self, inputs, target, classifier_set=["MultinomialNB"], ensemble_strategy="simple_voting", feature_selection_method="baseline_100", seed=500, combine_classifiers=False, n_features=400):
        self.n_features = n_features
        self.inputs = inputs
        self.target = target
        self.classifier_set = classifier_set
        self.ensemble_strategy = ensemble_strategy
        self.feature_selection_method = feature_selection_method
        self.seed = seed
        np.random.seed(seed)
        self.stemmer = SnowballStemmer('english')
        self.stop_words = set(stopwords.words('english'))
        self.combine_classifiers = combine_classifiers
        self.classifier_set = self._generate_classifier_set(classifier_set)
        self.combined_classifier = None if not self.combine_classifiers else self.generate_ensemble_classifier()

    def train(self, X, y):
        """
        trains a classifier given the training data and their corresponding class labels
        """
        self.classes = y.unique()
        X = self.preprocess(X, y, train=True)
        
        if self.combine_classifiers:
            self.combined_classifier.fit(X, y)
        else:
            for classifier in self.classifier_set.values():
                classifier.fit(X, y)

    def predict(self, X):
        """
        predicts a set of classifiers given some development data
        """
        X = self.preprocess(X)
        print(X)
        
        if self.combine_classifiers:
            return self.combined_classifier.predict(X)
        else:
            y_set = pd.DataFrame()

            for name, classifier in self.classifier_set.items():
                classifier_prediction = classifier.predict(X)
                y_set[name] = classifier_prediction
                
        return y_set
        
    
    def evaluate(self, ybar, y):
        """
        evaluates a class' predictions given the correct class labels and an evaluation metric
        """
#         if not metric in GeoTagger._EVALUATION_METRIC:
#             print("Invalid Evaluation Metric: {}. Choose one of \
#             ({})".format(metric, ", ".join(GeoTagger._EVALUATION_METRIC)))
#             return
            
        score_set = defaultdict()
        
        if self.combine_classifiers:
            classifiers = [self.ensemble_strategy, ]
            ybar = pd.DataFrame(ybar, columns=classifiers, index = y.index)
        else:
            classifiers = self.classifier_set

        for name, y_pred in ybar.items():
            accuracy = accuracy_score(y, y_pred)
            score_set[name] = accuracy 
#             report = classification_report(ybar, y, self.target)
#             confusion = metrics.confusion_matrix(y_test, y_pred_class)

        return score_set
    
    def cross_validation(self, X, y, metric):
        score_set = defaultdict()
        
        X = self.preprocess(X)

        if self.combine_classifiers:
            classifiers = [self.ensemble_strategy, ]
            ybar = pd.DataFrame(ybar, columns=classifiers, index = y.index)
        else:
            classifiers = self.classifier_set

        for classifier in classifiers:
            score_set[metric] = cross_validate(classifier, X, y, cv=10)
        return score_set
    
    
    def preprocess(self, X, y=None, train=False):
        """
         - Filter rare words (urls, typos rare names, punctuation symbols)
         - calculate word frequencies 
         - metadata
        """
        X = self.filter(X)

        if train:
            self.feature_selection(X, y)
        
        X = self.bag_of_words(X)
            
        return X 
    
    def bag_of_words(self, X):
        _x = pd.DataFrame(
            [[(word in text) for word in sorted(list(self.features))] for text in X.values],
            index=X.index, 
            columns=self.features,
            dtype=np.uint8
        )        
        return _x
    
    def generate_ensemble_classifier(self):
        if not self.combine_classifiers:
            return None
        
        if not self.ensemble_strategy in GeoTagger._ENSEMBLE_STRATEGY:
            print("Invalid Ensemble Strategy Metric: {}. Choose one of \
            ({})".format(metric, ", ".join(GeoTagger._ENSEMBLE_STRATEGY)))
            return None
        
        if self.ensemble_strategy == "simple_voting":
            combined_classifier = VotingClassifier(self.classifier_set.items(), 'hard')
        elif self.ensemble_strategy == "meta_classification":
            combined_classifier = MetaClassifier(self.classifier_set.items(), self.seed)
        elif self.ensemble_strategy == "bagging":
            base_classifier = DecisionTreeClassifier(max_features=None, max_leaf_nodes=999)
            combined_classifier = BaggingClassifier(base_estimator=base_classifier, max_features=self.n_features, random_state=self.seed)
        elif self.ensemble_strategy == "random_forest":
            combined_classifier = RandomForestClassifier()
        elif self.ensemble_strategy == "boosting":
            combined_classifier = GradientBoostingClassifier()
        
        return combined_classifier
        
    
    def _generate_classifier_set(self, classifiers):
        classifier_set = defaultdict()
        
        for classifier in classifiers:
            if not classifier in GeoTagger._CLASSIFIERS:
                print("Invalid Classifier: {}. Choose one of \
                ({})".format(classifier, ", ".join(GeoTagger._CLASSIFIERS)))
                continue
                
            if classifier == "Zero-R":
                classifier_set[classifier] = DummyClassifier(strategy='most_frequent', random_state=self.seed)
            elif classifier == "One-R":
                classifier_set[classifier] = DecisionTreeClassifier(max_depth=1, criterion="entropy", random_state=self.seed)
            elif classifier == "Decision-Tree":
                classifier_set[classifier] = DecisionTreeClassifier(max_depth=None, criterion="entropy", random_state=self.seed)
            elif classifier == "MultinomialNB":
                classifier_set[classifier] = MultinomialNB()
            elif classifier == "LinearSVM":
                classifier_set[classifier] = svm.LinearSVC(random_state=self.seed)
            elif classifier == "SemiSupervised":
                classifier_set[classifier] = LabelSpreading(kernel="knn", n_neighbors=7, alpha=0.2)
        return classifier_set
                
    def feature_selection(self, X, y):
        """
        (1) Information Gain Ratio (IGR) - across all states S, is 
            defined as the ratio between its information gain value IG, 
            which measures the decrease in class entropy H that w brings,
            and its intrinsic entropy IV, which measures the entropy of 
            the presence versus the absence of that word
            
        (2) Word Locality Heuristic (WLH) - promotes words primarily 
            associated with one location. measure the probability of 
            a word occurring in a state, divided by its probability to 
            appear in any state. Then, for a given word w, we define the 
            WLH as the maximum such probability across all the states S
        """
        if self.feature_selection_method not in GeoTagger._FEATURE_SELECTION:
            print("Invalid Feature Selection method: {}. Choose one of \
            ({})".format(self.feature_selection_method, ", ".join(GeoTagger._FEATURE_SELECTION)))
            return 
        
        if self.feature_selection_method == "baseline_10":
            self.baseline_heuristic(X, y, "10")
        elif self.feature_selection_method == "baseline_50":
            self.baseline_heuristic(X, y, "50")
        elif self.feature_selection_method == "baseline_100":
            self.baseline_heuristic(X, y, "100")
        elif self.feature_selection_method == "info_gain_ratio":
#             self.information_gain_ratio(x)
            return
        elif self.feature_selection_method == "word_locality_heuristic":
            self.word_locality_weight(X, y)
        elif self.feature_selection_method == "tf_idf":
            self.term_frequency_inverse_city_frequency(X, y)

    def baseline_heuristic(self, X, y, top_n):
        feature_fpath = path.join(fdir, "train-top" + top_n + ".csv")
        
        if not path.exists(feature_fpath):
            print("Baseline Heuristic path {} does not exist".format(feature_fpath))
            return
        
        features = open(feature_fpath).readline()
        features = features.split(",")
        features.remove("Instance_ID")
        features.remove("Location\n")
        self.features = set(features)

    def word_locality_weight(self, X, y):
        """
        calculate frequencies of data 
        Measure frequency and divide by sum of freqencies of all words
        """
        locations = self.classes + ['Total',]
        word_locality_features = {label: defaultdict() for label in locations}
        word_locality_weight = {label: defaultdict() for label in self.classes}
        
        for x_i, y_i in zip(X.index, y.index):
            text = X.loc[x_i].split()
            for word in text:
                word_locality_features[y.loc[y_i]][word] += 1
                word_locality_features[y.loc[y_i]]['Total'] += 1
                word_locality_features['Total'][word] += 1
                word_locality_features['Total']['Total'] += 1
        
        for label in self.classes:
            for word in word_locality_features[label].keys():
                cond_word_prob = word_locality_features[label][word] / word_locality_features[label]['Total']
                word_prob = word_locality_features['Total'][word] / word_locality_features['Total']['Total']
                word_locality_weight[label][word] = cond_word_prob / word_prob
        
        features = []
        for location in self.classes:
            n_location_features = int(self.n_features / len(self.classes))
            features.append(sorted(word_locality_weight[location].items(), key=lambda kv: kv[1], reverse=True)[:n_location_features]) 
        
        self.features = set([feature[0] for feature in features])
                
    
    def information_gain_ratio(self, x):
#         return info_gain_ratio
        pass
    
    def term_frequency_inverse_city_frequency(self, X, y):
        vectorizer = TfidfVectorizer(stop_words=self.stop_words, max_features=self.n_features)

        location_word_list = {label: '' for label in self.classes}

        for x_i, y_i in zip(X.index, y.index):
            location_word_list[y.loc[y_i]] += X.loc[x_i] + " "
        
        labels = location_word_list.keys()
        corpus = location_word_list.values()
        vectorizer.fit(corpus, labels)
        self.features = set(vectorizer.get_feature_names())
        
    def filter(self, x):
        return x.apply(self.filter_text)
    
    def filter_text(self, text):
        return ' '.join(self.filter_word(w) for w in text.split())
        
    def filter_word(self, word):
        word = word.lower()
        # extract keywords from hashtag 

        if self._is_hyperlink(word):
            return ''
        elif self._is_hashtag(word):
            word = self._process_hashtag(word)
        # potentially cross-reference individuals mentioned? or discard
        elif self._is_mention(word):
            word = self._process_mention(word)
        # remove ascii characters 
        else:
            word = self._ascii_to_unicode(word)
            
            word = re.sub(r'[^\w\s]',' ', word)
#             word = self._word_stem(word)
            if word in self.stop_words:
                return ''
        return word
                
    def _is_hashtag(self, word):
        if len(word) == 0:
            return False
        return word[0] == "#"
    
    def _is_mention(self, word):
        if len(word) == 0:
            return False
        return word[0] == "@"
    
    def _is_hyperlink(self, word):
        return validators.url(word)
    
    def _process_hashtag(self, word):
        return word[1:]
    
    def _process_mention(self, word):
        return word[1:]
    
    def _ascii_to_unicode(self, word):
        for uescape in re.findall(r'(\\u[0-9a-f]{4})', word):
            try:
                word = word.replace(uescape, '')  
            except (UnicodeDecodeError, Exception):
                print("Failed to decode: {}".format(uescape))
        return word
    
    def _word_stem(self, word):
        return self.stemmer.stem(word)

In [378]:
class MetaClassifier:
    def __init__(self, estimators, random_state):
        self.estimators = estimators
        self.encoder = LabelEncoder()
        self.random_state = random_state
        self.meta_classifier = LogisticRegression(random_state=self.random_state, solver='lbfgs',multi_class='multinomial')
        np.random.seed(random_state)
    
    def fit(self, X, y):
        #y_set = pd.DataFrame(index=X.index, columns=[item[0] for item in self.estimators])
        
        MNB_meta = []
        SVM_meta = []
        DT_meta = []
        skf = KFold(10, False)
        num = 0
        for train, test in skf.split(X, y):
            for name, classifier in self.estimators:
                y_val = y[train].tolist()
                if name == "Decision-Tree":
                    print("DT")
                    print(train)
                    clf = DecisionTreeClassifier(max_depth=None, criterion="entropy", random_state=self.random_state)
                    clf.fit(X.iloc[train], y_val)
                    DT_meta+= clf.predict(X.iloc[test]).tolist()
                elif name == "MultinomialNB":
                    print("MNB")
                    print(train)
                    clf = MultinomialNB()
                    clf.fit(X.iloc[train], y_val)
                    MNB_meta+= clf.predict(X.iloc[test]).tolist()
                elif name == "LinearSVM":
                    print("LinearSVM")
                    print(train)
                    clf = svm.LinearSVC(random_state=self.random_state)
                    clf.fit(X.iloc[train], y_val)
                    SVM_meta+= clf.predict(X.iloc[test]).tolist()
                elif name == "One-R":
                    clf = DecisionTreeClassifier(max_depth=1, criterion="entropy", random_state=self.random_state)
                    clf.fit(X.iloc[train], y_val)
                    SVM_meta+= clf.predict(X.iloc[test]).tolist()
            num+=1
            print(num)

        #make dataframe m1,m2,m3 on metas
        d = {}
        for name, classifier in self.estimators:
            if name=="Decision-Tree":
                d["Decision-Tree-Metas"] = DT_meta
            elif name=="MultinomialNB":
                d["MultinomialNB-Metas"] = MNB_meta
            elif name=="LinearSVM":
                d["LinearSVM-Metas"] = SVM_meta
        train_meta = pd.DataFrame(d)
#         print(train_meta)
#         print(y)
#         print(len(train_meta))
#         print(len(y))
#         print(train_meta.apply(LabelEncoder().fit_transform))
        encoded_train = pd.DataFrame(self.encoder.fit_transform(train_meta.values.ravel()).reshape(train_meta.shape))
        self.meta_classifier.fit(encoded_train, y)
        
        self.x_train = X
        self.y_train = y
        
    def predict(self, X):
        d = {} 
        #train entire X on all relevant classifiers
        for name, classifier in self.estimators:
            classifier.fit(self.x_train, self.y_train)
            d[name] = classifier.predict(X)
        test_meta = pd.DataFrame(d)
        print(test_meta)
        encoded_test = pd.DataFrame(self.encoder.fit_transform(test_meta.values.ravel()).reshape(test_meta.shape))
        return  self.meta_classifier.predict(encoded_test)

In [381]:
classifier_set = ["LinearSVM", "Decision-Tree", "MultinomialNB"]
print(classifier_set)
voting_strategy = "meta_classification"
combine_classifiers = True

gt = GeoTagger(
    inputs = inputs,
    target = output,
    classifier_set = classifier_set,
    ensemble_strategy = voting_strategy,
    feature_selection_method = "tf_idf",
    n_features = 400,
    seed = 500,
    combine_classifiers = combine_classifiers
)

start = datetime.now()
gt.train(x_train, y_train)
end = datetime.now()
print("Time taken: {}".format(end - start))

gt2 = GeoTagger(
    inputs = inputs,
    target = output,
    classifier_set = classifier_set,
    ensemble_strategy = voting_strategy,
    feature_selection_method = "baseline_10",
    n_features = 34,
    seed = 500,
    combine_classifiers = combine_classifiers
)

start = datetime.now()
gt2.train(x_train, y_train)
end = datetime.now()
print("Time taken: {}".format(end - start))

gt3 = GeoTagger(
    inputs = inputs,
    target = output,
    classifier_set = classifier_set,
    ensemble_strategy = voting_strategy,
    feature_selection_method = "baseline_50",
    n_features = 185,
    seed = 500,
    combine_classifiers = combine_classifiers
)

start = datetime.now()
gt3.train(x_train, y_train)
end = datetime.now()
print("Time taken: {}".format(end - start))

gt4 = GeoTagger(
    inputs = inputs,
    target = output,
    classifier_set = classifier_set,
    ensemble_strategy = voting_strategy,
    feature_selection_method = "baseline_100",
    n_features = 366,
    seed = 500,
     combine_classifiers = combine_classifiers
)

start = datetime.now()
gt4.train(x_train, y_train)
end = datetime.now()
print("Time taken: {}".format(end - start))



# MNB = GeoTagger(
#     inputs = inputs,
#     target = output,
#     classifier_set = ["MultinomialNB"],
#     ensemble_strategy = voting_strategy,
#     feature_selection_method = "baseline_100",
#     n_features = 366,
#     seed = 500,
#     combine_classifiers = combine_classifiers
# )

# start = datetime.now()
# MNB.train(x_train, y_train)
# end = datetime.now()
# print("Time taken: {}".format(end - start))

# LSVM = GeoTagger(
#     inputs = inputs,
#     target = output,
#     classifier_set = ["LinearSVM"],
#     ensemble_strategy = voting_strategy,
#     feature_selection_method = "baseline_100",
#     n_features = 366,
#     seed = 500,
#     combine_classifiers = False
# )

# start = datetime.now()
# LSVM.train(x_train, y_train)
# end = datetime.now()
# print("Time taken: {}".format(end - start))


# DT = GeoTagger(
#     inputs = inputs,
#     target = output,
#     classifier_set = ["Decision-Tree"],
#     ensemble_strategy = voting_strategy,
#     feature_selection_method = "baseline_100",
#     n_features = 366,
#     seed = 500,
#      combine_classifiers = False
# )

# start = datetime.now()
# DT.train(x_train, y_train)
# end = datetime.now()
# print("Time taken: {}".format(end - start))

['LinearSVM', 'Decision-Tree', 'MultinomialNB']
LinearSVM
[ 10337  10338  10339 ... 103361 103362 103363]
DT
[ 10337  10338  10339 ... 103361 103362 103363]
MNB
[ 10337  10338  10339 ... 103361 103362 103363]
1
LinearSVM
[     0      1      2 ... 103361 103362 103363]


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self.loc[key]


DT
[     0      1      2 ... 103361 103362 103363]
MNB
[     0      1      2 ... 103361 103362 103363]
2
LinearSVM
[     0      1      2 ... 103361 103362 103363]
DT
[     0      1      2 ... 103361 103362 103363]
MNB
[     0      1      2 ... 103361 103362 103363]
3
LinearSVM
[     0      1      2 ... 103361 103362 103363]
DT
[     0      1      2 ... 103361 103362 103363]
MNB
[     0      1      2 ... 103361 103362 103363]
4
LinearSVM
[     0      1      2 ... 103361 103362 103363]
DT
[     0      1      2 ... 103361 103362 103363]
MNB
[     0      1      2 ... 103361 103362 103363]
5
LinearSVM
[     0      1      2 ... 103361 103362 103363]
DT
[     0      1      2 ... 103361 103362 103363]
MNB
[     0      1      2 ... 103361 103362 103363]
6
LinearSVM
[     0      1      2 ... 103361 103362 103363]
DT
[     0      1      2 ... 103361 103362 103363]
MNB
[     0      1      2 ... 103361 103362 103363]
7
LinearSVM
[     0      1      2 ... 103361 103362 103363]
DT
[     0      1     

In [385]:
#predict the class labels of a set of test data
ybars = gt.predict(x_dev)
ybars2 = gt2.predict(x_dev)
ybars3 = gt3.predict(x_dev)
ybars4 = gt4.predict(x_dev)

# MNBbar = MNB.predict(x_dev)
# LSVMbar = LSVM.predict(x_dev)
# DTbar = DT.predict(x_dev)

             club  guess  guy  soon  getting  used  fuck  20  round  far  \
Instance_ID                                                                
21              0      0    0     0        0     0     0   0      0    0   
22              0      0    0     0        0     0     0   0      0    0   
23              0      0    0     0        0     0     0   0      0    0   
24              0      0    0     0        0     0     0   0      0    0   
25              0      0    0     0        0     0     0   0      0    0   
26              0      0    0     0        1     0     0   0      0    0   
27              1      0    0     0        0     0     0   0      0    0   
28              0      0    0     0        0     0     0   0      0    0   
29              0      0    0     0        0     0     0   0      0    0   
210             0      0    0     0        0     0     0   0      0    0   
211             0      0    0     0        0     0     0   0      0    0   
212         

      Decision-Tree  LinearSVM MultinomialNB
0            Sydney     Sydney      Brisbane
1            Sydney     Sydney         Perth
2          Brisbane   Brisbane      Brisbane
3             Perth     Sydney        Sydney
4          Brisbane   Brisbane      Brisbane
5            Sydney  Melbourne     Melbourne
6         Melbourne  Melbourne     Melbourne
7            Sydney     Sydney      Brisbane
8            Sydney   Brisbane      Brisbane
9          Brisbane  Melbourne        Sydney
10         Brisbane   Brisbane         Perth
11         Brisbane   Brisbane        Sydney
12            Perth     Sydney        Sydney
13         Brisbane   Brisbane      Brisbane
14           Sydney  Melbourne     Melbourne
15        Melbourne   Brisbane      Brisbane
16           Sydney     Sydney        Sydney
17            Perth     Sydney        Sydney
18        Melbourne   Brisbane        Sydney
19            Perth      Perth         Perth
20           Sydney   Brisbane      Brisbane
21        

      Decision-Tree  LinearSVM MultinomialNB
0         Melbourne  Melbourne      Brisbane
1         Melbourne  Melbourne      Brisbane
2         Melbourne  Melbourne      Brisbane
3         Melbourne  Melbourne      Brisbane
4          Brisbane   Brisbane      Brisbane
5         Melbourne  Melbourne      Brisbane
6          Brisbane     Sydney     Melbourne
7         Melbourne  Melbourne      Brisbane
8         Melbourne  Melbourne      Brisbane
9         Melbourne  Melbourne      Brisbane
10        Melbourne  Melbourne      Brisbane
11        Melbourne  Melbourne      Brisbane
12        Melbourne  Melbourne      Brisbane
13        Melbourne  Melbourne      Brisbane
14        Melbourne  Melbourne      Brisbane
15        Melbourne  Melbourne      Brisbane
16        Melbourne  Melbourne      Brisbane
17        Melbourne  Melbourne      Brisbane
18        Melbourne  Melbourne      Brisbane
19        Melbourne  Melbourne      Brisbane
20        Melbourne  Melbourne      Brisbane
21        

      Decision-Tree  LinearSVM MultinomialNB
0            Sydney     Sydney      Brisbane
1            Sydney     Sydney      Brisbane
2         Melbourne  Melbourne      Brisbane
3            Sydney     Sydney      Brisbane
4          Brisbane   Brisbane      Brisbane
5            Sydney     Sydney      Brisbane
6          Brisbane     Sydney     Melbourne
7            Sydney     Sydney      Brisbane
8         Melbourne  Melbourne        Sydney
9         Melbourne  Melbourne      Brisbane
10           Sydney     Sydney      Brisbane
11           Sydney     Sydney        Sydney
12           Sydney  Melbourne     Melbourne
13        Melbourne  Melbourne        Sydney
14         Brisbane      Perth     Melbourne
15        Melbourne  Melbourne      Brisbane
16           Sydney  Melbourne     Melbourne
17           Sydney     Sydney      Brisbane
18           Sydney  Melbourne     Melbourne
19           Sydney     Sydney      Brisbane
20           Sydney     Sydney      Brisbane
21        

      Decision-Tree  LinearSVM MultinomialNB
0            Sydney     Sydney        Sydney
1            Sydney     Sydney        Sydney
2         Melbourne  Melbourne      Brisbane
3            Sydney     Sydney        Sydney
4          Brisbane   Brisbane      Brisbane
5          Brisbane   Brisbane      Brisbane
6         Melbourne     Sydney     Melbourne
7            Sydney     Sydney        Sydney
8            Sydney   Brisbane     Melbourne
9         Melbourne  Melbourne      Brisbane
10           Sydney     Sydney        Sydney
11        Melbourne  Melbourne     Melbourne
12         Brisbane  Melbourne        Sydney
13        Melbourne   Brisbane        Sydney
14         Brisbane      Perth     Melbourne
15        Melbourne  Melbourne      Brisbane
16           Sydney  Melbourne        Sydney
17         Brisbane  Melbourne      Brisbane
18           Sydney     Sydney        Sydney
19         Brisbane      Perth         Perth
20         Brisbane   Brisbane      Brisbane
21        

In [377]:
M = MNBbar["MultinomialNB"].tolist()
L = LSVMbar["LinearSVM"].tolist()
D = DTbar["Decision-Tree"].tolist()

count = 0
for i in range(len(M)):
    if L[i]==D[i]:
        count+=1
print(count, len(M))

18533 37300


predicts 7076 correct out of 18553 instances that are similar

In [386]:
# models = [gt, gt2, gt3, gt4]
# predictions = [ybars, ybars2, ybars3, ybars4]
# evals = ["accuracy", "precision_recall_f-score_with_micro"]

# for model, prediction in zip(models, predictions):
#     for method in evals:
#         report, confusion = model.evaluate(prediction, y_dev, method)
#         print("{}: {}".format(method, report, ))
accScores = gt.evaluate(ybars, y_dev)
# otherScores = gt.evaluate(ybars, y_dev, "precision_recall_f-score_with_micro")
accScores2 = gt2.evaluate(ybars2, y_dev)
# # otherScores2 = gt2.evaluate(ybars2, y_dev, "precision_recall_f-score_with_micro")
accScores3 = gt3.evaluate(ybars3, y_dev)
# # otherScores3 = gt3.evaluate(ybars3, y_dev, "precision_recall_f-score_with_micro")
accScores4 = gt4.evaluate(ybars4, y_dev)
# otherScores4 = gt4.evaluate(ybars4, y_dev, "precision_recall_f-score_with_micro")

In [387]:
print(accScores, accScores2, accScores3, accScores4)

defaultdict(None, {'meta_classification': 0.27152815013404824}) defaultdict(None, {'meta_classification': 0.252627345844504}) defaultdict(None, {'meta_classification': 0.23689008042895443}) defaultdict(None, {'meta_classification': 0.2511260053619303})


In [388]:
print(np.unique(ybars, return_counts=True))
print(np.unique(ybars2, return_counts=True))
print(np.unique(ybars3, return_counts=True))
print(np.unique(ybars4, return_counts=True))

(array(['Brisbane', 'Melbourne', 'Perth', 'Sydney'], dtype=object), array([16136,   811, 14873,  5480], dtype=int64))
(array(['Brisbane', 'Perth', 'Sydney'], dtype=object), array([ 1222,  7308, 28770], dtype=int64))
(array(['Brisbane', 'Melbourne', 'Perth', 'Sydney'], dtype=object), array([16066,  9845,  1359, 10030], dtype=int64))
(array(['Brisbane', 'Melbourne', 'Perth', 'Sydney'], dtype=object), array([ 9291,  1437, 16978,  9594], dtype=int64))


### Some Resources
- https://medium.com/@bedigunjit/simple-guide-to-text-classification-nlp-using-svm-and-naive-bayes-with-python-421db3a72d34
- http://blog.kaggle.com/2016/12/27/a-kagglers-guide-to-model-stacking-in-practice/

In [138]:
# testPreds = gt.predict(x_test)
testPreds4 = gt4.predict(x_test)

             chrismurphys  melissa  sextoys  credits  qldpremierrugby  \
Instance_ID                                                             
31                      0        0        0        0                0   
32                      0        0        0        0                0   
33                      0        0        0        0                0   
34                      0        0        0        0                0   
35                      0        0        0        0                0   
36                      0        0        0        0                0   
37                      0        0        0        0                0   
38                      0        0        0        0                0   
39                      0        0        0        0                0   
310                     0        0        0        0                0   
311                     0        0        0        0                0   
312                     0        0        0        

In [139]:
# from pandas import Dataframe
testBagging100 = test.copy()
testBagging100[output] = testPreds4
testOutput = pd.DataFrame(testBagging100[output])
testOutput.columns = ['Class']
export_csv = testOutput.to_csv ('testBagging100.csv',index = True, header=True)

In [123]:
len(testPreds4)

108148

In [124]:
len(testOutput)

108148

In [140]:
ybars

array(['Perth', 'Sydney', 'Brisbane', ..., 'Brisbane', 'Melbourne',
       'Perth'], dtype=object)

In [None]:
skf = KFold(10, False)

tt = {}
num = 0
for train, test in skf.split(x_train, y_train):
    tt[num] = y_train.index[train]
    print(test)
    num+=1

In [272]:
clf = MultinomialNB()
cla = MultinomialNB()

In [303]:
x = [[1],[2],[3]]
y = [3,2,1]
clf.fit(x, y)
clf.predict(x)

array([1, 1, 1])

In [325]:
tt[2].tolist()
# y_train

[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,
 185

In [209]:
count = 0
for i in range(num):
    count += len(tt[i])
count

103364

In [210]:
len(x_train)

103364

In [285]:
tt[2]

Instance_ID
1         "\ud83c\udf17 @ Melbourne, Victoria, Australia...
2         "@theage Of course it costs more, minimum stan...
3         "Hope people make just as much noise as they d...
4         "Pouring the perfect Prosecco \ud83e\udd42\ud8...
5         "$LNY losing traction at 0.014, see this retra...
6         "\u0e44\u0e21\u0e48\u0e44\u0e2b\u0e27\u0e41\u0...
7         "@ashleighjayy_ Me @ this bitch https://t.co/8...
8         "@AnaOLFan I \u2764\ufe0f you - I could never ...
9         "@ihatejoelkim Welcome to Australia! Hoping yo...
10        "Mea evolve conference @markbouris session. I ...
11        "First taste of Winter in Autumn https://t.co/...
12        "\u0643\u064a\u0641 \u064a\u0634\u0648\u0641 \...
13        "#SelfieSunday #Lost #Tabby &amp; White/Carame...
14        "@artsdesire Give her credit, she looks incred...
15        "@Pringster78 @Chadderbox2018 I kind of do the...
16           "@gramercypark It\u2019s great isn\u2019t it?"
17        "https://t.co/RT37

In [235]:
x = [1,2,3]
y = [2,3,4]
myarray = np.asarray(x)
ymyarray = np.asarray(y)
np.concatenate((np.array(x), np.array(y)))

array([1, 2, 3, 2, 3, 4])

In [290]:
#alternative
#y = df['y_variable'].astype(int)
#labels_lst = labels.tolist()
#
#y_pred = cross_val_predict(MultinomialNB(), X, y, cv=2)

In [None]:
if 1==1==1:
    print("kek")