# Regression

## Preparation

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_recall_fscore_support
from sklearn.model_selection import train_test_split, ParameterGrid, cross_validate
import nltk
from nltk.tokenize import word_tokenize
from sklearn.utils import shuffle
from sklearn import decomposition
import pandas as pd
from copy import deepcopy
import os
import re
import numpy as np
import matplotlib.pyplot as plt
import json
import itertools
import random
import warnings
import string
from shutil import copyfile
warnings.filterwarnings("ignore", category=UserWarning)

from functions import *


In [2]:
global_shuffle_seed = 4
global_debug=True
global_override=True

In [3]:
df_data = pd.read_json("/Users/jiaying/Desktop/6506 DSPP/Data Assignment #2/data/classifier/data/data.json")
df_data = df_data[df_data['is_flood'].notna()]
data_true = query_dataframe(df_data, {'is_flood':True})
data_false = query_dataframe(df_data, {'is_flood':False})
print('Total:',len(df_data),'True:',len(data_true), 'False:',len(data_false))


Total: 1380 True: 663 False: 717


## Preprocess Data

In [4]:
custom_stop_words = set(['date', 'published'])
stop_words = set(nltk.corpus.stopwords.words('english'))
punctuations = set(string.punctuation)
all_stop_words = stop_words.union(punctuations, custom_stop_words)
def preprocess(x):
    x = re.sub('[^a-z\s]', ' ', x.lower())
    x = [w for w in x.split() if w not in all_stop_words and len(w)>3]
    return ' '.join(x)


In [5]:
df_data['org_text'] = df_data['text']
df_data['text'] = df_data['text'].apply(preprocess)


In [6]:
df_data.head()


Unnamed: 0,doc_id,filename,text,is_flood,is_bangladesh,flood_related,flood_climatechange,newspaper,flood_type,dates,...,event_damage-damage_info_other,event_damage-people_affected,event_damage-peopled_displaced,event_damage-homes_affected,event_damage-disease,event_damage-fatalities,event_dates,event_dates-date,event_dates-prev_date,org_text
0,ec583817-3c60-41ee-b856-65f0d9bd7772,dailySun_data_ec583817-3c60-41ee-b856-65f0d9bd...,tuesday july rise water level erosion madhumat...,True,,,,,,[],...,[],[],[],[],[],[],[],[],[],Date Published:2017-08-31 06:03:11+00:00 tuesd...
1,f4806621-b874-4f20-97fb-f7c1fa94f6bc,theDailyStar_data_f4806621-b874-4f20-97fb-f7c1...,flash floods triggered heavy rain hailstorms d...,True,1.0,1.0,0.0,daily_star,flash,[],...,[],[],[],[],[],[],[],[],[],Date Published:2016-05-05 00:00:00 Flash flood...
2,259d503d-f6b1-44b6-a866-8eff03799a07,prothomalo_data_259d503d-f6b1-44b6-a866-8eff03...,none flood situation worsened tuesday increasi...,True,,,,,,[],...,[],[],[],[],[],[],[],[],[],Date Published:None Flood situation worsened f...
3,,1989_34feabf9fb9bd7378a2a378bbc72e5c80ce50f43.txt,world learns geography quiz globally educated ...,False,0.0,0.0,0.0,ny_times,,[],...,[],[],[],[],[],[],[],[],[],AS THE WORLD LEARNS; Geography Quiz\n1989-04-0...
4,f4182e23-bcd2-4627-b5ee-6284ac4a2eac,dhakaTribune_data_f4182e23-bcd2-4627-b5ee-6284...,deceased shahadat tareque additional three peo...,False,1.0,0.0,0.0,dhaka_tribune,,[],...,[],[],[],[],[],[],[],[],[],Date Published:2019-06-14 00:00:00 \nThe ...


### Split Data

In [7]:
def make_data_ratio(df_data, test_size=None, train_size=None, shuffle_seed=4, debug=False, 
                    save_folder=None, load_folder=None, override=False, file_prefix=''):
    save_file, load_file=None, None
    if save_folder: save_file = os.path.join(save_folder,file_prefix+'data.json')
    if load_folder: load_file = os.path.join(load_folder,file_prefix+'data.json')
    
    if not override and load_file and os.path.isfile(load_file):
        if debug: print('loaded',load_file)
        js = json.load(open(load_file))
        train_df = pd.DataFrame(js['train'])
        test_df = pd.DataFrame(js['test'])
        return {'train':train_df, 'test':test_df}
    
    train_df, test_df = train_test_split(df_data, test_size=test_size, train_size=train_size, random_state=shuffle_seed, stratify=df_data['is_flood'])
    
    if debug: print('Data Loaded')

    if save_file:
        train_json = train_df.to_json(orient='records')
        test_json = test_df.to_json(orient='records')
        json.dump({'train':json.loads(train_json), 'test':json.loads(test_json)}, open(save_file,'w'), indent=2)
    return {'train':train_df, 'test':test_df}


In [8]:
save_data_folder = "/Users/jiaying/Desktop/6506 DSPP/Data Assignment #2/code/classifier"

result, clf_result = {}, {}
test_size = 0.2

if not os.path.isdir(save_data_folder): os.mkdir(save_data_folder)
debug=global_debug or False
override=global_override or False
data_split = make_data_ratio(df_data, test_size=test_size,
                               debug=debug, shuffle_seed=global_shuffle_seed, override=override)


Data Loaded


In [9]:
print('Train:',len(data_split['train']), '\t\tTest:',len(data_split['test']))
print('Train is_flood:',len(data_split['train'].loc[data_split['train']['is_flood']==True]), \
'\tTrain not is_flood:',len(data_split['train'].loc[data_split['train']['is_flood']==False]))
print('Test is_flood:',len(data_split['test'].loc[data_split['test']['is_flood']==True]), \
'\tTest not is_flood:',len(data_split['test'].loc[data_split['test']['is_flood']==False]))


Train: 1104 		Test: 276
Train is_flood: 530 	Train not is_flood: 574
Test is_flood: 133 	Test not is_flood: 143


## Regression Classifier

### Approach 1: Regular run

In [10]:
def make_data(vect_fit, ratio):
    train, test = ratio.get('train',None), ratio.get('test',None)
    if train is None or test is None: raise Exception('Train or Test data not found')
    all_X = list(train['text'])
    
    vect = vect_fit.fit(all_X)
    trainX, testX = vect.transform(list(train['text'])), vect.transform(list(test['text']))
    trainY, testY = [1 if t else 0 for t in train['is_flood']], [1 if t else 0 for t in test['is_flood']]
    return trainX, testX, trainY, testY, vect


In [11]:
def run_classifier(clf, trainX, testX, trainY, testY):
    clf_fit = clf.fit(trainX, trainY)
    clf_pred = clf_fit.predict(testX)
    clf_acc = accuracy_score(testY, clf_pred)
    return clf_fit, clf_pred, clf_acc


In [12]:
def get_method(main_d, name):
    if name not in main_d: raise Exception('Cannot find classifier/feature_extractor name in parameter dictionary')
    d = main_d[name]
    method = d.get('method',None)
    base_method = d.get('base_method',None)
    if method and base_method: raise Exception('Cannot have method and base method both.')
    if not method and not base_method: raise Exception('Unable to parse the method from classifier/feature_extractor')
    params = d.get('params',None)
    if method:
        if params: return method, params
        else: return method, None
    if base_method:
        prev_method, prev_params = get_method(main_d, base_method)
        if params:
            for k,v in params.items(): prev_params[k] = v
        return prev_method, prev_params

def make_method(main_d, name, override_params={}):
    method, params = get_method(main_d, name)[:]
    if override_params:
        for k,v in override_params.items(): params[k] = v
    if params: return method(**params)
    else: return method()


In [13]:
def run_grid(grid, data, feature_extract, classifiers, clf_result, result, 
             debug=False, override=False, save_folder=None, load_folder=None, file_prefix=''):
    save_clf_result = {}
    vectCache, classifierCache = {}, {}
    if load_folder:
        res_file = os.path.join(load_folder,file_prefix+'clf_result.json')
        clf_res_file = os.path.join(load_folder,file_prefix+'result.json')
        if os.path.isfile(res_file): clf_result=json.load(open(res_file))
        if os.path.isfile(clf_res_file): result=json.load(open(clf_res_file))
        if os.path.isfile(res_file) and os.path.isfile(clf_res_file) and debug: print('loaded result')
    
    if override:
        clf_result, result = {}, {}
        if debug: print('OVERRIDE')
    for g in list(grid):
        try:
            feature_name = g.get('feature_extract',None)
            clf_name = g.get('classifier', None)
            if not feature_name or not clf_name:
                raise Exception('Feature Extract and Classifier Name required')
            result_key = feature_name + '-' + clf_name
            if result.get(result_key): continue
            if debug: print('Feature:', feature_name, '  Clasifier:',clf_name, '  Key:',result_key)
            
            if feature_name in vectCache:
                (trainX, testX, trainY, testY, feature2) = vectCache[feature_name]
            else:
                feature = make_method(feature_extract, feature_name)
                trainX, testX, trainY, testY, feature2 = make_data(feature, data)
                vectCache[feature_name] = (trainX, testX, trainY, testY, feature2)

            clf = make_method(classifiers, clf_name)
            clf_fit, clf_pred, clf_acc = run_classifier(clf, trainX, testX, trainY, testY)
            
            result[result_key] = {
                'feature_extract': feature_name,
                'classifier': clf_name,
                'accuracy': clf_acc
            }
            
            clf_result[result_key] = {
                'feature_extract': feature_name,
                'classifier': clf_name,
                'clf': clf_fit,
                'feature': feature2,
                'predict': clf_pred
            }
            
            save_clf_result[result_key] = {
                'feature_extract': feature_name,
                'classifier': clf_name,
                'predict': clf_pred.tolist()
            }  
        except Exception as e:
            print('Error:',e)
            continue
    if save_folder:
        json.dump(save_clf_result, open(os.path.join(load_folder,file_prefix+'clf_result.json'),'w'), indent=2)
        json.dump(result, open(os.path.join(load_folder,file_prefix+'result.json'),'w'), indent=2)
    return clf_result, result


In [14]:
def run_grid_cross_validate(grid, data, feature_extract, classifiers, result, debug=False):
    for g in list(grid):
        feature_name = g.get('feature_extract',None)
        clf_name = g.get('classifier', None)
        if not feature_name or not clf_name:
            raise Exception('Feature Extract and Classifier Name required')
        result_key = feature_name + '-' + clf_name
        if result.get(result_key): continue
        if debug: print('Feature:', feature_name, '  Clasifier:',clf_name, '  Key:',result_key)

        feature = make_method(feature_extract, 'TFIDF')
        all_X = data['text']
        all_Y = data['is_flood']
        vect = feature.fit(all_X)
        x, y = vect.transform(list(all_X)), [1 if t else 0 for t in all_Y]
        clf = make_method(classifiers, clf_name)
        cv = cross_validate(clf, x, y, cv=5,
                      scoring=('accuracy', 'precision', 'recall', 'f1'))
        result[result_key] = cv
    return result

In [15]:
def parse_result(result, clf_result=None, data=None, accuracy_threshold=None, split_by='classifier'):
    keys = list(result.keys())
    temp_df = pd.DataFrame(list(result.values()))
    if clf_result is not None and data is not None:
        presicion, recall, f1, support = [], [], [], []
        actual = [1 if f else 0 for f in list(data['test']['is_flood'])]
        for method_name in keys:
            predict = clf_result[method_name]['predict']
            pre, rec, fsc, sup = precision_recall_fscore_support(actual, predict, average='binary')
            presicion.append(pre)
            recall.append(rec)
            f1.append(fsc)
            support.append(sup)
        temp_df['f1'] = f1
        temp_df['presicion'] = presicion
        temp_df['recall'] = recall
#     temp_df['keys'] = keys
    splt_val = list(set(list(temp_df[split_by])))
    for d in splt_val:
        if accuracy_threshold:
            new_df = temp_df.loc[temp_df[split_by]==d]
            new_df = new_df.loc[new_df['accuracy']>accuracy_threshold] \
                            .drop(split_by, axis=1) \
                            .sort_values(by='accuracy',ascending=False) \
                            .reset_index(drop=True)
        else:
            new_df = temp_df.loc[temp_df[split_by]==d] \
                            .drop(split_by, axis=1) \
                            .sort_values(by='accuracy',ascending=False) \
                            .reset_index(drop=True)
        print('{}: {}'.format(split_by,d))
        print(new_df.to_markdown())
        print()

In [16]:
def compare_result(clf_result, data, method_name, conf_matrix=True, class_report=True):
    if method_name not in clf_result: raise Exception('Cannot find method')
    res = clf_result[method_name]
    new_df = data['test']
    actual = [1 if f else 0 for f in list(new_df['is_flood'])]
    new_df['predict'] = res['predict']
    if conf_matrix:
        mat = confusion_matrix(actual, res['predict'])
        plot_confusion_matrix(mat, ['Negative', 'Positive'])
        print(mat)
    if class_report: print(classification_report(actual, res['predict']))
    cr = classification_report(actual, res['predict'])
    return new_df
    

In [17]:
def plot_confusion_matrix(cm,
                          target_names,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True):
    plt.rcParams.update({'font.size': 18})
    accuracy = np.trace(cm) / np.sum(cm).astype('float')
    misclass = 1 - accuracy

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    plt.figure(figsize=(8, 6))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]


    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        if normalize:
            plt.text(j, i, "{:0.4f}".format(cm[i, j]),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")
        else:
            plt.text(j, i, "{:,}".format(cm[i, j]),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")


    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
    plt.show()

In [18]:
def make_vocab(ratio):
    train, test = ratio.get('train',None), ratio.get('test',None)
    if train is None or test is None: raise Exception('Train or Test data not found')
    all_X = list(train['text']) + list(test['text'])
    
    params= {
            'tokenizer': word_tokenize,
            'stop_words': 'english',
        }
    vect = CountVectorizer(**params)
    vect = vect.fit(all_X)
    return list(vect.vocabulary_.keys())

In [19]:
# Logistic Regression
vocab = make_vocab(data_split)
feature_extract = {
    'CountVect': {
        'classifier_type': 'Count Vectorizer',
        'method': CountVectorizer,
        'params': {
            'tokenizer': word_tokenize,
            'stop_words': 'english',
            'vocabulary': vocab
        }
    },
    'CountVect-2gram':{
        'base_method': 'CountVect',
        'params':{
            'ngram_range':(1,2)
        }
    },
    'CountVect-min_df-max_df':{
        'base_method': 'CountVect',
        'params':{
            'min_df': 0.05,
            'max_df': 0.95
        }
    },
    'CountVect-2gram-min_df-max_df':{
        'base_method': 'CountVect',
        'params':{
            'min_df': 0.05,
            'max_df': 0.95,
            'ngram_range':(1,2)
        }
    },
    'TFIDF': {
        'classifier_type': 'TFIDF',
        'method': TfidfVectorizer,
        'params': {
            'tokenizer': word_tokenize,
            'stop_words': 'english',
            'vocabulary': vocab
        }
    },
    'TFIDF-2gram':{
        'base_method': 'TFIDF',
        'params':{
            'ngram_range':(1,2)
        }
    },
    'TFIDF-min_df-max_df':{
        'base_method': 'TFIDF',
        'params':{
            'min_df': 0.05,
            'max_df': 0.95
        }
    },
    'TFIDF-2gram-min_df-max_df':{
        'base_method': 'TFIDF',
        'params':{
            'min_df': 0.05,
            'max_df': 0.95,
            'ngram_range':(1,2)
        }
    }
}

classifiers = {
    'RandomForest': {
        'classifier_type':'Random Forest ',
        'method': RandomForestClassifier,
        'params':{
            'class_weight':'balanced'
        }
    },
    'LinearSVC': {
        'classifier_type': 'Linear SVC',
        'method': LinearSVC,
        'params':{
            'class_weight':'balanced'
        }
    },
    'LogRegL1':{
        'classifier_type': 'Logistic Regression L1',
        'method': LogisticRegression,
        'params':{
            'penalty': 'l1',
            'class_weight':'balanced',
            'solver': 'liblinear',
            'max_iter': 1000
        }
    },
    'LogRegL2':{
        'classifier_type': 'Logistic Regression L2',
        'method': LogisticRegression,
        'params':{
            'penalty': 'l2',
            'class_weight':'balanced',
            'solver': 'liblinear',
            'max_iter': 1000
        }
    }
}

grid_parameters = {
    'feature_extract': list(feature_extract.keys()),
    'classifier': list(classifiers.keys()),
}

grid = ParameterGrid(grid_parameters)

In [20]:
override=global_override or False
debug=global_debug or False
save_results_folder = 'results/'
load_results_folder = 'results/'
if not os.path.isdir(save_results_folder): os.mkdir(save_results_folder)
clf_result, result = run_grid(grid, data_split, feature_extract, classifiers, clf_result, result, 
                              debug=debug, override=override, save_folder=save_results_folder, 
                             load_folder=load_results_folder)


loaded result
OVERRIDE
Feature: CountVect   Clasifier: RandomForest   Key: CountVect-RandomForest
Feature: CountVect-2gram   Clasifier: RandomForest   Key: CountVect-2gram-RandomForest
Feature: CountVect-min_df-max_df   Clasifier: RandomForest   Key: CountVect-min_df-max_df-RandomForest
Feature: CountVect-2gram-min_df-max_df   Clasifier: RandomForest   Key: CountVect-2gram-min_df-max_df-RandomForest
Feature: TFIDF   Clasifier: RandomForest   Key: TFIDF-RandomForest
Feature: TFIDF-2gram   Clasifier: RandomForest   Key: TFIDF-2gram-RandomForest
Feature: TFIDF-min_df-max_df   Clasifier: RandomForest   Key: TFIDF-min_df-max_df-RandomForest
Feature: TFIDF-2gram-min_df-max_df   Clasifier: RandomForest   Key: TFIDF-2gram-min_df-max_df-RandomForest
Feature: CountVect   Clasifier: LinearSVC   Key: CountVect-LinearSVC
Feature: CountVect-2gram   Clasifier: LinearSVC   Key: CountVect-2gram-LinearSVC
Feature: CountVect-min_df-max_df   Clasifier: LinearSVC   Key: CountVect-min_df-max_df-LinearSVC
Fe

In [21]:
parse_result(result, clf_result=clf_result, data=data_split)

classifier: LinearSVC
|    | feature_extract               |   accuracy |       f1 |   presicion |   recall |
|---:|:------------------------------|-----------:|---------:|------------:|---------:|
|  0 | TFIDF                         |   0.913043 | 0.911111 |    0.89781  | 0.924812 |
|  1 | TFIDF-2gram                   |   0.913043 | 0.911111 |    0.89781  | 0.924812 |
|  2 | TFIDF-min_df-max_df           |   0.913043 | 0.911111 |    0.89781  | 0.924812 |
|  3 | TFIDF-2gram-min_df-max_df     |   0.913043 | 0.911111 |    0.89781  | 0.924812 |
|  4 | CountVect                     |   0.905797 | 0.902256 |    0.902256 | 0.902256 |
|  5 | CountVect-2gram               |   0.905797 | 0.900763 |    0.914729 | 0.887218 |
|  6 | CountVect-min_df-max_df       |   0.902174 | 0.897338 |    0.907692 | 0.887218 |
|  7 | CountVect-2gram-min_df-max_df |   0.902174 | 0.897338 |    0.907692 | 0.887218 |

classifier: LogRegL2
|    | feature_extract               |   accuracy |       f1 |   presicion |

### Approach 2: Different test data graphs

In [22]:
vocab = make_vocab(data_split)
feature_extract = {
    'TFIDF': {
        'classifier_type': 'TFIDF',
        'method': TfidfVectorizer,
        'params': {
            'tokenizer': word_tokenize,
            'stop_words': 'english',
            'vocabulary': vocab,
            'ngram_range':(1,2),
            'min_df': 0.05,
            'max_df': 0.95
        }
    },
}

classifiers = {
    'LinearSVC': {
        'classifier_type': 'Linear SVC',
        'method': LinearSVC,
        'params':{
            'class_weight':'balanced'
        }
    },
    'LogRegL1':{
        'classifier_type': 'Logistic Regression L1',
        'method': LogisticRegression,
        'params':{
            'penalty': 'l1',
            'class_weight':'balanced',
            'solver': 'liblinear',
            'max_iter': 1000
        }
    },
    'LogRegL2':{
        'classifier_type': 'Logistic Regression L2',
        'method': LogisticRegression,
        'params':{
            'penalty': 'l2',
            'class_weight':'balanced',
            'solver': 'liblinear',
            'max_iter': 1000
        }
    },
    'RandomForest': {
        'classifier_type':'Random Forest ',
        'method': RandomForestClassifier,
        'params':{
            'class_weight':'balanced'
        }
    },
}

grid_parameters = {
    'feature_extract': list(feature_extract.keys()),
    'classifier': list(classifiers.keys()),
}

grid = ParameterGrid(grid_parameters)

In [23]:
# feature = make_method(feature_extract, 'TFIDF')
# all_X = df_data['text']
# all_Y = df_data['is_flood']
# vect = feature.fit(all_X)
# x, y = vect.transform(list(all_X)), [1 if t else 0 for t in all_Y]
# clf = make_method(classifiers, 'LinearSVC')
# cross_validate(clf, x, y, cv=5,
#               scoring=('accuracy', 'precision', 'recall', 'f1'))

In [24]:
overall_result = []
for train_size in [10,20,50,100,200,500,1000]:
    test_size = 270
    result, clf_result = {}, {}
    debug=True
    override=True
    data_split = make_data_ratio(df_data, test_size=test_size, train_size=train_size,
                               debug=debug, shuffle_seed=global_shuffle_seed, override=override)
    print(len(data_split['train']), len(data_split['test']))
    actual = [i if i==True else 0 for i in data_split['test']['is_flood']]
    clf_result, result = run_grid(grid, data_split, feature_extract, classifiers, clf_result, result, 
                              debug=debug, override=override)
    for key, val in clf_result.items():
#         if key not in overall_result: overall_result[key] = []
        predict = val['predict']
        clf_acc = accuracy_score(actual, predict)
        pre, rec, fsc, sup = precision_recall_fscore_support(actual, predict, average='binary')
        d = { 'key':key, 'train_size':train_size, 'test_size':test_size, 'accuracy':clf_acc, 'precision':pre, 
            'recall':rec, 'f1':fsc,'predict':predict, 'actual': actual
        }
        overall_result.append(d)

Data Loaded
10 270
OVERRIDE
Feature: TFIDF   Clasifier: LinearSVC   Key: TFIDF-LinearSVC
Feature: TFIDF   Clasifier: LogRegL1   Key: TFIDF-LogRegL1
Feature: TFIDF   Clasifier: LogRegL2   Key: TFIDF-LogRegL2
Feature: TFIDF   Clasifier: RandomForest   Key: TFIDF-RandomForest
Data Loaded
20 270
OVERRIDE
Feature: TFIDF   Clasifier: LinearSVC   Key: TFIDF-LinearSVC
Feature: TFIDF   Clasifier: LogRegL1   Key: TFIDF-LogRegL1
Feature: TFIDF   Clasifier: LogRegL2   Key: TFIDF-LogRegL2
Feature: TFIDF   Clasifier: RandomForest   Key: TFIDF-RandomForest
Data Loaded
50 270
OVERRIDE
Feature: TFIDF   Clasifier: LinearSVC   Key: TFIDF-LinearSVC
Feature: TFIDF   Clasifier: LogRegL1   Key: TFIDF-LogRegL1
Feature: TFIDF   Clasifier: LogRegL2   Key: TFIDF-LogRegL2
Feature: TFIDF   Clasifier: RandomForest   Key: TFIDF-RandomForest
Data Loaded
100 270
OVERRIDE
Feature: TFIDF   Clasifier: LinearSVC   Key: TFIDF-LinearSVC
Feature: TFIDF   Clasifier: LogRegL1   Key: TFIDF-LogRegL1
Feature: TFIDF   Clasifier: L

In [25]:
pd.DataFrame.from_dict(overall_result) \
                .drop(['predict','actual'], axis=1) \
                .sort_values(['train_size', 'accuracy'])

Unnamed: 0,key,train_size,test_size,accuracy,precision,recall,f1
3,TFIDF-RandomForest,10,270,0.485185,0.483271,1.0,0.651629
1,TFIDF-LogRegL1,10,270,0.518519,0.0,0.0,0.0
0,TFIDF-LinearSVC,10,270,0.814815,0.844828,0.753846,0.796748
2,TFIDF-LogRegL2,10,270,0.82963,0.833333,0.807692,0.820313
5,TFIDF-LogRegL1,20,270,0.518519,0.0,0.0,0.0
7,TFIDF-RandomForest,20,270,0.662963,0.591549,0.969231,0.734694
4,TFIDF-LinearSVC,20,270,0.781481,0.831776,0.684615,0.751055
6,TFIDF-LogRegL2,20,270,0.796296,0.826087,0.730769,0.77551
9,TFIDF-LogRegL1,50,270,0.662963,0.634483,0.707692,0.669091
11,TFIDF-RandomForest,50,270,0.822222,0.762821,0.915385,0.832168


#### BERT Classifier

In [26]:
# Summarize the results
overall_result_bert = [
    {'key': 'BERT-512', 'train_size': 10, 'test_size': 270, 'accuracy': 0.7481481481481481, 
     'precision': 0.7627118644067796, 'recall': 0.6923076923076923, 'f1': 0.7258064516129032},
    {'key': 'BERT-512', 'train_size': 20, 'test_size': 270, 'accuracy': 0.737037037037037, 
     'precision': 0.7153284671532847, 'recall': 0.7538461538461538, 'f1': 0.7340823970037453},
    {'key': 'BERT-512', 'train_size': 50, 'test_size': 270, 'accuracy': 0.9037037037037037, 
     'precision': 0.8661971830985915, 'recall': 0.9461538461538461, 'f1': 0.9044117647058824},
    {'key': 'BERT-512', 'train_size': 100, 'test_size': 270, 'accuracy': 0.9296296296296296, 
     'precision': 0.9236641221374046, 'recall': 0.9307692307692308, 'f1': 0.9272030651340997},
    {'key': 'BERT-512', 'train_size': 200, 'test_size': 270, 'accuracy': 0.9407407407407408, 
     'precision': 0.9191176470588235, 'recall': 0.9615384615384616, 'f1': 0.9398496240601504},
    {'key': 'BERT-512', 'train_size': 500, 'test_size': 270, 'accuracy': 0.9481481481481482, 
     'precision': 0.9264705882352942, 'recall': 0.9692307692307692, 'f1': 0.9473684210526316},
    {'key': 'BERT-512', 'train_size': 1000, 'test_size': 270, 'accuracy': 0.9444444444444444, 
     'precision': 0.9323308270676691, 'recall': 0.9538461538461539, 'f1': 0.9429657794676807}
]

r2 = []
for i in overall_result_bert:
    i['predict'] = []
    i['actual'] = []
    r2.append(i)
overall_result_bert = r2
pd.DataFrame.from_dict(overall_result_bert) \
                .drop(['predict','actual'], axis=1) \
                .sort_values(['train_size', 'accuracy'])


Unnamed: 0,key,train_size,test_size,accuracy,precision,recall,f1
0,BERT-512,10,270,0.748148,0.762712,0.692308,0.725806
1,BERT-512,20,270,0.737037,0.715328,0.753846,0.734082
2,BERT-512,50,270,0.903704,0.866197,0.946154,0.904412
3,BERT-512,100,270,0.92963,0.923664,0.930769,0.927203
4,BERT-512,200,270,0.940741,0.919118,0.961538,0.93985
5,BERT-512,500,270,0.948148,0.926471,0.969231,0.947368
6,BERT-512,1000,270,0.944444,0.932331,0.953846,0.942966


#### OVERALL CLASSIFIER RESULTS

In [27]:
result_df = pd.DataFrame.from_dict(overall_result + overall_result_bert) \
                .drop(['predict','actual'], axis=1) \
                .sort_values(['train_size', 'accuracy'])
result_df['accuracy'] = result_df['accuracy'].apply(lambda x:round(x,2))
result_df['precision'] = result_df['precision'].apply(lambda x:round(x,2))
result_df['recall'] = result_df['recall'].apply(lambda x:round(x,2))
result_df['f1'] = result_df['f1'].apply(lambda x:round(x,2))
print(result_df.reset_index(drop=True).to_latex(index=False, float_format='%.3f'))

\begin{tabular}{lrrrrrr}
\toprule
               key &  train\_size &  test\_size &  accuracy &  precision &  recall &    f1 \\
\midrule
TFIDF-RandomForest &          10 &        270 &     0.490 &      0.480 &   1.000 & 0.650 \\
    TFIDF-LogRegL1 &          10 &        270 &     0.520 &      0.000 &   0.000 & 0.000 \\
          BERT-512 &          10 &        270 &     0.750 &      0.760 &   0.690 & 0.730 \\
   TFIDF-LinearSVC &          10 &        270 &     0.810 &      0.840 &   0.750 & 0.800 \\
    TFIDF-LogRegL2 &          10 &        270 &     0.830 &      0.830 &   0.810 & 0.820 \\
    TFIDF-LogRegL1 &          20 &        270 &     0.520 &      0.000 &   0.000 & 0.000 \\
TFIDF-RandomForest &          20 &        270 &     0.660 &      0.590 &   0.970 & 0.730 \\
          BERT-512 &          20 &        270 &     0.740 &      0.720 &   0.750 & 0.730 \\
   TFIDF-LinearSVC &          20 &        270 &     0.780 &      0.830 &   0.680 & 0.750 \\
    TFIDF-LogRegL2 &          20 & 

  print(result_df.reset_index(drop=True).to_latex(index=False, float_format='%.3f'))


### Approach 3: cross validation

In [28]:
overall_result = []
clf_result = {}
debug=True
override=True
clf_result = run_grid_cross_validate(grid, df_data, feature_extract, classifiers, clf_result, debug=debug)
for key, val in clf_result.items():
    d = { 'key':key,
         'mean_accuracy': round(np.mean(val['test_accuracy']),2), 
         'mean_precision': round(np.mean(val['test_precision']),2),
         'mean_recall': round(np.mean(val['test_recall']),2), 
         'mean_f1': round(np.mean(val['test_f1']),2),
         'accuracy':val['test_accuracy'], 'precision':val['test_precision'],
         'recall':val['test_recall'], 'f1':val['test_f1']
    }
    overall_result.append(d)

Feature: TFIDF   Clasifier: LinearSVC   Key: TFIDF-LinearSVC
Feature: TFIDF   Clasifier: LogRegL1   Key: TFIDF-LogRegL1
Feature: TFIDF   Clasifier: LogRegL2   Key: TFIDF-LogRegL2
Feature: TFIDF   Clasifier: RandomForest   Key: TFIDF-RandomForest


In [29]:
print(pd.DataFrame.from_dict(overall_result) \
                .drop(['accuracy','precision', 'recall', 'f1'], axis=1) \
                .sort_values(['mean_accuracy']).to_latex(index=False, float_format='%.3f'))
                

\begin{tabular}{lrrrr}
\toprule
               key &  mean\_accuracy &  mean\_precision &  mean\_recall &  mean\_f1 \\
\midrule
    TFIDF-LogRegL1 &          0.900 &           0.890 &        0.890 &    0.890 \\
TFIDF-RandomForest &          0.910 &           0.870 &        0.970 &    0.910 \\
    TFIDF-LogRegL2 &          0.920 &           0.890 &        0.940 &    0.920 \\
   TFIDF-LinearSVC &          0.930 &           0.900 &        0.950 &    0.930 \\
\bottomrule
\end{tabular}



  .sort_values(['mean_accuracy']).to_latex(index=False, float_format='%.3f'))


### Approach 4: 500 train_size

In [30]:
overall_result = []
train_size = 500
test_size = len(df_data)-train_size
result, clf_result = {}, {}
debug=True
override=True
data_split = make_data_ratio(df_data, test_size=test_size, train_size=train_size,
                           debug=debug, shuffle_seed=global_shuffle_seed, override=override)
print(len(data_split['train']), len(data_split['test']))
actual = [i if i==True else 0 for i in data_split['test']['is_flood']]
clf_result, result = run_grid(grid, data_split, feature_extract, classifiers, clf_result, result, 
                              debug=debug, override=override)
for key, val in clf_result.items():
#         if key not in overall_result: overall_result[key] = []
    predict = val['predict']
    clf_acc = accuracy_score(actual, predict)
    pre, rec, fsc, sup = precision_recall_fscore_support(actual, predict, average='binary')
    d = { 'key':key, 'train_size':train_size, 'test_size':test_size, 'accuracy':clf_acc, 'precision':pre, 
        'recall':rec, 'f1':fsc,'predict':predict, 'actual': actual
    }
    overall_result.append(d)


Data Loaded
500 880
OVERRIDE
Feature: TFIDF   Clasifier: LinearSVC   Key: TFIDF-LinearSVC
Feature: TFIDF   Clasifier: LogRegL1   Key: TFIDF-LogRegL1
Feature: TFIDF   Clasifier: LogRegL2   Key: TFIDF-LogRegL2
Feature: TFIDF   Clasifier: RandomForest   Key: TFIDF-RandomForest


In [31]:
print(pd.DataFrame.from_dict(overall_result) \
                .drop(['predict','actual'], axis=1) \
                .sort_values(['train_size', 'accuracy']).to_latex(index=False))


\begin{tabular}{lrrrrrr}
\toprule
               key &  train\_size &  test\_size &  accuracy &  precision &   recall &       f1 \\
\midrule
    TFIDF-LogRegL1 &         500 &        880 &  0.885227 &   0.904523 & 0.851064 & 0.876979 \\
    TFIDF-LogRegL2 &         500 &        880 &  0.904545 &   0.893271 & 0.910165 & 0.901639 \\
TFIDF-RandomForest &         500 &        880 &  0.913636 &   0.883002 & 0.945626 & 0.913242 \\
   TFIDF-LinearSVC &         500 &        880 &  0.914773 &   0.897260 & 0.929078 & 0.912892 \\
\bottomrule
\end{tabular}



  .sort_values(['train_size', 'accuracy']).to_latex(index=False))


Overall, the best approach to predict is: 

## Classify new data

In [32]:
root_folder = '/Users/jiaying/Desktop/6506 DSPP/Data Assignment #2/data'
newspapers = ['bdnews', 'dailySun', 'prothomalo', 'dailyObserver', 'newAge', 
              'dhakaTribune', 'thedailystar', 'theIndependent', 'theNewNation']
newspapers_files = [os.path.join(root_folder, 'all_paper_data', n+'1_data.json') for n in newspapers]


In [33]:
idSet = set(df_data['doc_id'])


In [34]:
def classify_new_data(newspapers_files):
    new_data = pd.DataFrame()
    text_set = set()
    for i,newspapers_path in enumerate(newspapers_files):
        temp_data = json.load(open(newspapers_path))
        temp_data2 = []
        for t in temp_data:
            if t['id'] in idSet: continue
            temp_dict = t['meta']
            for k,v in t['article'].items(): temp_dict[k]=v
            temp_dict['id'] = t['id']
            temp_dict['connect_filename'] = t.get('connect_filename',None)
            temp_dict['newspaper'] = newspapers[i]
            if t['article']['text'] not in text_set: text_set.add(t['article']['text'])
            else: continue
            temp_data2.append(temp_dict)
        temp_df = pd.DataFrame(temp_data2)
        new_data = pd.concat([new_data, temp_df])
    new_data = new_data.fillna("")
    new_data = new_data[new_data['connect_filename']==""]
    print(len(new_data))
    return new_data

new_data = classify_new_data(newspapers_files)
new_data.head()


36123


Unnamed: 0,abstract,news_keywords,description,keywords,datePublished,dateModified,link,query_info,headline,text,id,connect_filename,newspaper,authors
0,,,Massive blackout brings Bangladesh to its knees,[Bangladesh],2014-11-01 00:00:00,,https://bdnews24.com/bangladesh/2014/11/01/mas...,"{'query': 'bangladesh ""floods""', 'paper': 'bdn...",Massive blackout brings Bangladesh to its knees,Date Published:2014-11-01 00:00:00 \nSeve...,5da996ad-b1a9-490e-9c89-d8a34ecc5741,,bdnews,
1,,,Nawaz Sharif praises Bangladesh,[Bangladesh],2014-11-26 00:00:00,,https://bdnews24.com/bangladesh/2014/11/26/naw...,"{'query': 'bangladesh ""floods""', 'paper': 'bdn...",Nawaz Sharif praises Bangladesh,Date Published:2014-11-26 00:00:00 \nHe e...,d8c9576a-e4b4-4cfe-af54-dd208f699d75,,bdnews,
2,,,Govt files Tk 1 billion compensation suit for ...,[Bangladesh],2014-12-10 00:00:00,,https://bdnews24.com/bangladesh/2014/12/10/gov...,"{'query': 'bangladesh ""floods""', 'paper': 'bdn...",Govt files Tk 1 billion compensation suit for ...,Date Published:2014-12-10 00:00:00 \nThe ...,6b0a37dd-636e-4b84-93c3-e61d90e8524f,,bdnews,
3,,,"NIA looking for 11 suspects, RAB hands over li...",[Bangladesh],2014-11-18 00:00:00,,https://bdnews24.com/bangladesh/2014/11/18/nia...,"{'query': 'bangladesh ""floods""', 'paper': 'bdn...","NIA looking for 11 suspects, RAB hands over li...","Date Published:2014-11-18 00:00:00 \nRAB,...",f38f604f-bc15-4b17-8a3d-61064a54bb1c,,bdnews,
4,,,"1,019 Bangladesh nationals in UAE prison",[Bangladesh],2014-10-28 00:00:00,,https://bdnews24.com/bangladesh/2014/10/28/101...,"{'query': 'bangladesh ""floods""', 'paper': 'bdn...","1,019 Bangladesh nationals in UAE prison",Date Published:2014-10-28 00:00:00 \nTher...,ca17539b-2939-404e-bfac-21ff9f4ad1f0,,bdnews,


In [35]:
def loop_data_train_test(classifier, feature, data_df=None, predictions_folder = 'predictions',
                         prev_true_data_df=None, prev_false_data_df=None, save=True):
#     if data_df is None and prev_false_data_df is None: raise Exception('No df_data or prev_dalse_data_df')
#     if prev_false_data_df is not None: data_df = prev_false_data_df
    to_keep_cols = ['datePublished', 'text', 'doc_id', 'connect_filename', 'newspaper', 'is_flood']
    data_df['new_text'] = data_df['text'].apply(preprocess)
    
    test_features = feature.transform(list(data_df['new_text']))
    test_pred = classifier.predict(test_features)
    
    data_df['is_flood'] = [bool(i) for i in test_pred]
    data_df['doc_id'] = data_df['id']
    data_df = data_df[to_keep_cols]
    true_new_data = data_df.loc[data_df['is_flood']]
    false_new_data = data_df.loc[~data_df['is_flood']]
    print('Total New Data: {}\tTrue new Data: {}'.format(len(data_df), len(true_new_data)))
    
    if prev_true_data_df is not None: df_true_new_data = pd.concat([prev_true_data_df, true_new_data])
    else: df_true_new_data = true_new_data
    js = df_true_new_data.to_json(orient='records')
    if save: json.dump(json.loads(js), open(os.path.join(predictions_folder, 'predicted_isflood.json'), 'w'), indent=2)
    
    if prev_false_data_df is not None: df_false_new_data = pd.concat([prev_false_data_df, false_new_data])
    else: df_false_new_data = false_new_data
    js = df_false_new_data.to_json(orient='records')
    if save: json.dump(json.loads(js), open(os.path.join(predictions_folder, 'predicted_not_isflood.json'), 'w'), indent=2)
    return df_true_new_data, false_new_data


In [36]:
key = 'TFIDF-LinearSVC'
feature = clf_result[key]['feature']
classifier = clf_result[key]['clf']
prev_true_data_df, prev_false_data_df = None, None
# prev_true_data_df, prev_false_data_df = get_new_predicted_data()
save = False
df_true_new_data, false_new_data = loop_data_train_test(classifier, feature, new_data, 'predictions',
                                                        prev_true_data_df, prev_false_data_df, save=save)


Total New Data: 36123	True new Data: 2140


In [37]:
df_true_new_data

Unnamed: 0,datePublished,text,doc_id,connect_filename,newspaper,is_flood
327,2016-09-03 00:00:00,Date Published:2016-09-03 00:00:00 \nIn r...,a333e636-7afc-46a9-a6e7-a49da3ca3d67,,bdnews,True
365,2016-08-29 00:00:00,Date Published:2016-08-29 00:00:00 \nInfo...,9b0133da-fc4d-4735-b44b-ddacb19cf3e6,,bdnews,True
395,2017-07-23 00:00:00,Date Published:2017-07-23 00:00:00 \nThe ...,d8d651c7-7b04-4599-972a-4fc452ba4a9e,,bdnews,True
438,2017-06-18 00:00:00,Date Published:2017-06-18 00:00:00 \nLaxm...,422230e1-c97d-499e-bf84-2f8014ead923,,bdnews,True
454,2017-09-23 00:00:00,Date Published:2017-09-23 00:00:00 \nRoad...,74b887c9-efb2-4d74-a6ef-5eaf24cf679e,,bdnews,True
...,...,...,...,...,...,...
3697,2016-08-02 00:00:00,Date Published:2016-08-02 00:00:00 \nThe ...,61d48782-dcc6-4190-8f98-73b19c771056,,theIndependent,True
3702,2016-08-24 00:00:00,Date Published:2016-08-24 00:00:00 \nIndi...,f533b0df-e589-4e29-8872-74cfd250355f,,theIndependent,True
3705,2016-07-27 00:00:00,Date Published:2016-07-27 00:00:00 \nThe ...,e7f78328-d806-442e-805b-3f92e1f2daab,,theIndependent,True
3763,2019-04-07 10:06:58,Date Published:2019-04-07 10:06:58 \nEros...,b0c65117-21ba-44c4-a06e-edbd3a43900c,,theIndependent,True
