In [3]:
import numpy as np
import pandas as pd
import collections
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.pipeline import FeatureUnion, _fit_transform_one, _transform_one
from imblearn.pipeline import Pipeline as Imb_Pipeline
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_validate
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from imblearn.over_sampling import SMOTE
import warnings
import time
import FilterDFTool
from nltk.stem import WordNetLemmatizer
from string import punctuation
from nltk.corpus import stopwords
import spacy
from nltk.stem.snowball import SnowballStemmer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from DenseTfidfVectorizer import DenseTfidfVectorizer
from joblib import Parallel, delayed
from scipy import sparse
import re
from sklearn.svm import LinearSVC
from sklearn.exceptions import ConvergenceWarning
from functools import partial, update_wrapper
from sklearn.metrics import precision_recall_fscore_support, classification_report, accuracy_score, make_scorer, confusion_matrix
from sklearn.naive_bayes import ComplementNB
from sklearn.feature_selection import SelectKBest, chi2

warnings.filterwarnings('ignore')

# 1. Read data

In [4]:
# Positional layout of every setup row (the experiment loop reads the items
# in exactly this order):
#   1. use_tfidf
#   2. use_conversational
#   3. use_liwc
#   4. use_politeness
#   5. use_pos_tfidf
#   6. classifier    ('svm' or 'cnb')
#   7. num_select_k  (-1 in every row here)
_feature_group_total = 5

experiment_setups = []
for _classifier_name in ('svm', 'cnb'):
    # One run per single feature group, in the order tfidf, conversational,
    # liwc, politeness, pos_tfidf (exactly one flag switched on).
    for _active_group in range(_feature_group_total):
        _flags = [_position == _active_group for _position in range(_feature_group_total)]
        experiment_setups.append(_flags + [_classifier_name, -1])
    # Last run for this classifier: all five feature groups together.
    experiment_setups.append([True] * _feature_group_total + [_classifier_name, -1])

In [5]:
# Load the preprocessed table and swap the textual 'True' / 'False' cells for
# real booleans in one chained expression.
booleanDictionary = {'True': True, 'False': False}
df = pd.read_csv('Preprocessed.csv').replace(booleanDictionary)

In [6]:
# Name of the target column that the classifiers predict.
# NOTE(review): the saved cell outputs in this notebook show an `is_irrelevant`
# column instead, so they may come from an earlier run — confirm.
label_col = 'is_non_argumentative'

# Column holding the raw comment text (fed to the word-level TF-IDF branch).
text_content_name = ['Text Content']
# Conversational feature columns.
conver_feature_name = ['author_association', 'is_poster', 'quote_len',
       'percent_len_thread', 'percent_len_comment', 'percent_pos_thread',
       'percent_pos_comment', 'is_first_comment', 'is_last_comment',
       'percent_start_time_thread', 'percent_end_time_thread',
       'percent_previous_time_thread', 'percent_next_time_thread',
       'contain_code']
# LIWC feature columns.
liwc_feature_name = ['WC', 'Analytic', 'Clout', 'Authentic', 'Tone', 'WPS', 'Sixltr',
       'Dic', 'function', 'pronoun', 'ppron', 'i', 'we', 'you', 'shehe',
       'they', 'ipron', 'article', 'prep', 'auxverb', 'adverb', 'conj',
       'negate', 'verb', 'adj', 'compare', 'interrog', 'number', 'quant',
       'affect', 'posemo', 'negemo', 'anx', 'anger', 'sad', 'social',
       'family', 'friend', 'female', 'male', 'cogproc', 'insight', 'cause',
       'discrep', 'tentat', 'certain', 'differ', 'percept', 'see', 'hear',
       'feel', 'bio', 'body', 'health', 'sexual', 'ingest', 'drives',
       'affiliation', 'achieve', 'power', 'reward', 'risk', 'focuspast',
       'focuspresent', 'focusfuture', 'relativ', 'motion', 'space', 'time',
       'work', 'leisure', 'home', 'money', 'relig', 'death', 'informal',
       'swear', 'netspeak', 'assent', 'nonflu', 'filler', 'AllPunc',
       'Period', 'Comma', 'Colon', 'SemiC', 'QMark', 'Exclam', 'Dash',
       'Quote', 'Apostro', 'Parenth', 'OtherP']
# Single politeness score column.
politeness_feature_name = ['polite_score']
# Column holding the part-of-speech tag string (fed to the POS TF-IDF branch).
pos_str_name = ['pos_str']

In [7]:
# Every feature column an experiment may draw from, concatenated group by group.
full_feature_name = (
    text_content_name
    + conver_feature_name
    + liwc_feature_name
    + politeness_feature_name
    + pos_str_name
)

# Columns handed to the column selector: the label first, then all features.
_selected_columns = np.append(label_col, full_feature_name)

# Two-stage preparation: row filter (given two empty label lists), then
# column selection.
irrelevant_source_data_pipeline = Pipeline(steps=[
    ('filtered_source', FilterDFTool.GetDataRowByLabels([], [])),
    ('data_cols', FilterDFTool.GetDataColByNames(_selected_columns)),
])

In [8]:
# Run the row filter and column selection over the loaded frame; the result is
# the working table for the rest of the notebook.
source_data = irrelevant_source_data_pipeline.fit_transform(X=df)

In [9]:
# Inspect the distinct author_association categories before encoding them.
source_data['author_association'].unique()

array(['NOT FOUND', 'CONTRIBUTOR', 'NONE', 'MEMBER'], dtype=object)

In [10]:
pd.options.mode.chained_assignment = None  # default='warn'

# Overwrite each author_association category with its integer code, one
# category at a time and in this order.
_association_codes = (
    ('NOT FOUND', 0),
    ('NONE', 1),
    ('CONTRIBUTOR', 2),
    ('MEMBER', 3),
)
for _category, _code in _association_codes:
    _matches = source_data['author_association'] == _category
    source_data.loc[_matches, 'author_association'] = _code

In [11]:
# Show the working table after author_association has been integer-encoded.
display(source_data)

Unnamed: 0,is_irrelevant,Text Content,author_association,is_poster,quote_len,percent_len_thread,percent_len_comment,percent_pos_thread,percent_pos_comment,is_first_comment,...,SemiC,QMark,Exclam,Dash,Quote,Apostro,Parenth,OtherP,polite_score,pos_str
0,False,Is it possible to add feature in order to forc...,0,True,152,0.750000,1.000000,0.014035,0.800000,False,...,0.00,3.45,0.00,0.00,0.00,0.00,0.00,0.00,0.634124,VBZ PRP JJ TO VB NN IN NN TO VB NN TO VB NN RB...
1,False,This can be very useful when there is a lot of...,0,True,95,0.500000,0.666667,0.017544,1.000000,False,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.297346,DT MD VB RB JJ WRB EX VBZ DT NN IN NNS CC PRP ...
2,True,"I'm using VS Code VERSION_NUM Insider, but I ...",2,False,153,0.650000,1.000000,0.021053,1.000000,True,...,3.33,0.00,0.00,6.67,0.00,3.33,0.00,6.67,0.619835,"PRP VBP VBG NNP NNP NNP SYM NNP NNP , CC PRP V..."
3,False,otherwise there is no reason for double click,1,True,45,0.200000,0.888889,0.028070,1.000000,False,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.426355,RB EX VBZ DT NN IN JJ NN
4,False,SCREEN_NAME The 'preview' function does not k...,2,False,190,0.825000,1.000000,0.031579,0.500000,False,...,0.00,0.00,0.00,0.00,0.00,5.88,5.88,2.94,0.550242,NN SYM NNP DT `` NN '' NN VBZ RB VB DT NN JJ W...
5,False,"Not sure what you are going for here, why woul...",2,False,127,0.625000,0.757576,0.035088,1.000000,False,...,0.00,3.85,0.00,3.85,0.00,0.00,7.69,0.00,0.160382,"RB JJ WP PRP VBP VBG IN RB , WRB MD PRP RB -LR..."
6,False,I also am trying to figure out how to do this,1,False,45,0.275000,0.392857,0.038596,0.066667,False,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.368622,PRP RB VBP VBG TO VB RP WRB TO VB DT
7,False,because I want the explorer panel to have focu...,1,False,131,0.700000,1.000000,0.045614,0.200000,False,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.647934,"IN PRP VBP DT NN NN TO VB NN , PRP MD RB VB DT..."
8,False,I was hoping I could type the name of a file a...,1,False,113,0.600000,0.857143,0.059649,0.466667,False,...,0.00,0.00,0.00,0.00,0.00,4.17,0.00,0.00,0.548465,PRP VBD VBG PRP MD NN DT NN IN DT NN CC PRP VB...
9,False,even with KB focus in the explorer window it d...,1,False,61,0.300000,0.428571,0.066667,0.600000,False,...,0.00,0.00,0.00,0.00,0.00,8.33,0.00,0.00,0.450140,RB IN NN NN IN DT NN NN PRP VBZ RB VB DT .


# 2. Train

In [12]:
from spacy.lang.en import English
# NOTE(review): `eng` is not referenced anywhere else in the visible notebook,
# and the 'en' shortcut name presumably only resolves on spaCy 2.x installs —
# confirm whether this load is still required.
eng = spacy.load('en')

# English language object used by word_tokenize and pos_tokenize to split a
# line into tokens.
parser = English()
# Lemmatizer applied by word_tokenize to each lower-cased token it keeps.
lemmatizer = WordNetLemmatizer()

# Tokens discarded by word_tokenize: every single punctuation character plus a
# few contraction fragments and symbols. The NLTK English stop-word list is
# commented out, so it is not part of this list.
stop_words = list(punctuation) + ["'s","'m","n't","'re","-","'ll",'...'] #+ stopwords.words('english')

In [13]:
def word_tokenize(line):
    """Tokenize `line` for the word-level TF-IDF vectorizer.

    The module-level `parser` splits the text. Whitespace-only tokens and
    tokens whose exact text appears in `stop_words` are dropped; each
    remaining token is lower-cased and passed through `lemmatizer`.

    Returns a list of strings in their original order.
    """
    return [
        lemmatizer.lemmatize(token.lower_)
        for token in parser(line)
        if not token.orth_.isspace() and str(token) not in stop_words
    ]

In [14]:
def pos_tokenize(line):
    """Tokenize a part-of-speech tag string for the POS TF-IDF vectorizer.

    The module-level `parser` splits `line`. Whitespace-only tokens and tokens
    that are exactly one punctuation character are dropped; every other token
    is kept verbatim (no lower-casing, no lemmatization).

    Returns a list of strings in their original order.
    """
    # Build the punctuation lookup once per call; the previous version rebuilt
    # list(punctuation) for every token. A set of the single characters keeps
    # the exact-match semantics of the original list membership test.
    punctuation_marks = set(punctuation)

    kept_tokens = []
    for token in parser(line):
        if token.orth_.isspace():
            continue
        token_text = str(token)
        if token_text not in punctuation_marks:
            kept_tokens.append(token_text)
    return kept_tokens

In [15]:
# Conversational features only. `conver_feature_name` is the same column list
# that was previously spelled out again here.
df_con = source_data[conver_feature_name]
# Rows that hold at least one negative conversational value. `axis` is passed
# by keyword because newer pandas no longer accepts it positionally.
df_con[(df_con < 0).any(axis=1)]

Unnamed: 0,author_association,is_poster,quote_len,percent_len_thread,percent_len_comment,percent_pos_thread,percent_pos_comment,is_first_comment,is_last_comment,percent_start_time_thread,percent_end_time_thread,percent_previous_time_thread,percent_next_time_thread,contain_code
2528,1,False,52,0.105263,1.0,1.0,1.0,False,True,1.0,0.0,0.000129,-0.003754,False


In [16]:
# CAUTION: manual data correction. Negative percent_next_time_thread values
# are reset to 0. The check in the previous cell reports row 2528 as the only
# such row; selecting by value instead of by that hard-coded row label keeps
# the correction valid if the input rows change.
_negative_next_time = source_data['percent_next_time_thread'] < 0
source_data.loc[_negative_next_time, 'percent_next_time_thread'] = 0

In [17]:
# Model inputs: every feature column (text, conversational, LIWC, politeness, POS string).
x = source_data[full_feature_name]
# Target: the column named by label_col.
y = source_data[label_col]

In [18]:
# Display the working table again after the manual correction above.
source_data

Unnamed: 0,is_irrelevant,Text Content,author_association,is_poster,quote_len,percent_len_thread,percent_len_comment,percent_pos_thread,percent_pos_comment,is_first_comment,...,SemiC,QMark,Exclam,Dash,Quote,Apostro,Parenth,OtherP,polite_score,pos_str
0,False,Is it possible to add feature in order to forc...,0,True,152,0.750000,1.000000,0.014035,0.800000,False,...,0.00,3.45,0.00,0.00,0.00,0.00,0.00,0.00,0.634124,VBZ PRP JJ TO VB NN IN NN TO VB NN TO VB NN RB...
1,False,This can be very useful when there is a lot of...,0,True,95,0.500000,0.666667,0.017544,1.000000,False,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.297346,DT MD VB RB JJ WRB EX VBZ DT NN IN NNS CC PRP ...
2,True,"I'm using VS Code VERSION_NUM Insider, but I ...",2,False,153,0.650000,1.000000,0.021053,1.000000,True,...,3.33,0.00,0.00,6.67,0.00,3.33,0.00,6.67,0.619835,"PRP VBP VBG NNP NNP NNP SYM NNP NNP , CC PRP V..."
3,False,otherwise there is no reason for double click,1,True,45,0.200000,0.888889,0.028070,1.000000,False,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.426355,RB EX VBZ DT NN IN JJ NN
4,False,SCREEN_NAME The 'preview' function does not k...,2,False,190,0.825000,1.000000,0.031579,0.500000,False,...,0.00,0.00,0.00,0.00,0.00,5.88,5.88,2.94,0.550242,NN SYM NNP DT `` NN '' NN VBZ RB VB DT NN JJ W...
5,False,"Not sure what you are going for here, why woul...",2,False,127,0.625000,0.757576,0.035088,1.000000,False,...,0.00,3.85,0.00,3.85,0.00,0.00,7.69,0.00,0.160382,"RB JJ WP PRP VBP VBG IN RB , WRB MD PRP RB -LR..."
6,False,I also am trying to figure out how to do this,1,False,45,0.275000,0.392857,0.038596,0.066667,False,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.368622,PRP RB VBP VBG TO VB RP WRB TO VB DT
7,False,because I want the explorer panel to have focu...,1,False,131,0.700000,1.000000,0.045614,0.200000,False,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.647934,"IN PRP VBP DT NN NN TO VB NN , PRP MD RB VB DT..."
8,False,I was hoping I could type the name of a file a...,1,False,113,0.600000,0.857143,0.059649,0.466667,False,...,0.00,0.00,0.00,0.00,0.00,4.17,0.00,0.00,0.548465,PRP VBD VBG PRP MD NN DT NN IN DT NN CC PRP VB...
9,False,even with KB focus in the explorer window it d...,1,False,61,0.300000,0.428571,0.066667,0.600000,False,...,0.00,0.00,0.00,0.00,0.00,8.33,0.00,0.00,0.450140,RB IN NN NN IN DT NN NN PRP VBZ RB VB DT .


In [19]:
# followed the tutorial at: https://zablo.net/blog/post/pandas-dataframe-in-scikit-learn-feature-union/index.html
class PandasFeatureUnion(FeatureUnion):
    """FeatureUnion variant that joins its branch outputs with pandas.

    When at least one branch output is a scipy sparse matrix, the outputs are
    stacked horizontally into a CSR matrix. Otherwise they are concatenated
    column-wise with `pd.concat`.

    NOTE(review): this calls the private scikit-learn helpers
    `_fit_transform_one` / `_transform_one` with keyword arguments and uses
    `_validate_transformers`, `_iter` and `_update_transformer_list`; these
    signatures presumably vary between scikit-learn releases — confirm against
    the installed version.
    """

    def fit_transform(self, X, y=None, **fit_params):
        """Fit every branch on (X, y), transform X, and join the outputs.

        Returns an array with X.shape[0] rows and zero columns when no branch
        produces a result; a CSR matrix when any output is sparse; otherwise
        the result of `merge_dataframes_by_column`.
        """
        self._validate_transformers()
        # One fit+transform job per (name, transformer, weight) triple from _iter().
        result = Parallel(n_jobs=self.n_jobs)(
            delayed(_fit_transform_one)(
                transformer=trans,
                X=X,
                y=y,
                weight=weight,
                **fit_params)
            for name, trans, weight in self._iter())
        if not result:
            return np.zeros((X.shape[0], 0))
        # Each job yields (transformed output, fitted transformer).
        Xs, transformers = zip(*result)
        # Store the fitted transformers back on the union.
        self._update_transformer_list(transformers)
        if any(sparse.issparse(f) for f in Xs):
            Xs = sparse.hstack(Xs).tocsr()
        else:
            Xs = self.merge_dataframes_by_column(Xs)
        return Xs

    def merge_dataframes_by_column(self, Xs):
        """Concatenate the branch outputs side by side with `pd.concat`.

        NOTE(review): assumes every item in Xs is a pandas object with a
        compatible index — verify against the transformers used.
        """
        return pd.concat(Xs, axis="columns", copy=False)

    def transform(self, X):
        """Transform X with every branch and join the outputs.

        Uses the same joining rules as `fit_transform`, without fitting.
        """
        Xs = Parallel(n_jobs=self.n_jobs)(
            delayed(_transform_one)(
                transformer=trans,
                X=X,
                y=None,
                weight=weight)
            for name, trans, weight in self._iter())
        if not Xs:
            return np.zeros((X.shape[0], 0))
        if any(sparse.issparse(f) for f in Xs):
            Xs = sparse.hstack(Xs).tocsr()
        else:
            Xs = self.merge_dataframes_by_column(Xs)
        return Xs

In [20]:
def BuildPipeline(use_tfidf, use_conversational, use_liwc, use_politeness, use_pos_tfidf, classifier, num_select_k):
    """Assemble one experiment's pipeline, its grid-search space and its name tag.

    Parameters
    ----------
    use_tfidf, use_pos_tfidf : truthy/falsy
        Add a TF-IDF branch over the 'Text Content' / 'pos_str' column.
    use_conversational, use_liwc, use_politeness : truthy/falsy
        Add the matching group of precomputed columns to the selected columns.
    classifier : str
        'svm' selects LinearSVC(class_weight='balanced'); any other value
        selects ComplementNB.
    num_select_k : int
        -1 leaves feature selection out; any other value inserts
        SelectKBest(chi2, k=num_select_k) in front of the classifier.

    Returns
    -------
    tuple
        (pipeline, param_grid, feature_flag_str): the imblearn pipeline, the
        parameter grid for GridSearchCV, and a tag built from the classifier
        name plus one suffix per enabled option.
    """
    with_selection = num_select_k != -1

    # --- name tag and precomputed columns ---------------------------------
    tag = classifier
    chosen_columns = []
    if with_selection:
        tag = tag + '_k' + str(num_select_k)
    if use_conversational:
        tag = tag + '_conver'
        chosen_columns.extend(conver_feature_name)
    if use_liwc:
        tag = tag + '_liwc'
        chosen_columns.extend(liwc_feature_name)
    if use_politeness:
        tag = tag + '_polite'
        chosen_columns.extend(politeness_feature_name)

    column_selector = FilterDFTool.GetDataColByNames(chosen_columns)

    # --- TF-IDF branches (both are always constructed) --------------------
    def make_text_branch(column_name, tokenizer):
        # Pick one text column, then vectorize it with the given tokenizer.
        return Pipeline([
            ('extract_field', FunctionTransformer(lambda x: x[column_name], validate=False)),
            ('vect', DenseTfidfVectorizer(tokenizer=tokenizer, analyzer='word'))
        ])

    word_branch = make_text_branch('Text Content', word_tokenize)
    pos_branch = make_text_branch('pos_str', pos_tokenize)

    # --- feature union and its search space -------------------------------
    ngram_choices = ((1,1),(1,2))
    search_space = {}
    # The POS key is inserted before the word key so the dict keeps the same
    # key order as before when both branches are enabled.
    if use_pos_tfidf:
        search_space['feature__pos_tfidf_feature__vect__ngram_range'] = ngram_choices
    if use_tfidf:
        search_space['feature__tfidf_feature__vect__ngram_range'] = ngram_choices

    union_members = [('ready_feature', column_selector)]
    if use_tfidf:
        tag = tag + '_tfidf'
        union_members.append(('tfidf_feature', word_branch))
    if use_pos_tfidf:
        tag = tag + '_postfidf'
        union_members.append(('pos_tfidf_feature', pos_branch))
    combined_features = PandasFeatureUnion(union_members)

    # --- classifier and final pipeline ------------------------------------
    if classifier == 'svm':
        estimator = LinearSVC(class_weight='balanced')
        tuned_parameter = 'clf__C'
    else: # complement NB
        estimator = ComplementNB()
        tuned_parameter = 'clf__alpha'

    stages = [('feature', combined_features)]
    if with_selection:
        stages.append(('select_k', SelectKBest(chi2, k=num_select_k)))
    stages.append(('clf', estimator))

    search_space[tuned_parameter] = (0.01, 0.1, 1, 10)

    return Imb_Pipeline(stages), search_space, tag

In [21]:
### Define and create the scoring functions

def score_func(y_true, y_pred, score_index, i):
    """Return one per-label metric from precision_recall_fscore_support.

    score_index picks the entry of the returned tuple (used in this notebook
    as 0 precision, 1 recall, 2 fscore, 3 support); i picks the label
    position within that entry.
    """
    per_label_metrics = precision_recall_fscore_support(y_true, y_pred)
    chosen_metric = per_label_metrics[score_index]
    return chosen_metric[i]

def avg_score(y_true, y_pred, score_index):
    """Return one support-weighted average metric.

    score_index picks the entry of the tuple returned by
    precision_recall_fscore_support(..., average='weighted').
    """
    weighted_metrics = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    return weighted_metrics[score_index]

def sum_support(y_true, y_pred):
    """Return the number of evaluated samples.

    y_pred is not used; it is accepted so the function has the
    (y_true, y_pred) signature of a metric.
    """
    sample_count = len(y_true)
    return sample_count

### One partial of score_func per metric, keyed by metric name
score_funcs = {
    'precision': partial(score_func, score_index=0),
    'recall': partial(score_func, score_index=1),
    'fscore': partial(score_func, score_index=2),
    'support': partial(score_func, score_index=3),
}

### Named partials that carry score_func's metadata (update_wrapper returns its first argument)
prec_score = update_wrapper(partial(score_func, score_index=0), score_func)
rec_score = update_wrapper(partial(score_func, score_index=1), score_func)
f_score = update_wrapper(partial(score_func, score_index=2), score_func)
support_score = update_wrapper(partial(score_func, score_index=3), score_func)

### Per-label scorers: label<id>_<metric> for label ids 0 and 1
_per_label_metrics = (
    ('precision', prec_score),
    ('recall', rec_score),
    ('fscore', f_score),
    ('support', support_score),
)
scorer = {}
for label_id in range(0,2):
    for _metric_name, _metric_func in _per_label_metrics:
        scorer['label'+str(label_id)+'_'+_metric_name] = make_scorer(_metric_func, i=label_id)

### Aggregate scorers: weighted averages plus the total sample count
for _score_index, _metric_name in enumerate(('precision', 'recall', 'fscore')):
    scorer['avg_'+_metric_name] = make_scorer(avg_score,score_index=_score_index)
scorer['total_support'] = make_scorer(sum_support)

In [22]:
# To be used within GridSearch
inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# To be used in outer CV
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

In [23]:
# Row names for the per-label report rows; position 0 / 1 matches the label ids
# used in the scorer keys ('label0_*', 'label1_*').
# NOTE(review): the saved fold outputs further down show 'relevant' /
# 'irrelevant' rows, so they presumably come from an earlier run — confirm.
labels = ['argumentative', 'non-argumentative']

In [24]:
def _fold_report(cv_results, split, fold):
    """Build the metric table of one outer fold.

    cv_results is the dict returned by cross_validate, split is 'train' or
    'test', and fold is the position of the outer fold. The table has one row
    per entry of `labels` plus an 'Avg/Total' row.
    """
    report = pd.DataFrame(columns=['Precision', 'Recall', 'F1-score', 'Support'])
    for label_id, label_name in enumerate(labels):
        key_prefix = split + '_label' + str(label_id) + '_'
        report.loc[label_name, :] = [
            cv_results[key_prefix + metric][fold]
            for metric in ('precision', 'recall', 'fscore', 'support')
        ]
    report.loc['Avg/Total', :] = [
        cv_results[split + '_avg_precision'][fold],
        cv_results[split + '_avg_recall'][fold],
        cv_results[split + '_avg_fscore'][fold],
        cv_results[split + '_total_support'][fold],
    ]
    return report


# Run every configured experiment: nested cross-validation (grid search inside
# each outer fold), a text log of the search, and an Excel sheet plus printed
# tables with the per-fold train/test metrics.
for experiment in experiment_setups:
    (use_tfidf, use_conver, use_liwc, use_polite,
     use_pos_tfidf, classifier, num_select_k) = experiment
    svm_pipeline, param_grid, feature_flag_str = BuildPipeline(use_tfidf, use_conver, use_liwc, use_polite, use_pos_tfidf, classifier, num_select_k)

    # write to grid_search file
    # `with` closes the log even when the search raises part-way through.
    with open("results_argumentative/5Fold_grid_search_" + feature_flag_str + ".txt","w+") as f:
        f.write("Performing grid search...\n")
        f.write("pipeline:" + ' '.join([name for name, _ in svm_pipeline.steps]) + "\n")
        f.write("parameters:")
        f.write(str(param_grid) + '\n')
        t0 = time.time()

        clf = GridSearchCV(svm_pipeline, param_grid, cv=inner_cv, scoring='f1_weighted')
        clf_results = cross_validate(clf, X=x, y=y, cv=outer_cv, scoring=scorer, return_train_score=True)

        f.write("done in %0.3fs\n" % (time.time() - t0))

    # Unrounded per-fold tables of the current experiment.
    result_dict = {}

    # Next free row of the 'result' sheet.
    datalength = 0

    # The context manager saves and closes the workbook on exit, replacing the
    # explicit writer.save() call that newer pandas no longer provides.
    with pd.ExcelWriter('results_argumentative/5Fold_result_'+ feature_flag_str +'.xlsx') as writer:
        # Take the fold count from the splitter instead of repeating the literal 5.
        for i in range(outer_cv.get_n_splits()):
            train_report = _fold_report(clf_results, 'train', i)
            test_report = _fold_report(clf_results, 'test', i)

            # Fold header, then the train table, then the test table, each
            # followed by a spacer of two rows.
            fold_index = pd.DataFrame(data=[{'Fold':'Fold '+str(i)}])
            fold_index.to_excel(writer, sheet_name='result', startrow=datalength, index=False)
            datalength+=(len(fold_index)+2)
            train_report.to_excel(writer, sheet_name='result', startrow=datalength)
            datalength+=(len(train_report)+2)
            test_report.to_excel(writer, sheet_name='result', startrow=datalength)
            datalength+=(len(test_report)+2)

            result_dict['result_train_'+str(i)] = train_report
            result_dict['result_test_'+str(i)] = test_report

            # Rounded copies are for the console only; the sheet and
            # result_dict keep full precision.
            train_report = train_report.astype(float).round(2)
            test_report = test_report.astype(float).round(2)

            print("\n------------------------- FOLD "+str(i)+": -------------------------")
            print("\nTraining Results:")
            print(train_report)
            print("\nTest Results:")
            print(test_report)


------------------------- FOLD 0: -------------------------

Training Results:
            Precision  Recall  F1-score  Support
relevant         1.00    0.99      0.99   2427.0
irrelevant       0.99    1.00      0.99   1671.0
Avg/Total        0.99    0.99      0.99   4098.0

Test Results:
            Precision  Recall  F1-score  Support
relevant         0.80    0.83      0.82    607.0
irrelevant       0.74    0.70      0.72    418.0
Avg/Total        0.78    0.78      0.78   1025.0

------------------------- FOLD 1: -------------------------

Training Results:
            Precision  Recall  F1-score  Support
relevant         1.00    0.99      1.00   2427.0
irrelevant       0.99    1.00      0.99   1671.0
Avg/Total        0.99    0.99      0.99   4098.0

Test Results:
            Precision  Recall  F1-score  Support
relevant         0.80    0.85      0.83    607.0
irrelevant       0.76    0.69      0.73    418.0
Avg/Total        0.79    0.79      0.79   1025.0

-------------------------


------------------------- FOLD 0: -------------------------

Training Results:
            Precision  Recall  F1-score  Support
relevant         0.99    0.99      0.99   2427.0
irrelevant       0.99    0.98      0.98   1671.0
Avg/Total        0.99    0.99      0.99   4098.0

Test Results:
            Precision  Recall  F1-score  Support
relevant         0.79    0.87      0.83    607.0
irrelevant       0.78    0.67      0.72    418.0
Avg/Total        0.79    0.79      0.78   1025.0

------------------------- FOLD 1: -------------------------

Training Results:
            Precision  Recall  F1-score  Support
relevant         0.99    0.99      0.99   2427.0
irrelevant       0.99    0.98      0.98   1671.0
Avg/Total        0.99    0.99      0.99   4098.0

Test Results:
            Precision  Recall  F1-score  Support
relevant         0.78    0.84      0.81    607.0
irrelevant       0.74    0.65      0.69    418.0
Avg/Total        0.76    0.76      0.76   1025.0

-------------------------