In [4]:
import gensim
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from hypopt import GridSearch
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import re
import os
from tqdm.auto import tqdm

from data_loader import DataLoader

init_notebook_mode(connected = True)

In [5]:
data = DataLoader()

In [6]:
general = data.get_dfs()

In [7]:
def get_doc2vec(dataframes, vector_size):
    '''Returns doc2vec representation of the dataset'''
    # Directory name for saving the datasets
    dataset_dir = 'doc2vec'

    def create_embeddings(df, vec_size):
        '''Create doc2vec embeddings from dataframe'''
        # Create a training corpus
        train_corpus = [gensim.models.doc2vec.TaggedDocument(row.statement, [index]) for index, row in df['train'].iterrows()]

        # Set model parameters
        model = gensim.models.doc2vec.Doc2Vec(vector_size = vec_size, min_count = 2, epochs = 40)

        # Build the vocabulary
        model.build_vocab(train_corpus)

        # Train the model
        model.train(train_corpus, total_examples = model.corpus_count, epochs = model.epochs)

        # Apply model to all statements
        embedded_df = df.copy()
        for dataset in df:
            embedded_df[dataset]['statement'] = df[dataset]['statement'].apply(lambda statement: model.infer_vector(statement))

        return embedded_df

    def create_dataframe(df):
        '''Create an doc2vec dataframe from another dataframe'''
        doc2vec = {}

        for dataset in df.keys():
            # Reduce columns
            doc2vec[dataset] = df[dataset][['label', 'statement']]

            # Preprocess statements
            doc2vec[dataset]['statement'] = doc2vec[dataset]['statement'].map(lambda statement: gensim.utils.simple_preprocess(statement))

        return doc2vec

    def init():
        '''Initialize all logic from the main function'''
        # Apply transformations to dataframe
        doc2vec = create_dataframe(dataframes)
        doc2vec = create_embeddings(doc2vec, vector_size)

        return doc2vec

    return init()

In [8]:
# Recode labels from 6 to 3
def recode(label):
    if label == 'false' or label == 'pants-fire' or label == 'barely-true':
        return 'false'
    elif label == 'true' or label == 'mostly-true':
        return 'true'
    elif label == 'half-true':
        return 'half-true'

for dataset in general.keys():
    general[dataset]['label'] = general[dataset]['label'].apply(lambda label: recode(label))

In [9]:
def get_classifier_score(clf, X_train, X_test, X_validation, y_train = general['train']['label'], y_test = general['test']['label'], y_validation = general['validation']['label']):
    param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}
    gs = GridSearch(model = eval(clf + '()'), param_grid = param_grid)
    gs.fit(X_train, y_train, X_validation, y_validation)
    
    return gs.score(X_test, y_test)

In [0]:
doc2vec_accuracies = {}

In [16]:
for classifier in ['LogisticRegression']:
    doc2vec_accuracies[classifier] = {}
    
    for vec_len in tqdm(np.array(range(1800,5050))[::50][1:]):
        dfs = get_doc2vec(general, vec_len)
        doc2vec_accuracies[classifier][vec_len] = get_classifier_score(classifier, list(dfs['train']['statement'].values), list(dfs['test']['statement'].values), list(dfs['validation']['statement'].values))
    



  0%|          | 0/64 [00:00<?, ?it/s][A[A

  2%|▏         | 1/64 [04:23<4:36:12, 263.06s/it][A[A

  3%|▎         | 2/64 [08:45<4:31:40, 262.90s/it][A[A

  5%|▍         | 3/64 [13:30<4:34:02, 269.55s/it][A[A

  6%|▋         | 4/64 [18:07<4:31:35, 271.59s/it][A[A

  8%|▊         | 5/64 [22:53<4:31:34, 276.18s/it][A[A

  9%|▉         | 6/64 [27:51<4:33:13, 282.65s/it][A[A

 11%|█         | 7/64 [32:47<4:32:14, 286.56s/it][A[A

 12%|█▎        | 8/64 [37:47<4:31:13, 290.59s/it][A[A

 14%|█▍        | 9/64 [42:57<4:31:47, 296.50s/it][A[A

 16%|█▌        | 10/64 [48:23<4:34:43, 305.24s/it][A[A

 17%|█▋        | 11/64 [53:51<4:35:47, 312.23s/it][A[A

 19%|█▉        | 12/64 [59:21<4:35:13, 317.57s/it][A[A

 20%|██        | 13/64 [1:05:00<4:35:22, 323.96s/it][A[A

 22%|██▏       | 14/64 [1:10:54<4:37:23, 332.88s/it][A[A

 23%|██▎       | 15/64 [1:16:47<4:36:51, 339.02s/it][A[A

 25%|██▌       | 16/64 [1:22:57<4:38:41, 348.36s/it][A[A

 27%|██▋       | 17/64 [1

In [23]:
doc2vec_accuracies

{'LogisticRegression': {50: 0.4671936758893281,
  100: 0.46561264822134385,
  150: 0.47193675889328063,
  200: 0.4766798418972332,
  250: 0.48300395256916995,
  300: 0.4806324110671937,
  350: 0.475098814229249,
  400: 0.46877470355731227,
  450: 0.475098814229249,
  500: 0.4743083003952569,
  550: 0.46403162055335967,
  600: 0.47114624505928854,
  650: 0.48300395256916995,
  700: 0.466403162055336,
  750: 0.4782608695652174,
  800: 0.48379446640316204,
  850: 0.47193675889328063,
  900: 0.4758893280632411,
  950: 0.4758893280632411,
  1000: 0.4766798418972332,
  1050: 0.47114624505928854,
  1100: 0.4758893280632411,
  1150: 0.46877470355731227,
  1200: 0.4758893280632411,
  1250: 0.46877470355731227,
  1300: 0.45770750988142295,
  1350: 0.4790513833992095,
  1400: 0.48300395256916995,
  1450: 0.4743083003952569,
  1500: 0.4790513833992095,
  1550: 0.4798418972332016,
  1600: 0.4743083003952569,
  1650: 0.48695652173913045,
  1700: 0.4790513833992095,
  1750: 0.4727272727272727,
  1800

In [3]:
traces = doc2vec_accuracies

# Create traces
def create_scatter(trace):    
    return go.Scatter(
        x = list(traces[trace].keys()),
        y = list(traces[trace].values()),
        mode = 'lines+markers',
        name = trace
    )

trace_data = [create_scatter(trace) for trace in doc2vec_accuracies.keys()]

layout = go.Layout(
    title = 'Test set accuracy of doc2vec dataset with variable vector lengths',
)

fig = go.Figure(data = trace_data, layout = layout)

iplot(fig)

In [10]:
dfs = get_doc2vec(general, 4600)
print(get_classifier_score(classifier, list(dfs['train']['statement'].values), list(dfs['test']['statement'].values), list(dfs['validation']['statement'].values)))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



NameError: name 'classifier' is not defined