In [2]:
import joblib

estimator = joblib.load("estimator.joblib")


In [15]:
import pandas as pd


col_names = [
        'id',               # Column 1: the ID of the statement ([ID].json).
        'label',            # Column 2: the label.
        'statement',        # Column 3: the statement.
        'subjects',         # Column 4: the subject(s).
        'speaker',          # Column 5: the speaker.
        'speaker_job_title', # Column 6: the speaker's job title.
        'state_info',       # Column 7: the state info.
        'party_affiliation', # Column 8: the party affiliation.
        
        'barely_true', # barely true counts.
        'false', # false counts.
        'half_true', # half true counts.
        'mostly_true', # mostly true counts.
        'pants_on_fire', # pants on fire counts.
        
        'context' # Column 14: the context (venue / location of the speech or statement).
    ]
def read_df(tsv_file: str) -> pd.DataFrame:
    df = pd.read_csv(tsv_file, delimiter='\t', dtype=object)
    # replaces all "null" or "NaN" values with an empty string
    df.fillna("", inplace=True)
    # labels the columns in the dataset using the data dictionary described in the README
    df.columns = col_names
    df = df.dropna(subset=["label", "statement"])
    df.drop(["id"], axis=1, inplace=True)
    return df

train_df = read_df('datasets/train.tsv')
test_df = read_df('datasets/test.tsv')
valid_df = read_df('datasets/valid.tsv')

In [17]:
train_df["statement"]

0        When did the decline of coal start? It started...
1        Hillary Clinton agrees with John McCain "by vo...
2        Health care reform legislation is likely to ma...
3        The economic turnaround started at the end of ...
4        The Chicago Bears have had more starting quart...
                               ...                        
10234    There are a larger number of shark attacks in ...
10235    Democrats have now become the party of the [At...
10236    Says an alternative to Social Security that op...
10237    On lifting the U.S. Cuban embargo and allowing...
10238    The Department of Veterans Affairs has a manua...
Name: statement, Length: 10239, dtype: object

In [19]:
estimator.predict(train_df["statement"])[:10]

array([1, 1, 0, 1, 1, 0, 0, 1, 0, 1])

In [21]:
train_df["label"][:10]

0      half-true
1    mostly-true
2          false
3      half-true
4           true
5    barely-true
6      half-true
7      half-true
8    mostly-true
9    mostly-true
Name: label, dtype: object