#### Mateusz Zacharecki 313549
# Warsztaty badawcze 2 - Fact checking project

In [32]:
import category_encoders as ce
import numpy as np
import pandas as pd
import spacy
import torch
import transformers
from doc2vec_sklearn import Doc2VecVectorizer
from scipy.special import expit
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import NMF, TruncatedSVD, LatentDirichletAllocation
from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, CountVectorizer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.preprocessing import FunctionTransformer, TargetEncoder, OneHotEncoder, StandardScaler, MinMaxScaler
from transformers import BertTokenizer, BertModel

In [2]:
train = pd.read_csv("train.tsv", sep='\t')
test = pd.read_csv("test_noy.tsv", sep='\t')

## EDA

In [3]:
train.head()

Unnamed: 0,label,statement,subject,speaker,speaker_job,state,party,context
0,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,a floor speech.
1,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,Denver
2,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,a news release
3,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,an interview on CNN
4,true,The Chicago Bears have had more starting quart...,education,robin-vos,Wisconsin Assembly speaker,Wisconsin,republican,a an online opinion-piece


In [4]:
train.isna().sum()

label             0
statement         0
subject           0
speaker           0
speaker_job    2902
state          2211
party             0
context         100
dtype: int64

In [5]:
train['party'].value_counts()

party
republican                      4509
democrat                        3345
none                            1746
organization                     220
independent                      149
newsmaker                         58
activist                          40
libertarian                       40
journalist                        38
columnist                         36
talk-show-host                    26
state-official                    20
labor-leader                      11
tea-party-member                  10
business-leader                    9
green                              3
education-official                 2
liberal-party-canada               1
government-body                    1
Moderate                           1
democratic-farmer-labor            1
ocean-state-tea-party-action       1
constitution-party                 1
Name: count, dtype: int64

In [6]:
train['state'].value_counts()

state
Texas             1008
Florida           1001
Wisconsin          716
New York           658
Illinois           562
                  ... 
Qatar                1
ohio                 1
Virginia             1
United Kingdom       1
Rhode Island         1
Name: count, Length: 83, dtype: int64

In [7]:
train['speaker_job'].value_counts()

speaker_job
President                                                      497
U.S. Senator                                                   480
Governor                                                       391
President-Elect                                                274
U.S. senator                                                   263
                                                              ... 
Executive director, NARAL Pro-Choice Virginia                    1
State House speaker                                              1
Chief Executive Officer for Concerned Veterans for America       1
Solicitor General                                                1
President, The Whitman Strategy Group                            1
Name: count, Length: 1186, dtype: int64

In [8]:
train['speaker'].value_counts()

speaker
barack-obama                                   493
donald-trump                                   274
hillary-clinton                                239
mitt-romney                                    180
scott-walker                                   150
                                              ... 
lorraine-fende                                   1
nfederation-o-independent-business-virginia      1
jim-moore                                        1
penny-pritzker                                   1
alan-powell                                      1
Name: count, Length: 2915, dtype: int64

In [9]:
train['label'].value_counts()

label
half-true      2123
false          1997
mostly-true    1966
true           1683
barely-true    1657
pants-fire      842
Name: count, dtype: int64

In [10]:
train['subject'].unique()

array(['energy,history,job-accomplishments', 'foreign-policy',
       'health-care', ..., 'animals,elections',
       'retirement,social-security', 'florida,foreign-policy'],
      dtype=object)

In [11]:
test.head()

Unnamed: 0,label,statement,subject,speaker,speaker_job,state,party,context
0,--,Wisconsin is on pace to double the number of l...,jobs,katrina-shankland,State representative,Wisconsin,democrat,a news conference
1,--,Says John McCain has done nothing to help the ...,"military,veterans,voting-record",donald-trump,President-Elect,New York,republican,comments on ABC's This Week.
2,--,Suzanne Bonamici supports a plan that will cut...,"medicare,message-machine-2012,campaign-adverti...",rob-cornilles,consultant,Oregon,republican,a radio show
3,--,When asked by a reporter whether hes at the ce...,"campaign-finance,legal-issues,campaign-adverti...",state-democratic-party-wisconsin,,Wisconsin,democrat,a web video
4,--,Over the past five years the federal governmen...,"federal-budget,pensions,retirement",brendan-doherty,,Rhode Island,republican,a campaign website


In [12]:
label = train.label
train = train.drop(['label'], axis = 1)

In [13]:
test = test.drop(['label'], axis = 1)

In [14]:
# Collapse the six truthfulness ratings into a binary target:
# 1 for 'pants-fire', 0 for every other rating listed in the mapping.
label = label.map({'pants-fire': 1, 'half-true': 0, 'false': 0, 'mostly-true': 0, 'true': 0, 'barely-true': 0})

In [15]:
colnames = list(train.columns)

In [16]:
train = train.fillna("")
test = test.fillna("")

### Target encoder for state and party columns

In [17]:
def target_encode(X, label):
    """Target-encode the ``state`` and ``party`` columns of ``X``.

    A fresh ``TargetEncoder`` is fitted on the two columns against ``label``
    and the encoded values replace the original columns.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing at least the ``state`` and ``party`` columns.
    label : array-like
        Target values aligned with the rows of ``X``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``X`` with ``state`` and ``party`` replaced by their
        encodings. The frame passed in is left untouched.
    """
    # Work on a copy: writing into the caller's frame would make the cell
    # non-idempotent (a second run would re-encode already-encoded values).
    encoded = X.copy()
    encoder = TargetEncoder()
    encoded[['state', 'party']] = encoder.fit_transform(encoded[['state', 'party']], label)
    return encoded

### Statement length extractor (for the statement column)

In [18]:
def extract_text_features(s):
    """Return the character length of every entry of ``s`` as a single-column 2-D array.

    Entries are converted to strings first, so non-string values are measured
    by the length of their string form.
    """
    lengths = s.astype('str').str.len().to_numpy()
    # One feature column, one row per input entry.
    return lengths.reshape(-1, 1)

### One hot encoder

In [19]:
def one_hot_encoder(X):
    """One-hot encode a Series of comma-separated category strings.

    Parameters
    ----------
    X : pandas.Series
        Each entry is a string such as ``"economy,jobs"``; missing entries
        are allowed and produce an all-zero row.

    Returns
    -------
    pandas.DataFrame
        One 0/1 integer column per distinct non-empty category, indexed like
        ``X``. Columns are sorted alphabetically so the layout is
        reproducible from run to run (set iteration order is not).
    """
    tokens = X.str.split(',')
    # Empty tokens (from "a,,b" or "") are not categories.
    categories = sorted({tok for row in tokens.dropna() for tok in row if tok})
    indicators = {
        cat: tokens.apply(lambda row, cat=cat: int(cat in row) if isinstance(row, list) else 0)
        for cat in categories
    }
    # Passing the index keeps one row per input entry even when no category exists.
    return pd.DataFrame(indicators, index=tokens.index)

### Lemmatizer for statement

In [20]:
def lemmatize_pos(x):
    """Turn each parsed document into a string of interleaved lemmas and tags.

    For every document in ``x`` the tokens are rendered as
    ``"<lemma> <tag>"`` pairs and joined with single spaces; the result is a
    list with one string per document.
    """
    return [
        " ".join(" ".join((token.lemma_, token.tag_)) for token in doc)
        for doc in x
    ]

### NER for statement

In [21]:
def ner(x):
    """Turn each parsed document into a space-separated string of its entity labels.

    Only the label of each entity in ``doc.ents`` is kept (not the entity
    text); the result is a list with one string per document.
    """
    return [" ".join(entity.label_ for entity in doc.ents) for doc in x]

In [22]:
# Parse every statement once with the spaCy "en_core_web_lg" pipeline and keep
# the resulting parsed documents in a new "statement_spacy" column, so the
# lemma/POS and NER transformers in the pipelines can read them from there.
nlp = spacy.load("en_core_web_lg")
train["statement_spacy"] = list(nlp.pipe(train.statement))
test["statement_spacy"] = list(nlp.pipe(test.statement))

In [23]:
LemmatizerPos = FunctionTransformer(lemmatize_pos)
Nerer = FunctionTransformer(ner)

### BERT embeddings for statement

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def get_bert_embedding(text):
    """Embed ``text`` with the module-level BERT ``tokenizer`` and ``model``.

    The text is tokenized (truncated to at most 128 tokens), passed through
    the model, and the last hidden state is averaged over the token
    dimension (``dim=1``).

    Parameters
    ----------
    text : str
        The statement to embed.

    Returns
    -------
    numpy.ndarray
        The mean-pooled last hidden state after ``squeeze()``; presumably a
        1-D vector for a single input string -- TODO confirm for batched input.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    # Inference only: disable autograd so no computation graph is built and
    # kept alive for each of the thousands of statements.
    with torch.no_grad():
        outputs = model(**inputs)
    embedding = outputs.last_hidden_state.mean(dim=1).squeeze().detach().numpy()
    return embedding

statement_bert = np.array([get_bert_embedding(statement) for statement in train['statement']])
statement_bert_test = np.array([get_bert_embedding(statement) for statement in test['statement']])

In [None]:
def minmax_scaler(X, feature_range=(0, 1)):
    """Rescale every column of ``X`` linearly into ``feature_range``.

    Parameters
    ----------
    X : numpy.ndarray or pandas.DataFrame
        Data to scale; minima and maxima are taken along axis 0.
    feature_range : tuple of (low, high), default (0, 1)
        Target interval for each column.

    Returns
    -------
    Same container type as ``X``
        The scaled data. A constant column (max == min) is mapped to
        ``feature_range[0]`` instead of producing NaN/inf from a division
        by zero.
    """
    X_min = np.min(X, axis=0)
    X_max = np.max(X, axis=0)

    # Guard against zero-width columns: dividing by 1 leaves (X - X_min) == 0.
    span = X_max - X_min
    span = np.where(span == 0, 1, span)

    X_scaled = (X - X_min) / span
    X_scaled = X_scaled * (feature_range[1] - feature_range[0]) + feature_range[0]

    return X_scaled

In [None]:
def standard_scaler(X):
    """Standardize every column of ``X`` to zero mean and unit variance.

    Mean and standard deviation are computed along axis 0 with ``np.mean``
    and ``np.std``.

    Parameters
    ----------
    X : numpy.ndarray or pandas.DataFrame
        Data to standardize.

    Returns
    -------
    Same container type as ``X``
        The standardized data. A zero-variance column is mapped to all zeros
        instead of producing NaN from a division by zero.
    """
    center = np.mean(X, axis=0)
    spread = np.std(X, axis=0)
    # Guard against constant columns: dividing by 1 leaves (X - center) == 0.
    spread = np.where(spread == 0, 1.0, spread)

    X_scaled = (X - center) / spread

    return X_scaled

In [None]:
# Learn the per-dimension min/max on the TRAINING embeddings only and apply
# that same transformation to the test embeddings. Scaling the test set with
# its own min/max would put train and test features on different scales.
bert_scaler = MinMaxScaler().fit(statement_bert)
statement_scaled = pd.DataFrame(bert_scaler.transform(statement_bert))
statement_scaled_test = pd.DataFrame(bert_scaler.transform(statement_bert_test))

In [None]:
train_bert = pd.concat([train, statement_scaled], axis=1)
train_bert.columns = train_bert.columns.astype(str)
test_bert = pd.concat([test, statement_scaled_test], axis=1)
test_bert.columns = test_bert.columns.astype(str)

## Pipeline

In [25]:
text_feature_pipeline = Pipeline([
    ('function', FunctionTransformer(func=extract_text_features, validate=False)),
    ('scaler', MinMaxScaler())
])

In [26]:
def _build_column_transformer(subject, speaker, speaker_job, context):
    """Build the shared feature-extraction ColumnTransformer.

    All four variants use the same statement features (length, lemma+POS
    TF-IDF, NER label counts) and the same target encoding of ``state`` and
    ``party``; they differ only in the vectorizer applied to each of the four
    metadata text columns, which is what the parameters supply.

    Parameters
    ----------
    subject, speaker, speaker_job, context : transformer
        Unfitted vectorizer instance for the column of the same name.

    Returns
    -------
    ColumnTransformer
        Unfitted transformer; unlisted columns are passed through.
    """
    return ColumnTransformer(
        transformers=[
            ('function', text_feature_pipeline, 'statement'),
            ("statement", make_pipeline(LemmatizerPos, TfidfVectorizer()), "statement_spacy"),
            ("statement_ner", make_pipeline(Nerer, CountVectorizer()), "statement_spacy"),
            ('subject', subject, 'subject'),
            ('speaker', speaker, 'speaker'),
            ('speaker_job', speaker_job, 'speaker_job'),
            ('context', context, 'context'),
            ('cat', ce.TargetEncoder(), ['state', 'party']),
        ],
        remainder='passthrough'
    )


# TF-IDF on the metadata columns, capped at 500 terms each.
ct_tfidf = _build_column_transformer(
    subject=TfidfVectorizer(max_features=500, stop_words='english'),
    speaker=TfidfVectorizer(max_features=500, stop_words='english'),
    speaker_job=TfidfVectorizer(max_features=500, stop_words='english'),
    context=TfidfVectorizer(max_features=500, stop_words='english'),
)

# Raw term counts on the metadata columns (binary indicators for subject).
ct_count = _build_column_transformer(
    subject=CountVectorizer(stop_words='english', binary=True),
    speaker=CountVectorizer(stop_words='english'),
    speaker_job=CountVectorizer(stop_words='english'),
    context=CountVectorizer(stop_words='english'),
)

# Feature hashing into 100 buckets per metadata column.
ct_hashing = _build_column_transformer(
    subject=HashingVectorizer(n_features=100),
    speaker=HashingVectorizer(n_features=100),
    speaker_job=HashingVectorizer(n_features=100),
    context=HashingVectorizer(n_features=100),
)

# Doc2Vec embeddings of the metadata columns.
ct_doc2vec = _build_column_transformer(
    subject=Doc2VecVectorizer(),
    speaker=Doc2VecVectorizer(),
    speaker_job=Doc2VecVectorizer(),
    context=Doc2VecVectorizer(),
)

In [27]:
svd = TruncatedSVD(n_components = 100, random_state = 124)
nmf = NMF(n_components = 100, max_iter=1000, random_state = 124)
lda = LatentDirichletAllocation(n_components = 100, random_state = 124)

In [28]:
union = FeatureUnion([
    ('svd', svd),
    ('nmf', nmf),
    ('identity', 'passthrough')
])

union_svd = FeatureUnion([
    ('svd', svd),
    ('identity', 'passthrough')
])


In [29]:
# Each pipeline has the same three stages:
#   1. "transform": one of the ColumnTransformers defined above,
#   2. "union":     decomposition components concatenated with the untouched features,
#   3. "model":     logistic regression on the resulting feature matrix.
#
# NOTE(review): `union` is the same object in pipeline_tfidf and pipeline_count,
# and `union_svd` the same object in pipeline_hashing and pipeline_doc2vec.
# Fitting two such pipelines directly (without cloning) would presumably
# overwrite each other's fitted union -- confirm before relying on both at once.
# NOTE(review): only the TF-IDF model raises max_iter to 5000; the other three
# use LogisticRegression's default iteration limit -- confirm they converge.

# TF-IDF features -> SVD + NMF + passthrough -> logistic regression.
pipeline_tfidf = Pipeline(steps=[
    ("transform", ct_tfidf),
    ('union', union),
    ('model', LogisticRegression(max_iter = 5000, random_state=124))
])

# Count features -> SVD + NMF + passthrough -> logistic regression.
pipeline_count = Pipeline(steps=[
    ("transform", ct_count),
    ('union', union),
    ('model', LogisticRegression(random_state=124))
])

# Hashed features -> SVD + passthrough (no NMF) -> logistic regression.
pipeline_hashing = Pipeline(steps=[
    ("transform", ct_hashing),
    ('union', union_svd),
    ('model', LogisticRegression(random_state=124))
])

# Doc2Vec features -> SVD + passthrough (no NMF) -> logistic regression.
pipeline_doc2vec = Pipeline(steps=[
    ("transform", ct_doc2vec),
    ('union', union_svd),
    ('model', LogisticRegression(random_state=124))
])

## Training and testing

In [45]:
# pipeline_tfidf.fit(X_train, y_train)
# y_pred = pipeline_tfidf.predict_proba(X_test)[:, 1]
# roc_auc_score(y_test, y_pred)

0.783980528608919

In [61]:
# 5-fold cross-validation of the TF-IDF pipeline; the metric is ROC AUC, not accuracy.
scores_tfidf = cross_val_score(pipeline_tfidf, train, label, cv=5, scoring='roc_auc')
print("Cross-validation scores:", scores_tfidf)
print("Mean ROC AUC:", scores_tfidf.mean())

Cross-validation scores: [0.73884954 0.76274858 0.74009386 0.73831628 0.74788114]
Mean accuracy: 0.7455778800610146


In [47]:
# Refit the TF-IDF pipeline on the full training set and score the test set.
pipeline_tfidf.fit(train, label)
# decision_function returns raw decision scores, not probabilities; the
# disabled expit line below would presumably map them to probabilities.
y_pred = pipeline_tfidf.decision_function(test)
#y_prob = expit(y_pred)
y_pred = pd.DataFrame(y_pred)
# Write one score per row without the row index.
y_pred.to_csv("313549_prediction.csv", index = False, index_label = False)

In [46]:
# 5-fold cross-validation of the count-vectorizer pipeline; the metric is ROC AUC, not accuracy.
scores_count = cross_val_score(pipeline_count, train, label, cv=5, scoring='roc_auc')
print("Cross-validation scores:", scores_count)
print("Mean ROC AUC:", scores_count.mean())

Cross-validation scores: [0.73634992 0.74628726 0.7272927  0.73296072 0.72466212]
Mean accuracy: 0.7335105437591876


In [47]:
# 5-fold cross-validation of the hashing-vectorizer pipeline; the metric is ROC AUC, not accuracy.
scores_hashing = cross_val_score(pipeline_hashing, train, label, cv=5, scoring='roc_auc')
print("Cross-validation scores:", scores_hashing)
print("Mean ROC AUC:", scores_hashing.mean())

Cross-validation scores: [0.71376496 0.7383862  0.73450002 0.7308987  0.75631237]
Mean accuracy: 0.7347724497626626


In [48]:
# 5-fold cross-validation of the Doc2Vec pipeline; the metric is ROC AUC, not accuracy.
scores_doc2vec = cross_val_score(pipeline_doc2vec, train, label, cv=5, scoring='roc_auc')
print("Cross-validation scores:", scores_doc2vec)
print("Mean ROC AUC:", scores_doc2vec.mean())

Cross-validation scores: [0.71339885 0.76163734 0.71112646 0.73111027 0.72617153]
Mean accuracy: 0.7286888905073028


### Voting

In [35]:
# Soft-voting ensemble over the TF-IDF, count and hashing pipelines.
# The Doc2Vec pipeline is not part of the ensemble.
voter = VotingClassifier(estimators=[
        ('tfidf', pipeline_tfidf), ('count', pipeline_count), ('hashing', pipeline_hashing)], voting='soft')

In [36]:
scores_voter = cross_val_score(voter, train, label, cv=5, scoring='roc_auc')


NameError: name 'scores_doc2vec' is not defined

In [37]:
# scores_voter was computed with scoring='roc_auc', so report it as ROC AUC.
print("Cross-validation scores:", scores_voter)
print("Mean ROC AUC:", scores_voter.mean())

Cross-validation scores: [0.73609428 0.75873997 0.74149075 0.74132879 0.75146204]
Mean accuracy: 0.7458231661298125


In [43]:
voter.fit(train, label)
y_pred = voter.predict_proba(test)

In [46]:
y_pred = pd.DataFrame(y_pred)
y_pred = y_pred.loc[:,1]

In [49]:
y_pred.to_csv("313549_prediction_voting.txt", sep=" ", index=False)