In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
column_names=[
        'id',                # Column 1: the ID of the statement ([ID].json).
        'label',             # Column 2: the label.
        'statement',         # Column 3: the statement.
        'subjects',          # Column 4: the subject(s).
        'speaker',           # Column 5: the speaker.
        'speaker_job_title', # Column 6: the speaker's job title.
        'state_info',        # Column 7: the state info.
        'party_affiliation', # Column 8: the party affiliation.
        
        # Column 9-13: the total credit history count, including the current statement.
        'count_1', # barely true counts.
        'count_2', # false counts.
        'count_3', # half true counts.
        'count_4', # mostly true counts.
        'count_5', # pants on fire counts.

        'context' # Column 14: the context (venue / location of the speech or statement).
]

# The data is already split into train, test, and validation sets.
# [REFERENCE]: https://www.cs.ucsb.edu/~william/data/liar_dataset.zip
# We are using this for convenience and consistency
train_data = pd.read_csv('./liar_dataset/train.tsv', sep='\t', header=None, names=column_names)
test_data  = pd.read_csv('./liar_dataset/test.tsv',  sep='\t', header=None, names=column_names)
valid_data = pd.read_csv('./liar_dataset/valid.tsv', sep='\t', header=None, names=column_names)

In [3]:
def drop_columns(data: pd.DataFrame) -> pd.DataFrame:
    """Drops columns that are not useful for training"""
    data.drop(columns=[f'count_{i+1}' for i in range(5)], inplace=True)
    return data

train_data_bin = drop_columns(train_data)
test_data_bin  = drop_columns(test_data)
valid_data_bin = drop_columns(valid_data)

In [4]:
def binarize_labels(data_df: pd.DataFrame,six_way:bool=False) -> pd.DataFrame:
    """Reduces 6-way classification to binary classification"""
    # We may change this if we want
    # false_labels= ['barely-true', 'false', 'pants-fire']
    data= data_df.copy(deep=True)
    if not six_way:
        true_labels= ['true', 'mostly-true', 'half-true']
        data['label'] = data['label'].apply(lambda x: 1 if x in true_labels else 0)
    else:
        numerical={'pants-fire': 0, 'false': 1, 'barely-true': 2, 'half-true': 3, 'mostly-true': 4, 'true': 5}
        data['label'] = data['label'].map(numerical)
    
    return data

In [5]:
train_data_6= binarize_labels(train_data, six_way=True)
test_data_6= binarize_labels(test_data, six_way=True)
valid_data_6= binarize_labels(valid_data, six_way=True)

In [6]:
## make all the columns string


In [49]:
"""X_label_6 = train_data_6['label']
X_statement_6 = train_data_6['statement']
X_subjects_6 = train_data_6['subjects']
X_speaker_6 = train_data_6['speaker']
X_speaker_job_title_6 = train_data_6['speaker_job_title']
X_state_info_6 = train_data_6['state_info']
X_party_affiliation_6 = train_data_6['party_affiliation']
X_context_6 = train_data_6['context']

X_label_6_test = test_data_6['label']
X_statement_6_test = test_data_6['statement']
X_subjects_6_test = test_data_6['subjects']
X_speaker_6_test = test_data_6['speaker']
X_speaker_job_title_6_test = test_data_6['speaker_job_title']
X_state_info_6_test = test_data_6['state_info']
X_party_affiliation_6_test = test_data_6['party_affiliation']
X_context_6_test = test_data_6['context']

X_label_6_valid = valid_data_6['label']
X_statement_6_valid = valid_data_6['statement']
X_subjects_6_valid = valid_data_6['subjects']
X_speaker_6_valid = valid_data_6['speaker']
X_speaker_job_title_6_valid = valid_data_6['speaker_job_title']
X_state_info_6_valid = valid_data_6['state_info']
X_party_affiliation_6_valid = valid_data_6['party_affiliation']
X_context_6_valid = valid_data_6['context']"""

train_data_6_f = train_data_6.drop(columns=['label'])
test_data_6_f = test_data_6.drop(columns=['label'])
valid_data_6_f = valid_data_6.drop(columns=['label'])

train_data_6_f['features'] = train_data_6.astype(str)[['statement', 'subjects', 'speaker', 'speaker_job_title', 'state_info', 'context']].apply(lambda x: ' '.join(x), axis = 1)
test_data_6_f['features'] = test_data_6.astype(str)[['statement', 'subjects', 'speaker', 'speaker_job_title', 'state_info', 'context']].apply(lambda x: ' '.join(x), axis = 1)
valid_data_6_f['features'] = valid_data_6.astype(str)[['statement', 'subjects', 'speaker', 'speaker_job_title', 'state_info', 'context']].apply(lambda x: ' '.join(x), axis = 1)

In [50]:
train_data_6_f['republican'] = train_data_6_f['party_affiliation'].apply(lambda x: 1 if x == 'republican' else 0)
test_data_6_f['republican'] = test_data_6_f['party_affiliation'].apply(lambda x: 1 if x == 'republican' else 0)
valid_data_6_f['republican'] = valid_data_6_f['party_affiliation'].apply(lambda x: 1 if x == 'republican' else 0)

train_data_6_f

Unnamed: 0,id,statement,subjects,speaker,speaker_job_title,state_info,party_affiliation,context,features,republican
0,2635.json,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,a mailer,Says the Annies List political group supports ...,1
1,10540.json,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,a floor speech.,When did the decline of coal start? It started...,0
2,324.json,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,Denver,"Hillary Clinton agrees with John McCain ""by vo...",0
3,1123.json,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,a news release,Health care reform legislation is likely to ma...,0
4,9028.json,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,an interview on CNN,The economic turnaround started at the end of ...,0
...,...,...,...,...,...,...,...,...,...,...
10235,5473.json,There are a larger number of shark attacks in ...,"animals,elections",aclu-florida,,Florida,none,"interview on ""The Colbert Report""",There are a larger number of shark attacks in ...,0
10236,3408.json,Democrats have now become the party of the [At...,elections,alan-powell,,Georgia,republican,an interview,Democrats have now become the party of the [At...,1
10237,3959.json,Says an alternative to Social Security that op...,"retirement,social-security",herman-cain,,Georgia,republican,a Republican presidential debate,Says an alternative to Social Security that op...,1
10238,2253.json,On lifting the U.S. Cuban embargo and allowing...,"florida,foreign-policy",jeff-greene,,Florida,democrat,a televised debate on Miami's WPLG-10 against ...,On lifting the U.S. Cuban embargo and allowing...,0


In [51]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_data_6_f['features'])
X_test = vectorizer.transform(test_data_6_f['features'])
X_valid = vectorizer.transform(valid_data_6_f['features'])


y_train = train_data_6['label']
y_test = test_data_6['label']
y_valid = valid_data_6['label']








In [52]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=0).fit(X_train, y_train)
y_pred = clf.predict(X_valid)

from sklearn.metrics import accuracy_score
accuracy_score(y_valid, y_pred)

from sklearn.metrics import classification_report
print(classification_report(y_valid, y_pred))

from sklearn.metrics import confusion_matrix
confusion_matrix(y_valid, y_pred)

              precision    recall  f1-score   support

           0       0.65      0.50      0.56       616
           1       0.62      0.75      0.68       668

    accuracy                           0.63      1284
   macro avg       0.63      0.62      0.62      1284
weighted avg       0.63      0.63      0.62      1284



array([[306, 310],
       [165, 503]])

In [53]:
X_train[0].toarray().flatten()

X_train_f = np.array([x.toarray().flatten() for x in X_train])

X_train_f_mean = np.array([np.mean(x) for x in X_train_f])

X_train_f_df = pd.DataFrame(X_train_f_mean)

print(X_train_f_df)

X_test[0].toarray().flatten()

X_test_f = np.array([x.toarray().flatten() for x in X_test])

X_test_f_mean = np.array([np.mean(x) for x in X_test_f])

X_test_f_df = pd.DataFrame(X_test_f_mean)

print(X_test_f_df)

X_valid[0].toarray().flatten()
X_valid_f = np.array([x.toarray().flatten() for x in X_valid])
X_valid_f_mean = np.array([np.mean(x) for x in X_valid_f])
X_valid_f_df = pd.DataFrame(X_valid_f_mean)


              0
0      0.000722
1      0.001004
2      0.000915
3      0.000771
4      0.000792
...         ...
10235  0.000837
10236  0.000807
10237  0.000932
10238  0.000850
10239  0.001052

[10240 rows x 1 columns]
             0
0     0.000777
1     0.000760
2     0.000934
3     0.000909
4     0.001022
...        ...
1262  0.000805
1263  0.000842
1264  0.001109
1265  0.000987
1266  0.000986

[1267 rows x 1 columns]


In [61]:
print(X_train_f.shape)

(10240, 5000)


In [64]:
from econml.dml import CausalForestDML
from sklearn.ensemble import RandomForestRegressor

cf = CausalForestDML(
    discrete_treatment=True,
    max_depth=15,             
    n_estimators=200,         
    min_samples_split=10,     
    min_samples_leaf=5,       
    random_state=42,           
)

cf.fit(Y=y_train, X=X_train_f_df, T=train_data_6_f['republican'], cache_values=True)
print(cf.summary())
y_pred=cf.effect(X_train_f_df)
print(y_pred)                  

Not all column names are strings. Coercing to strings for now.


Population summary of CATE predictions on Training Data
               Uncertainty of Mean Point Estimate               
mean_point stderr_mean zstat  pvalue ci_mean_lower ci_mean_upper
----------------------------------------------------------------
    -0.108       0.116 -0.934   0.35        -0.336         0.119
      Distribution of Point Estimate     
std_point pct_point_lower pct_point_upper
-----------------------------------------
    0.115          -0.323           0.131
     Total Variance of Point Estimate     
stderr_point ci_point_lower ci_point_upper
------------------------------------------
       0.164         -0.425          0.226
        Doubly Robust ATE on Training Data Results        
    point_estimate stderr  zstat  pvalue ci_lower ci_upper
----------------------------------------------------------
ATE         -0.107   0.01 -10.846    0.0   -0.126   -0.088
     Doubly Robust ATT(T=0) on Training Data Results     
    point_estimate stderr zstat  pvalue ci_lower c

In [None]:
from econml.dml import CausalForestDML
from sklearn.ensemble import RandomForestRegressor

cf = CausalForestDML(model_t=RandomForestRegressor(random_state=0),
                     model_y=RandomForestRegressor(random_state=0),
                     criterion='mse', 
                     n_estimators=1000,
                     min_impurity_decrease=0.001,random_state=0)

cf.fit(Y=y_train, X=X_train_f_df, T=train_data_6_f['republican'])
lb, ub = cf.effect_interval(X_train_f_df, alpha=0.01)
y_effect=cf.effect(X_train_f_df)
y_eff_test = cf.effect(X_test_f_df)
y_eff_valid = cf.effect(X_valid_f_df)
print(y_effect.shape)
##y_pred = np.array([1 if x > 0 else 0 for x in y_pred])  

In [55]:
train_effect = y_effect
test_effect = y_eff_test
valid_effect = y_eff_valid

In [57]:
train_data_6_f['effect'] = train_effect
test_data_6_f['effect'] = test_effect
valid_data_6_f['effect'] = valid_effect


In [None]:
# Step 3: Preprocess the Dataset
def preprocess_function(examples):
    return tokenizer(examples["statement"], padding="max_length", truncation=True, return_tensors="pt")

tokenized_dataset= dataset_dict.map(preprocess_function, batched=True, remove_columns=['statement'])