## Relatable Old Model

In [13]:
import pandas as pd
import nltk
import sklearn as sk
import string
from time import perf_counter
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn import preprocessing
from joblib import dump, load
import numpy as np
import pickle

np.set_printoptions(suppress=True)
pd.set_option('display.float_format', lambda x: '%.10f' % x)

In [14]:
def pre_process(s):
    # remove punctuation
    s = "".join([char for char in s if char not in string.punctuation])

    # filter stopwords
    from nltk.corpus import stopwords
    stopwords = stopwords.words('english')
    s_tokenized = nltk.word_tokenize(s)
    s_tokenized_stopwords_removed = [word for word in s_tokenized if word not in stopwords]
    s_tokenized_stopwords_removed_joined = " ".join(s_tokenized_stopwords_removed)
    return s_tokenized_stopwords_removed_joined

In [15]:
def tokenize_tag_combine(s):
    s_tokenized = nltk.word_tokenize(s)
    s_pos_tagged = nltk.pos_tag(s_tokenized)

    new_s = ''
    for word, pos in s_pos_tagged:
        new_s = new_s + word + "_" + pos + " "

    return new_s

In [16]:
def prepare_data():
    # read in data
    df = pd.read_csv(r"../classifier/data/combined.csv")
    df.drop("Unnamed: 0", axis=1, inplace=True)

    # get part of speech
    test_string = "This is a test string."
    test_string = pre_process(test_string)
    test_string = tokenize_tag_combine(test_string)

    df['pos_tagged'] = df['text'].apply(tokenize_tag_combine)
    

    df.to_csv(r"../classifier/data/combined_pos_tagged.csv")
    print(df.head())

In [17]:
n_gram_size = 1
df = pd.read_csv("../classifier/data/combined_pos_tagged.csv")
df.drop("Unnamed: 0", axis=1, inplace=True)
df = df[df['label'].notna()]

In [18]:
# print(df.columns)
df.head()

Unnamed: 0,text,label,pos_tagged
0,exposure to violent video games causes at leas...,0.0,exposure_NN to_TO violent_VB video_NN games_NN...
1,video game violence is not related to serious ...,0.0,video_NN game_NN violence_NN is_VBZ not_RB rel...
2,some violent video games may actually have a p...,0.0,some_DT violent_JJ video_NNS games_NNS may_MD ...
3,exposure to violent video games causes both sh...,0.0,exposure_NN to_TO violent_VB video_NN games_NN...
4,they increase the violent tendencies among youth,0.0,they_PRP increase_VBP the_DT violent_JJ tenden...


In [19]:
X = df['pos_tagged'].values.astype('U')
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [20]:
lr_classifier = Pipeline(
                            [
                                ('tvect', TfidfVectorizer(ngram_range=(1, n_gram_size))),
                                ('ttrans', TfidfTransformer()),
                                ('scaler', preprocessing.StandardScaler(with_mean=False)),
                                ('cls', LogisticRegression(class_weight='balanced', dual=True, solver='liblinear', max_iter=10000))
                            ]
                        )

lr_classifier.fit(X_train, y_train)



Pipeline(steps=[('tvect', TfidfVectorizer()), ('ttrans', TfidfTransformer()),
                ('scaler', StandardScaler(with_mean=False)),
                ('cls',
                 LogisticRegression(class_weight='balanced', dual=True,
                                    max_iter=10000, solver='liblinear'))])

In [21]:
#dump(lr_classifier, r'../classifier/models/model_{}-gram.pkl'.format(n_gram_size))
#dump(lr_classifier, r'../classifier/models/model_{}-gram.joblib'.format(n_gram_size))

import gzip, pickle
with gzip.open('../classifier/models/model_test.pkl', 'wb') as ofp:
    pickle.dump(lr_classifier, ofp)

In [22]:
y_pred = lr_classifier.predict(X_test)
print("Accuracy: {}".format(accuracy_score(y_test, y_pred)))

# TODO Need to find a way to oneHotEncode labels
# print("ROC score: {}".format(roc_auc_score(y_test, y_pred, multi_class='ovr')))

Accuracy: 0.9253772060520848


In [23]:
print("Classification Report:")
print(classification_report(y_test, y_pred))
print()
print()

Classification Report:
              precision    recall  f1-score   support

         0.0       0.25      0.33      0.29       688
         1.0       0.34      0.56      0.42      1408
         2.0       0.98      0.95      0.97     45281
         3.0       0.01      0.01      0.01       276

    accuracy                           0.93     47653
   macro avg       0.39      0.47      0.42     47653
weighted avg       0.95      0.93      0.93     47653





In [None]:
import gzip
# lr_classifier = load('../classifier/models/model_{}-gram.joblib'.format(n_gram_size))
# lr_classifier = pickle.load(open('../classifier/models/model_{}-gram.pkl'.format(n_gram_size), 'rb'))

lr_classifier = None
with gzip.open('../classifier/models/model_test.pkl', 'rb') as ifp:
    lr_classifier = pickle.load(ifp)

In [None]:
new_test_string = "exposure to violent video games causes at least a temporary increase in aggression"

print("Raw string input: {}".format(new_test_string))

In [None]:
new_test_string = pre_process(new_test_string)
new_test_string = tokenize_tag_combine(new_test_string)
new_val = lr_classifier.predict_proba([new_test_string])

print("Preprocessed string input: {}".format(new_test_string))

In [None]:
new_val_df = pd.DataFrame(new_val, columns=['claim', 'premise', 'both', 'neither'])

print("Probability matrix:")
print(new_val_df.head())
print()
print("Probability sum: {}".format(np.sum(new_val)))

In [None]:
print("Cross validating")

accuracy = cross_val_score(lr_classifier, X, y, scoring='accuracy', cv=10)

print("Cross validation array: {}".format(accuracy))
print("Cross validation average: {}".format(np.mean(accuracy) * 100))

## Grabbing the training / testing data

In [2]:
import pandas as pd

evidence_df = pd.read_csv(r'../IBM_Debater_(R)_CE-EMNLP-2015.v3/current_working_dataset/evidence.csv')

In [3]:
evidence_df

Unnamed: 0,Topic,Claim original text,Evidence,Type of Evidence Information
0,This house believes that the sale of violent v...,Exposure to violent video games causes at leas...,A 2001 study found that exposure to violent vi...,[STUDY]
1,This house believes that the sale of violent v...,Exposure to violent video games causes at leas...,The most recent large scale meta-anlysis-- exa...,[STUDY]
2,This house believes that the sale of violent v...,Exposure to violent video games causes at leas...,One study did find an increase in reports of b...,[STUDY]
3,This house believes that the sale of violent v...,Exposure to violent video games causes at leas...,when one combines all relevant empirical studi...,[STUDY]
4,This house believes that the sale of violent v...,Exposure to violent video games causes at leas...,Research published in 2001 suggested that viol...,[STUDY]
...,...,...,...,...
4687,This house would enforce term limits on the le...,The lack of mandatory limits to tenure is rega...,"The historian Mercy Otis Warren, warned that ""...",[EXPERT]
4688,This house would enforce term limits on the le...,contact with the affairs of state is one of th...,"According to historian Garrett Fagan, office h...",[EXPERT]
4689,This house would enforce term limits on the le...,contact with the affairs of state is one of th...,"James Fenimore Cooper, the novelist, described...",[EXPERT]
4690,This house would enforce term limits on the le...,term limits continue to garner popular support,"As of 2002, U.S. Term Limits found that in the...",[STUDY]


### 2 Possibilities
    - Support exists: n1 -> n2
    - Support exists: n2 -> n1

In [17]:
from copy import deepcopy

n1_n2_df = deepcopy(evidence_df)
n2_n1_df = deepcopy(evidence_df)

In [18]:
n1_n2_df["Statement"]  = n1_n2_df["Claim original text"] + " " + n1_n2_df["Evidence"]
n1_n2_df["Label"] = "n1 -> n2"
n1_n2_df

Unnamed: 0,Topic,Claim original text,Evidence,Type of Evidence Information,Statement,Label
0,This house believes that the sale of violent v...,Exposure to violent video games causes at leas...,A 2001 study found that exposure to violent vi...,[STUDY],Exposure to violent video games causes at leas...,n1 -> n2
1,This house believes that the sale of violent v...,Exposure to violent video games causes at leas...,The most recent large scale meta-anlysis-- exa...,[STUDY],Exposure to violent video games causes at leas...,n1 -> n2
2,This house believes that the sale of violent v...,Exposure to violent video games causes at leas...,One study did find an increase in reports of b...,[STUDY],Exposure to violent video games causes at leas...,n1 -> n2
3,This house believes that the sale of violent v...,Exposure to violent video games causes at leas...,when one combines all relevant empirical studi...,[STUDY],Exposure to violent video games causes at leas...,n1 -> n2
4,This house believes that the sale of violent v...,Exposure to violent video games causes at leas...,Research published in 2001 suggested that viol...,[STUDY],Exposure to violent video games causes at leas...,n1 -> n2
...,...,...,...,...,...,...
4687,This house would enforce term limits on the le...,The lack of mandatory limits to tenure is rega...,"The historian Mercy Otis Warren, warned that ""...",[EXPERT],The lack of mandatory limits to tenure is rega...,n1 -> n2
4688,This house would enforce term limits on the le...,contact with the affairs of state is one of th...,"According to historian Garrett Fagan, office h...",[EXPERT],contact with the affairs of state is one of th...,n1 -> n2
4689,This house would enforce term limits on the le...,contact with the affairs of state is one of th...,"James Fenimore Cooper, the novelist, described...",[EXPERT],contact with the affairs of state is one of th...,n1 -> n2
4690,This house would enforce term limits on the le...,term limits continue to garner popular support,"As of 2002, U.S. Term Limits found that in the...",[STUDY],term limits continue to garner popular support...,n1 -> n2


In [19]:
n1_n2_df.drop(["Topic", "Claim original text", "Evidence", "Type of Evidence Information"], axis=1, inplace=True)
n1_n2_df

Unnamed: 0,Statement,Label
0,Exposure to violent video games causes at leas...,n1 -> n2
1,Exposure to violent video games causes at leas...,n1 -> n2
2,Exposure to violent video games causes at leas...,n1 -> n2
3,Exposure to violent video games causes at leas...,n1 -> n2
4,Exposure to violent video games causes at leas...,n1 -> n2
...,...,...
4687,The lack of mandatory limits to tenure is rega...,n1 -> n2
4688,contact with the affairs of state is one of th...,n1 -> n2
4689,contact with the affairs of state is one of th...,n1 -> n2
4690,term limits continue to garner popular support...,n1 -> n2


In [20]:
n2_n1_df["Statement"]  = n2_n1_df["Evidence"] + " " + n2_n1_df["Claim original text"]
n2_n1_df["Label"] = "n2 -> n1"
n2_n1_df

Unnamed: 0,Topic,Claim original text,Evidence,Type of Evidence Information,Statement,Label
0,This house believes that the sale of violent v...,Exposure to violent video games causes at leas...,A 2001 study found that exposure to violent vi...,[STUDY],A 2001 study found that exposure to violent vi...,n2 -> n1
1,This house believes that the sale of violent v...,Exposure to violent video games causes at leas...,The most recent large scale meta-anlysis-- exa...,[STUDY],The most recent large scale meta-anlysis-- exa...,n2 -> n1
2,This house believes that the sale of violent v...,Exposure to violent video games causes at leas...,One study did find an increase in reports of b...,[STUDY],One study did find an increase in reports of b...,n2 -> n1
3,This house believes that the sale of violent v...,Exposure to violent video games causes at leas...,when one combines all relevant empirical studi...,[STUDY],when one combines all relevant empirical studi...,n2 -> n1
4,This house believes that the sale of violent v...,Exposure to violent video games causes at leas...,Research published in 2001 suggested that viol...,[STUDY],Research published in 2001 suggested that viol...,n2 -> n1
...,...,...,...,...,...,...
4687,This house would enforce term limits on the le...,The lack of mandatory limits to tenure is rega...,"The historian Mercy Otis Warren, warned that ""...",[EXPERT],"The historian Mercy Otis Warren, warned that ""...",n2 -> n1
4688,This house would enforce term limits on the le...,contact with the affairs of state is one of th...,"According to historian Garrett Fagan, office h...",[EXPERT],"According to historian Garrett Fagan, office h...",n2 -> n1
4689,This house would enforce term limits on the le...,contact with the affairs of state is one of th...,"James Fenimore Cooper, the novelist, described...",[EXPERT],"James Fenimore Cooper, the novelist, described...",n2 -> n1
4690,This house would enforce term limits on the le...,term limits continue to garner popular support,"As of 2002, U.S. Term Limits found that in the...",[STUDY],"As of 2002, U.S. Term Limits found that in the...",n2 -> n1


In [21]:
n2_n1_df.drop(["Topic", "Claim original text", "Evidence", "Type of Evidence Information"], axis=1, inplace=True)
n2_n1_df

Unnamed: 0,Statement,Label
0,A 2001 study found that exposure to violent vi...,n2 -> n1
1,The most recent large scale meta-anlysis-- exa...,n2 -> n1
2,One study did find an increase in reports of b...,n2 -> n1
3,when one combines all relevant empirical studi...,n2 -> n1
4,Research published in 2001 suggested that viol...,n2 -> n1
...,...,...
4687,"The historian Mercy Otis Warren, warned that ""...",n2 -> n1
4688,"According to historian Garrett Fagan, office h...",n2 -> n1
4689,"James Fenimore Cooper, the novelist, described...",n2 -> n1
4690,"As of 2002, U.S. Term Limits found that in the...",n2 -> n1


In [23]:
combined_edge_detection_features_df = pd.concat([n1_n2_df, n2_n1_df])
combined_edge_detection_features_df.to_csv(r"../IBM_Debater_(R)_CE-EMNLP-2015.v3/current_working_dataset/edge_detection_features.csv")