# Anaphora Resolution Classification

## Libraries Imported

In [270]:
# for turning the csv to importable data
import pandas as pd
import numpy as np

# for scaling and training data
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

# for model evaluation
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# for classifiers
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# for SMOTE
from collections import Counter
from imblearn.over_sampling import SMOTE

# for hyperparamter optimization
from sklearn.model_selection import GridSearchCV


# for feature importance
from sklearn.feature_selection import mutual_info_classif

import matplotlib as plt

## Adding Classification to the Dataset

In [271]:
data = pd.read_csv('data/it-corpus.tsv', sep='\t')
df = pd.read_csv('./features.csv')

df = pd.concat((df, data['Class']), axis=1)

## Train and Evaluation of the 4 Classifiers

### Setting Train and Test Data

In [272]:
# extracting features and label
X = df.drop(columns=['Class'])
y = df['Class']

# converting non-numeric data to numeric via Label Encoder
for col in X:
    if X[col].dtype == 'object':
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col])

### Decision Trees

In [273]:
dt = DecisionTreeClassifier()

### Random Forest

In [274]:
rf = RandomForestClassifier()

### Logistic Regression

In [275]:
lr = LogisticRegression(max_iter=5000)

### SVM

In [None]:
svm = SVC(kernel='rbf')

### Evaluation

In [277]:
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

In [278]:
# evaluation function
def evaluation_score(model, X, y, cv):
	acc = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
	return acc.mean()

In [279]:
models = [dt, rf, lr, svm]

for m in models:
    score = evaluation_score(m, X, y, cv)
    print(f"{type(m).__name__} accuracy: {score}")

DecisionTreeClassifier accuracy: 0.7425716440422323
RandomForestClassifier accuracy: 0.8128582202111614
LogisticRegression accuracy: 0.8343514328808446
SVC accuracy: 0.7934012066365007


## Applying SMOTE

### Original Dataset Shape

In [280]:
print('Original dataset shape %s' % Counter(y))

Original dataset shape Counter({'NomAnaph': 428, 'ClauseAnaph': 85})


### SMOTE

In [281]:
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X, y)

### New Dataset Shape

In [282]:
print('Resampled dataset shape %s' % Counter(y_res))

Resampled dataset shape Counter({'NomAnaph': 428, 'ClauseAnaph': 428})


### SMOTE Evaluation

In [283]:
models = [dt, rf, lr, svm]

for m in models:
    score = evaluation_score(m, X_res, y_res, cv)
    print(f"{type(m).__name__} accuracy: {score}")

DecisionTreeClassifier accuracy: 0.7652257181942544
RandomForestClassifier accuracy: 0.8247332421340628
LogisticRegression accuracy: 0.6938714090287278
SVC accuracy: 0.49642954856361154


## Hyper-parameter optimization

### Decision Trees

In [284]:
param_dt = {
    "ccp_alpha": [0.1, 0.2, 0.3, 0.4, 0.5],
    "min_samples_leaf": [2, 3, 4, 5, 6, 7, 8, 9, 10]
}
grid_dt = GridSearchCV(
    estimator=dt,
    param_grid=param_dt,
    cv=cv,
    scoring="accuracy",
    n_jobs=-1
)
grid_dt.fit(X_res, y_res)
print("Decision Tree best params:", grid_dt.best_params_)
print("Decision Tree best CV accuracy:", grid_dt.best_score_)

Decision Tree best params: {'ccp_alpha': 0.1, 'min_samples_leaf': 2}
Decision Tree best CV accuracy: 0.49764705882352944


### Random Forest

In [285]:
param_rf = {
    "n_estimators": [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
    "max_features": [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
}
grid_rf = GridSearchCV(
    estimator=rf,
    param_grid=param_rf,
    cv=cv,
    scoring="accuracy",
    n_jobs=-1
)
grid_rf.fit(X_res, y_res)
print("Random Forest best params:", grid_rf.best_params_)
print("Random Forest best CV accuracy:", grid_rf.best_score_)

KeyboardInterrupt: 

### Logistic Regression

In [None]:
c_values_raw = [1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
c_values = [1 / val for val in c_values_raw]

param_lr = {
    "C": c_values
}
grid_lr = GridSearchCV(
    estimator=lr,
    param_grid=param_lr,
    cv=cv,
    scoring="accuracy",
    n_jobs=-1
)
grid_lr.fit(X_res, y_res)
print("Logistic Regression best params:", grid_lr.best_params_)
print("Logistic Regression best CV accuracy:", grid_lr.best_score_)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to 

Logistic Regression best params: {'C': 10000000000.0}
Logistic Regression best CV accuracy: 0.698577291381669


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### SVM

In [None]:
param_svm = {
    "C": [1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3],
    "gamma": [1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3]
}
grid_svm = GridSearchCV(
    estimator=svm,
    param_grid=param_svm,
    cv=cv,
    scoring="accuracy",
    n_jobs=-1
)
grid_svm.fit(X_res, y_res)
print("SVM best params:", grid_svm.best_params_)
print("SVM best CV accuracy:", grid_svm.best_score_)

SVM best params: {'C': 100.0, 'gamma': 0.01}
SVM best CV accuracy: 0.8200957592339261


## Feature importance

In [None]:
mi = mutual_info_classif(X_res, y_res, random_state=42)

mi_df = pd.DataFrame({
    'Feature': X.columns,
    'Information_Gain': mi
}).sort_values(by='Information_Gain', ascending=False)

In [None]:
print(mi_df)

   Feature  Information_Gain
6       F7          0.107022
1       F2          0.086569
0       F1          0.060982
2       F3          0.060500
17     F18          0.051769
4       F5          0.045935
15     F16          0.031378
13     F14          0.013227
7       F8          0.012255
3       F4          0.009424
11     F12          0.007024
19     F20          0.006442
9      F10          0.005156
5       F6          0.003973
12     F13          0.000000
14     F15          0.000000
8       F9          0.000000
16     F17          0.000000
18     F19          0.000000
10     F11          0.000000


In [None]:
# Combine features (X) and labels (y) back into one DataFrame
matrix = X.copy()
matrix['Class'] = y

# Save to CSV
matrix.to_csv('feature_matrix.csv', index=False)