In [58]:
import pandas as pd
from gensim import utils
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import optuna
from sklearn.model_selection import cross_val_score

### Step 1 - Import Train and Test Data

In [2]:
# Read the training split; the file is expected in the working directory
train_data = pd.read_csv(filepath_or_buffer="train_data.csv")
# Read the test split from the same location
test_data = pd.read_csv(filepath_or_buffer="test_data.csv")

### Step 2 - Pre-process Text

In [6]:
# Run gensim's simple_preprocess over every training clause and keep the result in a new column
train_data['train_tokens'] = train_data['modified_clause_text'].map(utils.simple_preprocess)
# Run the same pre-processing over every test clause
test_data['test_tokens'] = test_data['modified_clause_text'].map(utils.simple_preprocess)

### Step 3 - Tag the Documents

In [13]:
# Wrap each training token list in a TaggedDocument; the tag is the document's
# position in the training set (as a string), which gives Doc2Vec a unique id per document
tagged_data = [
    TaggedDocument(words=tokens, tags=[str(position)])
    for position, tokens in enumerate(train_data['train_tokens'])
]

### Step 4 - Function to Train the Doc2Vec model and Extract Vectors for Train and Test sets

In [44]:
# Helper functions: train a Doc2Vec model (dm=1 selects PV-DM, the CBOW-like mode) and extract vectors and labels

def trainD2VModel(tagged_docs, vectorsize, windowsize, mincount, worker_no, dm, epochs_no):
    """Build and train a Doc2Vec model on the supplied corpus.

    Parameters
    ----------
    tagged_docs : iterable
        Training corpus. Items that are already ``TaggedDocument`` objects are
        used unchanged; any other item is treated as a sequence of tokens and
        tagged with its position in the iterable (``str(i)``).
    vectorsize : int
        Dimensionality of the vectors (``vector_size``).
    windowsize : int
        Context window size (``window``).
    mincount : int
        Words with a total frequency lower than this are ignored (``min_count``).
    worker_no : int
        Number of parallel threads (``workers``).
    dm : int
        Training-algorithm flag forwarded to Doc2Vec; 1 selects PV-DM
        (CBOW-like), 0 selects PV-DBOW.
    epochs_no : int
        Number of training passes over the corpus.

    Returns
    -------
    Doc2Vec
        The trained model.
    """
    # Materialise the corpus once: it is iterated by build_vocab and again by
    # train. Raw token lists are wrapped so that callers passing untagged
    # tokens still get position-based tags.
    corpus = [
        doc if isinstance(doc, TaggedDocument)
        else TaggedDocument(words=list(doc), tags=[str(position)])
        for position, doc in enumerate(tagged_docs)
    ]

    model = Doc2Vec(
        vector_size=vectorsize,   # Dimensionality of the vectors
        window=windowsize,        # Context window size
        min_count=mincount,       # Ignores all words with total frequency lower than this
        workers=worker_no,        # Number of parallel threads
        dm=dm,                    # Honour the caller's choice instead of hard-coding PV-DM
    )

    # Build the vocabulary from the corpus passed in (not from a notebook global)
    model.build_vocab(corpus)

    # Train the model
    model.train(corpus, total_examples=model.corpus_count, epochs=epochs_no)

    return model

def vectors_and_labels(tagged_data, test_docs, model):
    """Collect feature vectors and labels for the train and test sets.

    Training vectors are read from ``model.dv`` by integer position, one per
    entry in ``tagged_data``. Test vectors are inferred with
    ``model.infer_vector`` for each item of ``test_docs``. Labels are not
    taken from the arguments: they are the ``"clause_type"`` columns of the
    notebook-level ``train_data`` and ``test_data`` frames.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)``; the X values are lists of vectors.
    """
    X_train = []
    for position in range(len(tagged_data)):
        X_train.append(model.dv[position])
    y_train = train_data["clause_type"]

    X_test = []
    for tokens in test_docs:
        X_test.append(model.infer_vector(tokens))
    y_test = test_data["clause_type"]

    return X_train, y_train, X_test, y_test


### Step 5 - Run functions and train Classifier


In [67]:
# Doc2Vec is trained on the TaggedDocument list built in Step 3; the raw
# token column carries no tags, so it must not be passed as the tagged corpus.
tagged_docs = tagged_data
test_docs = test_data['test_tokens']

# Baseline run: train Doc2Vec, extract vectors and labels, fit a shallow decision tree
model = trainD2VModel(tagged_docs, vectorsize=100, windowsize=5, mincount=2, worker_no=4, dm=1, epochs_no=5)
X_train, y_train, X_test, y_test = vectors_and_labels(tagged_data, test_docs, model)
clf = DecisionTreeClassifier(max_depth=4)
clf.fit(X_train, y_train)

### Step 6 - Predict labels based on classifier and calc accuracy

In [68]:
# Predict a clause type for every test vector
y_pred = clf.predict(X_test)
# Score the predictions against the true test labels
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy}")


Accuracy: 0.38847583643122674


### Step 7 - Hyperparameter Optimisation  

In [None]:
def objective(trial):
    """Optuna objective: mean 5-fold cross-validated accuracy for one trial.

    For each trial a Doc2Vec model is trained on the notebook-level
    ``tagged_data`` with the sampled Doc2Vec hyperparameters, the training
    vectors are read back from that model, and a ``DecisionTreeClassifier``
    with the sampled tree hyperparameters is scored with
    ``cross_val_score`` against ``train_data["clause_type"]``. The test set
    is not used here.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample hyperparameters.

    Returns
    -------
    float
        Mean accuracy over the 5 cross-validation folds.
    """
    # Doc2Vec hyperparameters
    vectorsize = trial.suggest_int('vector size', 20, 200)
    windowsize = trial.suggest_int('window size', 2, 10)
    mincount = trial.suggest_int('min_count', 2, 10)
    dm = trial.suggest_int('dm', 0, 1)
    epochs = trial.suggest_int('epochs no', 10, 100)
    # The thread count is not a model hyperparameter, so it is fixed (same
    # value as the baseline run) rather than sampled. It was previously
    # sampled under the already-used name 'min_count'.
    worker_no = 4

    # Decision Tree hyperparameters
    criterion = trial.suggest_categorical('criterion', ['gini', 'entropy', 'log_loss'])
    splitter = trial.suggest_categorical('splitter', ['best', 'random'])
    max_depth = trial.suggest_int('max depth', 2, 20)
    min_samples_split = trial.suggest_int('min samples split', 2, 20)
    min_samples_leaf = trial.suggest_int('min samples leaf', 2, 20)
    # scikit-learn only accepts values in [0.0, 0.5] for this parameter
    min_weight_fraction_leaf = trial.suggest_float('min weight fraction leaf', 0.0, 0.5)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])
    # suggest_int needs integer bounds; a tree needs at least 2 leaves
    max_leaf_nodes = trial.suggest_int('max leaf nodes', 2, 100)
    min_impurity_decrease = trial.suggest_float('min impurity decrease', 0.0, 0.001)
    class_weight = trial.suggest_categorical('class weight', ['balanced', None])
    ccp_alpha = trial.suggest_float('ccp alpha', 0.0, 0.01)

    # Train a fresh Doc2Vec model for this trial so the sampled Doc2Vec
    # hyperparameters actually influence the features being scored.
    d2v_model = trainD2VModel(
        tagged_docs=tagged_data,
        vectorsize=vectorsize,
        windowsize=windowsize,
        mincount=mincount,
        worker_no=worker_no,
        dm=dm,
        epochs_no=epochs,
    )

    # Training vectors from this trial's model (not the baseline model's vectors)
    trial_vectors = [d2v_model.dv[position] for position in range(len(tagged_data))]
    labels = train_data["clause_type"]

    classifier = DecisionTreeClassifier(
        criterion=criterion,
        splitter=splitter,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_features=max_features,
        max_leaf_nodes=max_leaf_nodes,
        min_impurity_decrease=min_impurity_decrease,
        class_weight=class_weight,
        ccp_alpha=ccp_alpha,
    )

    cv_scores = cross_val_score(classifier, trial_vectors, labels, cv=5, scoring='accuracy')
    score = cv_scores.mean()  # use mean accuracy as the score

    return score

# Set up an Optuna study that maximises the objective, then run 300 trials
study = optuna.create_study(direction='maximize')
study.optimize(func=objective, n_trials=300)

# Report the parameters and objective value of the best trial
print("Best trial:", study.best_params, "Accuracy score:", study.best_value)