In [196]:
import pandas as pd
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import optuna
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
import ast

### Step 1 - Import Train and Test Data

In [121]:
# Read the pre-split training and test sets (in that order) from the working directory
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ("train_data.csv", "test_data.csv")
)

### Step 2 - Tag the Documents and Produce Lists

In [143]:
def _parse_tokens(cell):
    """Return the token list held in one 'preprocessed_text' cell.

    The column arrives from the CSV as text, so string cells are parsed with
    ast.literal_eval. Cells that are not strings (i.e. already parsed) are
    returned unchanged, which lets this notebook cell be re-run without
    literal_eval raising on a list.
    """
    return ast.literal_eval(cell) if isinstance(cell, str) else cell


train_data['preprocessed_text'] = train_data['preprocessed_text'].apply(_parse_tokens)
test_data['preprocessed_text'] = test_data['preprocessed_text'].apply(_parse_tokens)

# One TaggedDocument per training row, tagged with the row position as a string
tagged_data = [
    TaggedDocument(words=tokens, tags=[str(position)])
    for position, tokens in enumerate(train_data['preprocessed_text'])
]

### Step 3 - Function to Train the Doc2Vec model and Extract Vectors for Train and Test sets

In [134]:
# Train a Doc2Vec model and turn the train/test documents into feature vectors

def get_vectors(tagged_docs, vectorsize, windowsize, mincount, worker_no, dm, epochs_no):
    """Train Doc2Vec on the training documents and return train/test features.

    Parameters
    ----------
    tagged_docs : sequence
        Training documents, in the same row order as ``train_data``. Items may
        be ``TaggedDocument`` objects or plain token lists; plain token lists
        are tagged with their position (``str(i)``).
    vectorsize : int
        Dimensionality of the document vectors (Doc2Vec ``vector_size``).
    windowsize : int
        Context window size (Doc2Vec ``window``).
    mincount : int
        Words with a total frequency below this are ignored (``min_count``).
    worker_no : int
        Number of training threads (``workers``).
    dm : int
        Training algorithm: 1 selects PV-DM, 0 selects PV-DBOW.
    epochs_no : int
        Number of passes over the training corpus.

    Returns
    -------
    X_train : list
        One learned vector per training document, in input order.
    y_train : pandas.Series
        ``train_data["clause_type"]``.
    X_test : list
        One inferred vector per row of ``test_data['preprocessed_text']``.
    y_test : pandas.Series
        ``test_data["clause_type"]``.

    Notes
    -----
    Labels and test documents are read from the module-level ``train_data``
    and ``test_data`` frames.
    """
    # Train on the documents that were actually passed in, not on a global.
    corpus = [
        doc if isinstance(doc, TaggedDocument) else TaggedDocument(words=doc, tags=[str(i)])
        for i, doc in enumerate(tagged_docs)
    ]

    model = Doc2Vec(
        vector_size=vectorsize,
        window=windowsize,
        min_count=mincount,
        workers=worker_no,
        dm=dm,  # honour the caller's choice instead of always forcing PV-DM
    )

    # Build the vocabulary, then train for the requested number of epochs
    model.build_vocab(corpus)
    model.train(corpus, total_examples=model.corpus_count, epochs=epochs_no)

    # Look each training vector up by its document's own tag
    X_train = [model.dv[doc.tags[0]] for doc in corpus]
    y_train = train_data["clause_type"]

    # NOTE(review): infer_vector is called without `epochs`, so inference uses
    # the model's own epoch setting rather than epochs_no - confirm intended.
    X_test = [model.infer_vector(doc) for doc in test_data['preprocessed_text']]
    y_test = test_data["clause_type"]

    return X_train, y_train, X_test, y_test


### Step 4 - Run functions and train Classifier


In [189]:
# Token lists for both splits; the training documents are handed to get_vectors
tagged_docs = train_data['preprocessed_text']
test_docs = test_data['preprocessed_text']

X_train, y_train, X_test, y_test = get_vectors(
    tagged_docs, vectorsize=100, windowsize=5, mincount=2, worker_no=4, dm=1, epochs_no=5
)

# Shallow baseline tree; random_state is pinned so that refitting on the same
# vectors always yields the same tree (ties between splits are broken randomly)
clf = DecisionTreeClassifier(max_depth=4, random_state=42)
clf.fit(X_train, y_train)

### Step 5 - Predict labels with the classifier and calculate accuracy

In [190]:
# Label the test vectors with the fitted classifier, then report plain accuracy
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy}")


Accuracy: 0.47769516728624534


### Step 6 - Decision Tree Hyperparameter Optimisation  

In [188]:
def objective(trial):
    """Optuna objective: mean 5-fold CV accuracy of a DecisionTreeClassifier.

    Hyperparameters are sampled from ``trial`` and the resulting tree is scored
    on the module-level ``X_train`` / ``y_train``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Mean accuracy over the 5 cross-validation folds (the study maximises it).

    Raises
    ------
    Exception
        Any error raised while fitting a fold is propagated
        (``error_score='raise'``).
    """
    # Decision Tree hyperparameters (trial names kept as-is so they stay
    # comparable with previously logged studies)
    params = {
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy','log_loss']),
        'splitter': trial.suggest_categorical('splitter', ['best', 'random']),
        'max_depth': trial.suggest_int('max depth',2,40),
        'min_samples_split': trial.suggest_int('min samples split',2,20),
        'min_samples_leaf': trial.suggest_int('min samples leaf',2,20),
        'min_weight_fraction_leaf': trial.suggest_float('min weight fraction leaf',0.0, 0.5),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
        'max_leaf_nodes': trial.suggest_int('max leaf nodes',2,100),
        'min_impurity_decrease': trial.suggest_float('min impurity decrease',0.0, 0.001),
        'class_weight': trial.suggest_categorical('class weight', ['balanced', None]),
        'ccp_alpha': trial.suggest_float('ccp alpha',0.0,0.01),
    }

    # Fixed random_state: with splitter='random' or a max_features subset the
    # tree is stochastic, so without a seed identical hyperparameters would
    # score differently from trial to trial and the search would chase noise.
    tree = DecisionTreeClassifier(random_state=42, **params)

    cv_scores = cross_val_score(tree, X_train, y_train, cv=5, scoring='accuracy',error_score='raise')
    return cv_scores.mean()  # use mean accuracy as the score

# Create a study to optimize the objective function. The sampler is seeded so
# the sequence of suggested hyperparameters is the same on every run.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=400)

print("Best trial:", study.best_trial.params,"Accuracy score:", study.best_trial.value)

[I 2024-09-17 18:29:58,712] A new study created in memory with name: no-name-70d2cdd6-7dda-4fe5-a1a5-8aef739a5493
[I 2024-09-17 18:29:58,807] Trial 0 finished with value: 0.21029555987154586 and parameters: {'criterion': 'entropy', 'splitter': 'best', 'max depth': 12, 'min samples split': 10, 'min samples leaf': 8, 'min weight fraction leaf': 0.30004469493055136, 'max_features': None, 'max leaf nodes': 18, 'min impurity decrease': 0.0005193477291035174, 'class weight': None, 'ccp alpha': 0.004601011473465898}. Best is trial 0 with value: 0.21029555987154586.
[I 2024-09-17 18:29:58,821] Trial 1 finished with value: 0.2276714804915101 and parameters: {'criterion': 'log_loss', 'splitter': 'random', 'max depth': 37, 'min samples split': 14, 'min samples leaf': 9, 'min weight fraction leaf': 0.21680216504957783, 'max_features': 'sqrt', 'max leaf nodes': 80, 'min impurity decrease': 4.21948142969365e-05, 'class weight': None, 'ccp alpha': 0.008476033952825405}. Best is trial 1 with value: 0.

Best trial: {'criterion': 'entropy', 'splitter': 'best', 'max depth': 22, 'min samples split': 19, 'min samples leaf': 10, 'min weight fraction leaf': 0.0006860122535069519, 'max_features': None, 'max leaf nodes': 27, 'min impurity decrease': 0.0004949977802429552, 'class weight': 'balanced', 'ccp alpha': 0.00516684693813014} Accuracy score: 0.6371151664327057


### Best result so far
Best trial: {'criterion': 'entropy', 'splitter': 'best', 'max depth': 15, 'min samples split': 4, 'min samples leaf': 7, 'min weight fraction leaf': 0.0008017228309184037, 'max_features': None, 'max leaf nodes': 45, 'min impurity decrease': 0.000666189779162299, 'class weight': 'balanced', 'ccp alpha': 0.006046726905517552} Accuracy score: 0.6476857104397823 <br>
##### after improved text pre-processing and 5 epochs on Doc2Vec<br>
Best trial: {'criterion': 'log_loss', 'splitter': 'best', 'max depth': 33, 'min samples split': 16, 'min samples leaf': 14, 'min weight fraction leaf': 0.009735388276899683, 'max_features': None, 'max leaf nodes': 59, 'min impurity decrease': 0.0008763970284558885, 'class weight': None, 'ccp alpha': 0.006813595623196301} Accuracy score: 0.6768071072822721<br>
##### after improved text pre-processing and 20 epochs on Doc2Vec<br>
Best trial: {'criterion': 'entropy', 'splitter': 'best', 'max depth': 11, 'min samples split': 10, 'min samples leaf': 4, 'min weight fraction leaf': 8.609707231782522e-05, 'max_features': None, 'max leaf nodes': 57, 'min impurity decrease': 0.000717076199304285, 'class weight': 'balanced', 'ccp alpha': 0.009209313562777367} Accuracy score: 0.5365786589235235<br>
##### after improved text pre-processing and 3 epochs on Doc2Vec<br>
Best trial: {'criterion': 'entropy', 'splitter': 'best', 'max depth': 22, 'min samples split': 19, 'min samples leaf': 10, 'min weight fraction leaf': 0.0006860122535069519, 'max_features': None, 'max leaf nodes': 27, 'min impurity decrease': 0.0004949977802429552, 'class weight': 'balanced', 'ccp alpha': 0.00516684693813014} Accuracy score: 0.6371151664327057

### Step 7 - Decision Tree Retrain and Run Test


In [191]:
# Refit a single tree with the hyperparameters recorded for the
# "5 epochs on Doc2Vec" study, then score it on the test set.
# NOTE(review): these values were found on an earlier Doc2Vec run; confirm they
# are still the ones to use with the current X_train.
clf = DecisionTreeClassifier(
        # Recorded trial used 'log_loss'; scikit-learn documents 'entropy' and
        # 'log_loss' as the same Shannon-information-gain criterion.
        criterion  = 'log_loss',
        splitter  = 'best',
        max_depth = 33,
        min_samples_split = 16,
        min_samples_leaf = 14,
        min_weight_fraction_leaf = 0.009735388276899683,
        max_features = None,
        max_leaf_nodes =59,
        min_impurity_decrease =0.0008763970284558885,
        class_weight=None,
        ccp_alpha=0.006813595623196301,
        # Pinned so that refitting on the same vectors gives the same tree
        random_state=42)

clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)
# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")


Accuracy: 0.49070631970260226


### Step 4a - Train Random Forest Classifier

In [192]:
# Default-hyperparameter Random Forest baseline. random_state is pinned because
# bootstrapping and feature sub-sampling are random; without it the baseline
# accuracy changes on every run even for identical input vectors.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

### Step 5a - Predict labels with the Random Forest classifier and calculate accuracy

In [193]:
# Predict on the test vectors and measure how many labels match
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy}")


Accuracy: 0.70817843866171


### Step 6a - Random Forest Hyperparameter Optimisation

In [194]:
def objective_rf(trial):
    """Optuna objective: mean 5-fold CV accuracy of a RandomForestClassifier.

    Hyperparameters are sampled from ``trial`` and the resulting forest is
    scored on the module-level ``X_train`` / ``y_train``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Mean accuracy over the 5 cross-validation folds (the study maximises it).

    Raises
    ------
    Exception
        Any error raised while fitting a fold is propagated
        (``error_score='raise'``).
    """
    # Random Forest hyperparameters (trial names kept as-is so they stay
    # comparable with previously logged studies)
    params = {
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy','log_loss']),
        'max_depth': trial.suggest_int('max depth',2,40),
        'min_samples_split': trial.suggest_int('min samples split',2,20),
        'min_samples_leaf': trial.suggest_int('min samples leaf',1,20),
        'min_weight_fraction_leaf': trial.suggest_float('min weight fraction leaf',0.0, 0.5),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
        'max_leaf_nodes': trial.suggest_int('max leaf nodes',2,200),
        'min_impurity_decrease': trial.suggest_float('min impurity decrease',0.0, 0.001),
        'class_weight': trial.suggest_categorical('class weight', ['balanced', 'balanced_subsample', None]),
        'ccp_alpha': trial.suggest_float('ccp alpha',0.0,0.01),
    }

    # Fixed random_state: bootstrapping and feature sub-sampling are random, so
    # without a seed identical hyperparameters would score differently from
    # trial to trial and the search would partly optimise noise.
    forest = RandomForestClassifier(random_state=42, **params)

    cv_scores = cross_val_score(forest, X_train, y_train, cv=5, scoring='accuracy',error_score='raise')
    return cv_scores.mean()  # use mean accuracy as the score

# Create a study to optimize the objective function. The sampler is seeded so
# the sequence of suggested hyperparameters is the same on every run.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective_rf, n_trials=300)

print("Best trial:", study.best_trial.params,"Accuracy score:", study.best_trial.value)

[I 2024-09-17 18:38:13,635] A new study created in memory with name: no-name-6198c570-69aa-4c86-8133-49ab5396b0b1
[I 2024-09-17 18:38:14,355] Trial 0 finished with value: 0.627773397688595 and parameters: {'criterion': 'log_loss', 'max depth': 12, 'min samples split': 3, 'min samples leaf': 20, 'min weight fraction leaf': 0.3899581499695203, 'max_features': 'sqrt', 'max leaf nodes': 114, 'min impurity decrease': 0.00046445478702939727, 'class weight': 'balanced', 'ccp alpha': 0.0033575032567247933}. Best is trial 0 with value: 0.627773397688595.
[I 2024-09-17 18:38:18,366] Trial 1 finished with value: 0.35727169586370017 and parameters: {'criterion': 'entropy', 'max depth': 15, 'min samples split': 11, 'min samples leaf': 13, 'min weight fraction leaf': 0.4197474529747516, 'max_features': None, 'max leaf nodes': 74, 'min impurity decrease': 0.0007280221342007206, 'class weight': None, 'ccp alpha': 0.007403297967020938}. Best is trial 0 with value: 0.627773397688595.
[I 2024-09-17 18:38

Best trial: {'criterion': 'log_loss', 'max depth': 18, 'min samples split': 17, 'min samples leaf': 4, 'min weight fraction leaf': 0.0007842045594534525, 'max_features': 'sqrt', 'max leaf nodes': 81, 'min impurity decrease': 0.0002612096689857516, 'class weight': 'balanced_subsample', 'ccp alpha': 0.003790698945688128} Accuracy score: 0.7642866757687056


### Best hyperparameter-tuning results for Random Forest <br>
Best trial: {'criterion': 'entropy', 'max depth': 37, 'min samples split': 12, 'min samples leaf': 2, 'min weight fraction leaf': 0.0011701375435263447, 'max_features': 'log2', 'max leaf nodes': 96, 'min impurity decrease': 0.0005376716733336186, 'class weight': 'balanced_subsample', 'ccp alpha': 0.003950027376071721} Accuracy score: 0.7822683306732304
##### after improved preprocessing <br>
Best trial: {'criterion': 'log_loss', 'max depth': 18, 'min samples split': 17, 'min samples leaf': 4, 'min weight fraction leaf': 0.0007842045594534525, 'max_features': 'sqrt', 'max leaf nodes': 81, 'min impurity decrease': 0.0002612096689857516, 'class weight': 'balanced_subsample', 'ccp alpha': 0.003790698945688128} Accuracy score: 0.7642866757687056

### Step 7a - Random Forest Retrain and Run Test

In [195]:
# Build the forest with the tuned hyperparameters and fit it in one expression
# (fit returns the estimator itself, so clf is the fitted model)
clf = RandomForestClassifier(
    criterion='log_loss',
    max_depth=18,
    min_samples_split=17,
    min_samples_leaf=4,
    min_weight_fraction_leaf=0.0007842045594534525,
    max_features='sqrt',
    max_leaf_nodes=81,
    min_impurity_decrease=0.0002612096689857516,
    class_weight='balanced_subsample',
    ccp_alpha=0.003790698945688128,
).fit(X_train, y_train)

# Score the refitted forest on the held-out test vectors
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy}")


Accuracy: 0.7063197026022305


Best result with simple pre-processing = Accuracy: 0.7193308550185874