In [114]:
import pandas as pd
from gensim import utils
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import optuna
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline

### Step 1 - Import Train and Test Data

In [95]:
# Read the training and test splits from their CSV files (train first, then test)
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ("train_data.csv", "test_data.csv")
)

### Step 2 - Pre-process Text

In [96]:
# Run gensim's utils.simple_preprocess over the clause text of each split and
# store the resulting tokens in that split's own token column
for split_frame, token_column in ((train_data, 'train_tokens'), (test_data, 'test_tokens')):
    split_frame[token_column] = split_frame['modified_clause_text'].apply(utils.simple_preprocess)

### Step 3 - Tag the Documents

In [97]:
# Wrap every training token list in a TaggedDocument whose single tag is the
# document's position (as a string), so Doc2Vec has a unique id per document
tagged_data = [
    TaggedDocument(words=token_list, tags=[str(position)])
    for position, token_list in enumerate(train_data['train_tokens'])
]

### Step 4 - Function to Train the Doc2Vec model and Extract Vectors for Train and Test sets

In [98]:
# Initialize the Doc2Vec model using PV-DM (which is like CBOW) 

def get_vectors(tagged_docs, vectorsize, windowsize, mincount, worker_no, dm, epochs_no):
    """Train a Doc2Vec model on the training documents and extract feature vectors.

    Args:
        tagged_docs: Training documents. Items may be ``TaggedDocument`` objects
            or plain token lists; plain token lists are tagged with their
            position (``str(i)``) before training.
        vectorsize: Dimensionality of the document vectors.
        windowsize: Context window size.
        mincount: Words with a total frequency lower than this are ignored.
        worker_no: Number of parallel worker threads.
        dm: Doc2Vec training algorithm flag (1 = PV-DM, otherwise PV-DBOW).
        epochs_no: Number of epochs used for training.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)`` where ``X_train`` is a list
        of the learned vectors of the training documents, ``X_test`` is a list
        of vectors inferred for each document in ``test_docs``, and the ``y``
        values are the ``clause_type`` columns of ``train_data`` / ``test_data``.

    Note:
        Reads the notebook-level globals ``train_data``, ``test_data`` and
        ``test_docs``; they must be defined before this function is called.
    """
    # Train on the documents that were actually passed in (not on a global),
    # tagging raw token lists on the fly so both input forms are accepted.
    corpus = [
        doc if isinstance(doc, TaggedDocument) else TaggedDocument(words=doc, tags=[str(i)])
        for i, doc in enumerate(tagged_docs)
    ]

    model = Doc2Vec(
        vector_size=vectorsize,   # Dimensionality of the vectors
        window=windowsize,        # Context window size
        min_count=mincount,       # Ignores all words with total frequency lower than this
        workers=worker_no,        # Number of parallel threads
        dm=dm,                    # Honour the caller's choice of training algorithm
    )

    # Build the vocabulary from the tagged corpus
    model.build_vocab(corpus)

    # Train the model
    model.train(corpus, total_examples=model.corpus_count, epochs=epochs_no)

    # Look each training vector up by the document's own (first) tag
    X_train = [model.dv[doc.tags[0]] for doc in corpus]
    y_train = train_data["clause_type"]
    # Test documents were not seen during training, so their vectors are inferred
    X_test = [model.infer_vector(doc) for doc in test_docs]
    y_test = test_data["clause_type"]

    return X_train, y_train, X_test, y_test


### Step 5 - Run functions and train Classifier


In [99]:
# Token lists handed to get_vectors. NOTE: despite its name, `tagged_docs` holds
# the raw (untagged) training tokens; `test_docs` is read by get_vectors as a global.
tagged_docs = train_data['train_tokens']
test_docs = test_data['test_tokens']

# Train Doc2Vec and extract train/test feature vectors together with their labels
X_train, y_train, X_test, y_test = get_vectors(
    tagged_docs,
    vectorsize=100,
    windowsize=5,
    mincount=2,
    worker_no=4,
    dm=1,
    epochs_no=5,
)

# Baseline: shallow decision tree. The seed is fixed so that refitting on the
# same vectors yields the same tree.
clf = DecisionTreeClassifier(max_depth=4, random_state=42)
clf.fit(X_train, y_train)

### Step 6 - Predict labels based on classifier and calc accuracy

In [100]:
# Predict test-set labels with the fitted classifier
y_pred = clf.predict(X_test)
# Compare the predictions with the true test labels
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy}")


Accuracy: 0.3252788104089219


### Step 7 - Decision Tree Hyperparameter Optimisation  

In [108]:
def objective(trial):
    """Optuna objective for the decision tree: mean 5-fold CV accuracy.

    Samples one set of DecisionTreeClassifier hyperparameters from ``trial``,
    cross-validates the resulting classifier on the notebook-level
    ``X_train`` / ``y_train`` and returns the mean accuracy.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean accuracy across the 5 cross-validation folds.

    Raises:
        Any exception raised while fitting or scoring a fold
        (``error_score='raise'``).
    """
    # The names passed to trial.suggest_* become the keys of
    # study.best_trial.params, so they are kept exactly as they were.
    params = {
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy', 'log_loss']),
        'splitter': trial.suggest_categorical('splitter', ['best', 'random']),
        'max_depth': trial.suggest_int('max depth', 2, 40),
        'min_samples_split': trial.suggest_int('min samples split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min samples leaf', 2, 20),
        'min_weight_fraction_leaf': trial.suggest_float('min weight fraction leaf', 0.0, 0.5),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
        'max_leaf_nodes': trial.suggest_int('max leaf nodes', 2, 100),
        'min_impurity_decrease': trial.suggest_float('min impurity decrease', 0.0, 0.001),
        'class_weight': trial.suggest_categorical('class weight', ['balanced', None]),
        'ccp_alpha': trial.suggest_float('ccp alpha', 0.0, 0.01),
    }

    # Fixed seed: with splitter='random' or a feature subset the tree is
    # randomised, and an unseeded tree would make the score of identical
    # hyperparameters vary from call to call.
    classifier = DecisionTreeClassifier(random_state=42, **params)

    # Create pipeline
    pipeline = Pipeline([
        ('DTree Classifier', classifier),
    ])

    cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='accuracy', error_score='raise')

    # Use mean accuracy as the score
    return cv_scores.mean()

# Create a study that maximises the objective. The TPE sampler is seeded so the
# hyperparameter search can be repeated (given a deterministic objective).
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=400)

print("Best trial:", study.best_trial.params,"Accuracy score:", study.best_trial.value)

[I 2024-09-17 14:38:03,509] A new study created in memory with name: no-name-d3285ca1-fec1-4af2-bc96-e785c5b8ad44
[I 2024-09-17 14:38:03,532] Trial 0 finished with value: 0.21954694921446838 and parameters: {'criterion': 'gini', 'splitter': 'random', 'max depth': 31, 'min samples split': 2, 'min samples leaf': 15, 'min weight fraction leaf': 0.11884739395246419, 'max_features': 'sqrt', 'max leaf nodes': 85, 'min impurity decrease': 0.0009122175381007592, 'class weight': None, 'ccp alpha': 0.00869289308089368}. Best is trial 0 with value: 0.21954694921446838.
[I 2024-09-17 14:38:03,560] Trial 1 finished with value: 0.2605503528642578 and parameters: {'criterion': 'log_loss', 'splitter': 'best', 'max depth': 19, 'min samples split': 11, 'min samples leaf': 15, 'min weight fraction leaf': 0.24322688634510486, 'max_features': 'sqrt', 'max leaf nodes': 33, 'min impurity decrease': 0.0007841889186074116, 'class weight': None, 'ccp alpha': 0.008611280188577493}. Best is trial 1 with value: 0.

Best trial: {'criterion': 'entropy', 'splitter': 'best', 'max depth': 15, 'min samples split': 4, 'min samples leaf': 7, 'min weight fraction leaf': 0.0008017228309184037, 'max_features': None, 'max leaf nodes': 45, 'min impurity decrease': 0.000666189779162299, 'class weight': 'balanced', 'ccp alpha': 0.006046726905517552} Accuracy score: 0.6476857104397823


### Best result so far
Best trial: {'criterion': 'entropy', 'splitter': 'best', 'max depth': 15, 'min samples split': 4, 'min samples leaf': 7, 'min weight fraction leaf': 0.0008017228309184037, 'max_features': None, 'max leaf nodes': 45, 'min impurity decrease': 0.000666189779162299, 'class weight': 'balanced', 'ccp alpha': 0.006046726905517552} Accuracy score: 0.6476857104397823

### Step 8 - Decision Tree Retrain and Run Test


In [111]:
# Hyperparameters of the best decision-tree trial, copied from the search output
best_dt_params = {
    'criterion': 'entropy',
    'splitter': 'best',
    'max_depth': 15,
    'min_samples_split': 4,
    'min_samples_leaf': 7,
    'min_weight_fraction_leaf': 0.0008017228309184037,
    'max_features': None,
    'max_leaf_nodes': 45,
    'min_impurity_decrease': 0.000666189779162299,
    'class_weight': 'balanced',
    'ccp_alpha': 0.006046726905517552,
}

# Fixed seed so that refitting on the same vectors yields the same tree
clf = DecisionTreeClassifier(random_state=42, **best_dt_params)
clf.fit(X_train, y_train)

# Make predictions on the held-out test vectors
y_pred = clf.predict(X_test)
# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")


Accuracy: 0.5018587360594795


### Step 5a - Train Random Forest Classifier

In [115]:
# Random forest with default hyperparameters. The seed is fixed because forest
# training is randomised and the reported accuracy would otherwise change
# between runs on the same vectors.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

### Step 6a - Random Forest Classifier predict labels based on classifier and calc accuracy

In [116]:
# Predict test-set labels with the fitted random forest
y_pred = clf.predict(X_test)
# Compare the predictions with the true test labels
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy}")


Accuracy: 0.7211895910780669


### Step 7a - Random Forest Hyperparameter Optimisation

In [119]:
def objective_rf(trial):
    """Optuna objective for the random forest: mean 5-fold CV accuracy.

    Samples one set of RandomForestClassifier hyperparameters from ``trial``,
    cross-validates the resulting classifier on the notebook-level
    ``X_train`` / ``y_train`` and returns the mean accuracy.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean accuracy across the 5 cross-validation folds.

    Raises:
        Any exception raised while fitting or scoring a fold
        (``error_score='raise'``).
    """
    # Random Forest hyperparameters. The names passed to trial.suggest_* become
    # the keys of study.best_trial.params, so they are kept exactly as they were.
    params = {
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy', 'log_loss']),
        'max_depth': trial.suggest_int('max depth', 2, 40),
        'min_samples_split': trial.suggest_int('min samples split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min samples leaf', 1, 20),
        'min_weight_fraction_leaf': trial.suggest_float('min weight fraction leaf', 0.0, 0.5),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
        'max_leaf_nodes': trial.suggest_int('max leaf nodes', 2, 200),
        'min_impurity_decrease': trial.suggest_float('min impurity decrease', 0.0, 0.001),
        'class_weight': trial.suggest_categorical('class weight', ['balanced', 'balanced_subsample', None]),
        'ccp_alpha': trial.suggest_float('ccp alpha', 0.0, 0.01),
    }

    # Fixed seed: forest training is randomised, and an unseeded forest would
    # make the score of identical hyperparameters vary from call to call.
    classifier = RandomForestClassifier(random_state=42, **params)

    # Create pipeline
    pipeline = Pipeline([
        ('RandomForestClassifier', classifier),
    ])

    cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='accuracy', error_score='raise')

    # Use mean accuracy as the score
    return cv_scores.mean()

# Create a study that maximises the random-forest objective. The TPE sampler is
# seeded so the hyperparameter search can be repeated (given a deterministic
# objective).
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective_rf, n_trials=400)

print("Best trial:", study.best_trial.params,"Accuracy score:", study.best_trial.value)

[I 2024-09-17 16:27:36,982] A new study created in memory with name: no-name-ed4e25b8-4e94-4924-8d9c-7d706e9152ee
[I 2024-09-17 16:27:38,133] Trial 0 finished with value: 0.640834182643309 and parameters: {'criterion': 'log_loss', 'max depth': 11, 'min samples split': 16, 'min samples leaf': 15, 'min weight fraction leaf': 0.21932711494160156, 'max_features': 'sqrt', 'max leaf nodes': 123, 'min impurity decrease': 0.0005298729170925133, 'class weight': None, 'ccp alpha': 0.005407804355486871}. Best is trial 0 with value: 0.640834182643309.
[I 2024-09-17 16:27:38,781] Trial 1 finished with value: 0.6451685479683864 and parameters: {'criterion': 'gini', 'max depth': 18, 'min samples split': 20, 'min samples leaf': 16, 'min weight fraction leaf': 0.22649020537224362, 'max_features': 'log2', 'max leaf nodes': 161, 'min impurity decrease': 0.0003956740316100874, 'class weight': None, 'ccp alpha': 0.008059647601177252}. Best is trial 1 with value: 0.6451685479683864.
[I 2024-09-17 16:27:39,2

Best trial: {'criterion': 'entropy', 'max depth': 37, 'min samples split': 12, 'min samples leaf': 2, 'min weight fraction leaf': 0.0011701375435263447, 'max_features': 'log2', 'max leaf nodes': 96, 'min impurity decrease': 0.0005376716733336186, 'class weight': 'balanced_subsample', 'ccp alpha': 0.003950027376071721} Accuracy score: 0.7822683306732304


### Best result HP tuning for Random Forest <br>
Best trial: {'criterion': 'entropy', 'max depth': 37, 'min samples split': 12, 'min samples leaf': 2, 'min weight fraction leaf': 0.0011701375435263447, 'max_features': 'log2', 'max leaf nodes': 96, 'min impurity decrease': 0.0005376716733336186, 'class weight': 'balanced_subsample', 'ccp alpha': 0.003950027376071721} Accuracy score: 0.7822683306732304


In [120]:
# Hyperparameters of the best random-forest trial, copied from the search output
best_rf_params = {
    'criterion': 'entropy',
    'max_depth': 37,
    'min_samples_split': 12,
    'min_samples_leaf': 2,
    'min_weight_fraction_leaf': 0.0011701375435263447,
    'max_features': 'log2',
    'max_leaf_nodes': 96,
    'min_impurity_decrease': 0.0005376716733336186,
    'class_weight': 'balanced_subsample',
    'ccp_alpha': 0.003950027376071721,
}

# Fixed seed so that refitting on the same vectors yields the same forest
clf = RandomForestClassifier(random_state=42, **best_rf_params)
clf.fit(X_train, y_train)

# Make predictions on the held-out test vectors
y_pred = clf.predict(X_test)
# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")


Accuracy: 0.7193308550185874
