In [53]:
import pandas as pd
import optuna
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
import warnings

### Step 1 - Import Train and Test Data

In [4]:
# Load training set from CSV
train_data = pd.read_csv("train_data.csv")

# Load test set from CSV
test_data = pd.read_csv("test_data.csv")

### Step 2 - Create data from training and testing 


In [8]:
X_train = train_data["modified_clause_text"]
y_train = train_data["clause_type"]
X_test = test_data["modified_clause_text"]
y_test = test_data["clause_type"]

### Step 3 - Apply Tfidf Vectorization

In [9]:
vectorizer = TfidfVectorizer()

# Fit and transform the training data
X_train_tfidf = vectorizer.fit_transform(X_train)

# Transform the test data (use only transform, no fit)
X_test_tfidf = vectorizer.transform(X_test)

### Step 4 - Train the KNN Classifier

In [45]:
knn = KNeighborsClassifier(n_neighbors=3)  # Use 3 nearest neighbors
knn.fit(X_train_tfidf, y_train)

### Step 5 - Make the predictions and calculate the accuracy

In [46]:
y_pred = knn.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.80


### Step 6 - Optimisation

In [55]:
def objective(trial):
    ## getting repetitive warnings when I run the code so switched these off
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore')
    
        # Define hyperparameters to tune
        max_df = trial.suggest_float('max_df', 0.5, 1.0)
        min_df = trial.suggest_float('min_df', 0.0, 0.49)
        max_features = trial.suggest_int('max_features', 10, 1000)
        stop_words = trial.suggest_categorical('stop_words', ['english', None])
        ngram_range = trial.suggest_categorical('ngram_range', [(1, 1), (1, 2), (1, 3), (1, 4)])
        n_neighbors = trial.suggest_int('n_neighbors', 1, 10)
        weights = trial.suggest_categorical('weights', ['uniform', 'distance'])

        # Create pipeline
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(max_df=max_df, min_df=min_df, ngram_range=ngram_range, max_features=max_features, stop_words=stop_words)),
            ('knn', KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights)),
        ])
        # Fit to the data
        pipeline.fit(X_train, y_train)
        # Predict based on test
        y_pred = pipeline.predict(X_test)
        
        # Evaluate using Accuracy
        score = accuracy_score(y_test, y_pred)
    return score

# Create a study to optimize the objective function
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=300)

print("Best trial:", study.best_trial.params,"Accuracy score:", study.best_trial.value)

[I 2024-09-11 21:22:04,495] A new study created in memory with name: no-name-fe1be207-d3be-4a91-a4a4-dafe9e827ce3
[I 2024-09-11 21:22:05,190] Trial 0 finished with value: 0.8085501858736059 and parameters: {'max_df': 0.9007567392247289, 'min_df': 0.028691278116156845, 'max_features': 606, 'stop_words': None, 'ngram_range': (1, 1), 'n_neighbors': 7, 'weights': 'uniform'}. Best is trial 0 with value: 0.8085501858736059.
[I 2024-09-11 21:22:05,439] Trial 1 finished with value: 0.7955390334572491 and parameters: {'max_df': 0.5557493291954404, 'min_df': 0.09271665050709609, 'max_features': 726, 'stop_words': 'english', 'ngram_range': (1, 1), 'n_neighbors': 6, 'weights': 'distance'}. Best is trial 0 with value: 0.8085501858736059.
[I 2024-09-11 21:22:06,879] Trial 2 finished with value: 0.8197026022304833 and parameters: {'max_df': 0.9211676734804299, 'min_df': 0.0702224597395895, 'max_features': 948, 'stop_words': None, 'ngram_range': (1, 2), 'n_neighbors': 5, 'weights': 'uniform'}. Best is

Best trial: {'max_df': 0.8285053051701224, 'min_df': 0.08768243191518091, 'max_features': 311, 'stop_words': None, 'ngram_range': (1, 4), 'n_neighbors': 6, 'weights': 'distance'} Accuracy score: 0.8754646840148699
