In [66]:
import pandas as pd
import optuna
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
import warnings
from sklearn.model_selection import cross_val_score

### Step 1 - Import Train and Test Data

In [67]:
# Load training set from CSV
train_data = pd.read_csv("train_data.csv")

# Load test set from CSV
test_data = pd.read_csv("test_data.csv")

### Step 2 - Create data from training and testing 


In [68]:
X_train = train_data["preprocessed_text"]
y_train = train_data["clause_type"]
X_test = test_data["preprocessed_text"]
y_test = test_data["clause_type"]

### Step 3 - Apply Tfidf Vectorization

In [69]:
vectorizer = TfidfVectorizer()

# Fit and transform the training data
X_train_tfidf = vectorizer.fit_transform(X_train)

# Transform the test data (use only transform, no fit)
X_test_tfidf = vectorizer.transform(X_test)

### Step 4 - Train the KNN Classifier

In [70]:
knn = KNeighborsClassifier(n_neighbors=3)  # Use 3 nearest neighbors
knn.fit(X_train_tfidf, y_train)

### Step 5 - Make the predictions and calculate the accuracy

In [72]:
y_pred = knn.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.80


### Step 6 - Hyperparameter Optimisation

In [73]:
def objective(trial):
    ## getting repetitive warnings on running the code so switched these off
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore')
    
        # Define hyperparameters to tune
        max_df = trial.suggest_float('max_df', 0.5, 1.0)
        min_df = trial.suggest_float('min_df', 0.0, 0.49)
        max_features = trial.suggest_int('max_features', 10, 1000)
        stop_words = trial.suggest_categorical('stop_words', ['english', None])
        ngram_range = trial.suggest_categorical('ngram_range', [(1, 1), (1, 2), (1, 3), (1, 4)])
        n_neighbors = trial.suggest_int('n_neighbors', 1, 10)
        weights = trial.suggest_categorical('weights', ['uniform', 'distance'])

        # Create pipeline
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(max_df=max_df, min_df=min_df, ngram_range=ngram_range, max_features=max_features, stop_words=stop_words)),
            ('knn', KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights)),
        ])
        
        cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='accuracy')
        score = cv_scores.mean()  # use mean accuracy as the score
    return score

# Create a study to optimize the objective function
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=300)

print("Best trial:", study.best_trial.params,"Accuracy score:", study.best_trial.value)

[I 2024-09-18 15:31:33,419] A new study created in memory with name: no-name-b5e0949d-1b64-4bad-9577-d5025b34d109
[I 2024-09-18 15:31:36,072] Trial 0 finished with value: 0.8182470242101418 and parameters: {'max_df': 0.5483766664839926, 'min_df': 0.042609933966360314, 'max_features': 456, 'stop_words': None, 'ngram_range': (1, 3), 'n_neighbors': 8, 'weights': 'uniform'}. Best is trial 0 with value: 0.8182470242101418.
[I 2024-09-18 15:31:36,990] Trial 1 finished with value: 0.8176143684018229 and parameters: {'max_df': 0.6553632848273134, 'min_df': 0.13731473183122034, 'max_features': 56, 'stop_words': None, 'ngram_range': (1, 2), 'n_neighbors': 6, 'weights': 'distance'}. Best is trial 0 with value: 0.8182470242101418.
[I 2024-09-18 15:31:39,629] Trial 2 finished with value: 0.8244370517085553 and parameters: {'max_df': 0.5154964992968765, 'min_df': 0.049101657251867124, 'max_features': 491, 'stop_words': 'english', 'ngram_range': (1, 3), 'n_neighbors': 5, 'weights': 'distance'}. Best 

Best trial: {'max_df': 0.693372321103527, 'min_df': 0.08413292078757699, 'max_features': 332, 'stop_words': None, 'ngram_range': (1, 2), 'n_neighbors': 7, 'weights': 'distance'} Accuracy score: 0.8467684556660193


#### Best with Simple preprocessing
Best trial: {'max_df': 0.7859256640614342, 'min_df': 0.04713111258556, 'max_features': 820, 'stop_words': None, 'ngram_range': (1, 2), 'n_neighbors': 10, 'weights': 'distance'} Accuracy score: 0.8393381151087438

#### Best with Improved preprocessing<br>
Best trial: {'max_df': 0.693372321103527, 'min_df': 0.08413292078757699, 'max_features': 332, 'stop_words': None, 'ngram_range': (1, 2), 'n_neighbors': 7, 'weights': 'distance'} Accuracy score: 0.8467684556660193


### Step 7 - Revectorise, Retrain and predict the test set

In [1]:
vectorizer = TfidfVectorizer(max_df=0.693372321103527, min_df=0.08413292078757699, ngram_range=(1,2), max_features=332, stop_words=None)

# Fit and transform the training data
X_train_tfidf2 = vectorizer.fit_transform(X_train)

# Transform the test data (use only transform, no fit)
X_test_tfidf2 = vectorizer.transform(X_test)

# Retrain KNN
knn = KNeighborsClassifier(n_neighbors=7, weights = 'distance')  
knn.fit(X_train_tfidf2, y_train)

y_pred2 = knn.predict(X_test_tfidf2)

accuracy2 = accuracy_score(y_test, y_pred2)
print(f"Accuracy: {accuracy2:.3f}")


NameError: name 'TfidfVectorizer' is not defined

Best with simple preprocessing = 0.83