In [None]:
# Run Colab's user authentication flow for this kernel session.
# NOTE(review): a later cell obtains credentials via google.auth.default();
# presumably that relies on this call having completed first — confirm.
from google.colab import auth
auth.authenticate_user()

In [None]:
pip install scikit-learn pandas numpy nltk



In [None]:
import google.auth
import gspread
import pandas as pd

# Where the labelled sample lives and which columns the preview shows.
DRIVE_SCOPES = ['https://www.googleapis.com/auth/drive']
SPREADSHEET_TITLE = "Copy of sample_for_manual_label"
SELECTED_COLUMNS = ['clean_text', 'Manual_Category']

# Build an authorised gspread client from the default Google credentials.
creds, _ = google.auth.default(scopes=DRIVE_SCOPES)
gc = gspread.authorize(creds)

# Open the workbook by title and pull every row of its first worksheet.
sh = gc.open(SPREADSHEET_TITLE)
worksheet = sh.sheet1
records = worksheet.get_all_records()

# `df` keeps all columns; `df_selected` is the two-column selection printed below.
df = pd.DataFrame(records)
df_selected = df[SELECTED_COLUMNS]

print(df_selected.head())


                                          clean_text     Manual_Category
0  what are my chances of getting canadian studen...      Student Permit
1  bc pnp international post graduate as a phd st...                 PNP
2  case specific inquiry submitted a case specifi...  Family Sponsorship
3  would this year be counted in family sponsorsh...  Family Sponsorship
4  can i use lmia i received to get cwp in a pr a...       Express Entry


In [None]:
import re
import string

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV,
    cross_validate,
    KFold,
    StratifiedKFold,
)
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# =======================
# 1. Data Preprocessing
# =======================

# Patterns are compiled once at definition time instead of being rebuilt per call.
# A tag must start with a letter (optionally preceded by '/'), so a bare '<' or '>'
# used as a comparison sign is not treated as markup.
_HTML_TAG_RE = re.compile(r'</?[A-Za-z][^>]*>')
_PUNCT_DIGIT_RE = re.compile(f"[{re.escape(string.punctuation + string.digits)}]")
_WHITESPACE_RE = re.compile(r'\s+')


def clean_text(text):
    """
    Normalise a piece of text for bag-of-words style feature extraction.

    Steps, in order:
    - Replace HTML tags with a space (so words on either side are not fused)
    - Remove ASCII punctuation and digits
    - Convert text to lowercase
    - Collapse runs of whitespace and trim both ends

    Args:
        text: The raw text. ``None`` is treated as empty; any other non-string
            value is converted with ``str()`` before cleaning.

    Returns:
        The cleaned string (possibly empty).
    """
    if text is None:
        return ""
    if not isinstance(text, str):
        text = str(text)

    # Swap tags for a space rather than deleting them: "a<br>b" must stay two tokens.
    text = _HTML_TAG_RE.sub(' ', text)
    # Punctuation and digits are dropped outright (e.g. "what's" -> "whats").
    text = _PUNCT_DIGIT_RE.sub('', text)
    text = text.lower()
    # Collapse whatever spacing the steps above left behind.
    return _WHITESPACE_RE.sub(' ', text).strip()

# `df` is expected to exist already, with "clean_text" and "Manual_Category" columns.
# Normalise every text entry (cast to str first so each value goes through clean_text).
df['clean_text'] = df['clean_text'].astype(str).apply(clean_text)

# Label frequencies before any filtering.
class_counts = df['Manual_Category'].value_counts()
print("[INFO] Original class distribution:")
print(class_counts)

# Labels that occur only once are removed before the stratified split below.
rare_classes = class_counts.loc[class_counts.lt(2)].index
if not rare_classes.empty:
    print("[INFO] Dropping classes with fewer than 2 samples:", list(rare_classes))
    keep_rows = ~df['Manual_Category'].isin(rare_classes)
    df = df.loc[keep_rows]
    print("[INFO] Updated class distribution:")
    print(df['Manual_Category'].value_counts())

# Features are the cleaned text; targets are the manual labels.
X, y = df['clean_text'], df['Manual_Category']

# 80/20 split, stratified on the label so both parts keep the class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

print(f"[INFO] Training samples: {len(X_train)}")
print(f"[INFO] Test samples: {len(X_test)}")

# =======================
# 2. Model Training
# =======================

# Both baselines share the same shape: default TF-IDF features feeding a classifier.

# 2.1 Pipeline for Logistic Regression
pipeline_lr = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression(max_iter=1000, random_state=42)),
])

# 2.2 Pipeline for SVM
pipeline_svm = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', SVC(random_state=42)),
])

# Fit on the training split, then predict the held-out split, one model at a time.
print("\n[Training] Logistic Regression model...")
pipeline_lr.fit(X_train, y_train)
print("[Predicting] Using Logistic Regression on test set...")
y_pred_lr = pipeline_lr.predict(X_test)

print("\n[Training] SVM model...")
pipeline_svm.fit(X_train, y_train)
print("[Predicting] Using SVM on test set...")
y_pred_svm = pipeline_svm.predict(X_test)

# =======================
# 5. Hyperparameter Tuning: GridSearchCV tuning Logistic Regression
# =======================

# Search space (3 x 3 x 2 = 18 combinations):
#   tfidf__ngram_range - unigrams only vs. unigrams + bigrams
#   tfidf__max_df      - upper document-frequency cut-off for vocabulary terms
#   clf__C             - Logistic Regression regularization parameter
param_grid = {
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'tfidf__max_df': [0.75, 0.85, 1.0],
    'clf__C': [0.1, 1, 10],
}

# 5-fold search over the Logistic Regression pipeline, scored by accuracy,
# using all available cores.
grid_search = GridSearchCV(
    estimator=pipeline_lr,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

print("\n[Hyperparameter Tuning] Running GridSearchCV for tuning...")
grid_search.fit(X_train, y_train)

print("\n[Results] Best parameters:", grid_search.best_params_)
print("[Results] Best cross-validation accuracy: {:.4f}".format(grid_search.best_score_))

# Take the best estimator found by the search and score it on the held-out test split.
best_model = grid_search.best_estimator_
y_pred_tuned = best_model.predict(X_test)
accuracy_tuned = accuracy_score(y_test, y_pred_tuned)

print("\n============================")
print("After Tuning Logistic Regression - Test Set Evaluation")
print("============================")
print("Accuracy:", accuracy_tuned)
print("Classification Report:")
print(classification_report(y_test, y_pred_tuned))

# =======================
# 6. Model Evaluation and Validation: K-Fold Cross Validation on Logistic Regression
# =======================

# 5-fold cross validation, shuffled with a fixed seed for reproducibility.
# The folds are stratified on the label: the classes are imbalanced and the metrics
# below are macro-averaged, so every fold should contain each class in roughly the
# same proportion. This also matches the stratified train/test split used earlier.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Metrics to collect per fold (macro-averaged for the multi-class case).
scoring = {
    'accuracy': 'accuracy',
    'precision': 'precision_macro',
    'recall': 'recall_macro',
    'f1': 'f1_macro'
}

print("\n[K-Fold Cross Validation] Evaluating model generalizability using 5-fold cross validation...")

# NOTE: this runs on all of X, y (training and test rows together), so these scores
# are not independent of the held-out test rows reported further down.
cv_results = cross_validate(best_model, X, y, cv=kf, scoring=scoring, n_jobs=-1)

# Mean and standard deviation across folds for each metric.
print("\n[Cross Validation Results]")
for metric_label, result_key in (
    ("Accuracy", "test_accuracy"),
    ("Precision", "test_precision"),
    ("Recall", "test_recall"),
    ("F1 Score", "test_f1"),
):
    fold_scores = cv_results[result_key]
    print("{}: {:.4f} ± {:.4f}".format(metric_label, fold_scores.mean(), fold_scores.std()))

# Evaluate the tuned model on the test set separately
print("\n============================")
print("Tuned Logistic Regression - Test Set Evaluation")
print("============================")
y_pred_best = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred_best)
print("Accuracy:", test_accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred_best))

# =======================
# 3. Evaluation & Reporting
# =======================

# Test-set accuracy of the two untuned baselines.
accuracy_lr = accuracy_score(y_test, y_pred_lr)
accuracy_svm = accuracy_score(y_test, y_pred_svm)

# Same report layout for each model: banner, accuracy, per-class report.
for banner, model_accuracy, model_preds in (
    ("Logistic Regression - Test Set Evaluation", accuracy_lr, y_pred_lr),
    ("SVM - Test Set Evaluation", accuracy_svm, y_pred_svm),
):
    print("\n============================")
    print(banner)
    print("============================")
    print("Accuracy:", model_accuracy)
    print("Classification Report:")
    print(classification_report(y_test, model_preds))


[INFO] Original class distribution:
Manual_Category
Other                 362
Express Entry         212
Family Sponsorship    133
Student Permit        107
Work Permit           104
PNP                    94
PGWP                   81
Refugee                46
Name: count, dtype: int64
[INFO] Training samples: 911
[INFO] Test samples: 228

[Training] Logistic Regression model...
[Predicting] Using Logistic Regression on test set...

[Training] SVM model...
[Predicting] Using SVM on test set...

[Hyperparameter Tuning] Running GridSearchCV for tuning...
Fitting 5 folds for each of 18 candidates, totalling 90 fits

[Results] Best parameters: {'clf__C': 10, 'tfidf__max_df': 0.75, 'tfidf__ngram_range': (1, 1)}
[Results] Best cross-validation accuracy: 0.6882

After Tuning Logistic Regression - Test Set Evaluation
Accuracy: 0.706140350877193
Classification Report:
                    precision    recall  f1-score   support

     Express Entry       0.56      0.64      0.60        42
Family S