```markdown
# 📝 Text Classification with TF-IDF + SVM and MLflow

This notebook trains a **TF-IDF + SVM** classifier for text classification and logs results with **MLflow**.

Pipeline:
1. Load and preprocess dataset
2. TF-IDF vectorization
3. Train **SVM** with hyperparameter tuning
4. Calibrate classifier to output probabilities
5. Evaluate with CV, accuracy, and F1 score
6. Log results and model to **MLflow**
```

In [1]:
# 🔧 Imports
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, f1_score
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature
import nltk
from nltk.stem import WordNetLemmatizer
import re

In [2]:
# 📥 Download NLTK resources
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/medhedimaaroufi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/medhedimaaroufi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/medhedimaaroufi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# ⚙️ Set MLflow Tracking URI
mlflow.set_tracking_uri("http://localhost:8080")

In [10]:
# 📂 Load and preprocess dataset
def load_data(dataset_path="../dataset/all_tickets_processed_improved_v3.csv"):
    df = pd.read_csv(dataset_path)
    if 'Document' not in df.columns or 'Topic_group' not in df.columns:
        raise ValueError("Dataset must contain 'Document' and 'Topic_group' columns")
    
    # Stopwords + Lemmatizer
    stop_words = set(nltk.corpus.stopwords.words('english')) | {'please', 'ticket', 'help'}
    lemmatizer = WordNetLemmatizer()
    
    def preprocess_text(text):
        text = re.sub(r'[^\w\s]', '', text.lower())  
        tokens = re.findall(r'\w+', text)
        tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
        return ' '.join(tokens)
    
    df['Document'] = df['Document'].apply(preprocess_text)
    return df['Document'].values, df['Topic_group'].values

X, y = load_data()
print(f"Dataset loaded. {len(X)} samples.")

Dataset loaded. 47837 samples.


In [11]:
# 🏋️ Train TF-IDF + SVM model
def train_tfidf_svm(X, y):
    # Vectorizer
    vectorizer = TfidfVectorizer(
        ngram_range=(1, 3), 
        max_features=10000, 
        min_df=5, 
        sublinear_tf=True
    )
    X_tfidf = vectorizer.fit_transform(X)

    # Grid Search for best C
    param_grid = {'C': [0.01, 0.1, 1, 10, 100], 'penalty': ['l2']}
    svm = LinearSVC(class_weight='balanced', max_iter=2000)
    grid_search = GridSearchCV(svm, param_grid, cv=3, scoring='f1_weighted')
    grid_search.fit(X_tfidf, y)

    # Calibrate for probabilities
    best_svm = grid_search.best_estimator_
    calibrated_svm = CalibratedClassifierCV(best_svm, cv=3)
    calibrated_svm.fit(X_tfidf, y)

    # Cross-validation accuracy
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = cross_val_score(best_svm, X_tfidf, y, cv=cv, scoring='accuracy')

    # Predictions
    y_pred = calibrated_svm.predict(X_tfidf)
    probabilities = calibrated_svm.predict_proba(X_tfidf)

    return calibrated_svm, vectorizer, y, y_pred, probabilities, cv_scores.mean()

model, vectorizer, y_true, y_pred, probs, cv_accuracy = train_tfidf_svm(X, y)

In [12]:
# 📊 Evaluate model
accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred, average='weighted')

print(f"✅ Accuracy: {accuracy:.4f}")
print(f"✅ F1 Score: {f1:.4f}")
print(f"✅ CV Accuracy: {cv_accuracy:.4f}")

✅ Accuracy: 0.8866
✅ F1 Score: 0.8867
✅ CV Accuracy: 0.8525


In [13]:
# 📈 Log with MLflow
with mlflow.start_run():
    mlflow.log_param("model", "TF-IDF + SVM")
    mlflow.log_param("ngram_range", "(1, 3)")
    mlflow.log_param("max_features", 10000)
    mlflow.log_param("min_df", 5)
    mlflow.log_param("sublinear_tf", True)
    mlflow.log_param("best_C", model.calibrated_classifiers_[0].estimator.C)

    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("f1_score", f1)
    mlflow.log_metric("cv_accuracy", cv_accuracy)

    # Example input
    input_example = np.array(["new hardware issue"])
    X_example_tfidf = vectorizer.transform(input_example)
    signature = infer_signature(X_example_tfidf, model.predict(X_example_tfidf))

    mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="tfidf_svm_model",
        signature=signature,
        input_example=input_example
    )

  "inputs": [
    "new hardware issue"
  ]
}. Alternatively, you can avoid passing input example and pass model signature instead when logging the model. To ensure the input example is valid prior to serving, please try calling `mlflow.models.validate_serving_input` on the model uri and serving input example. A serving input example can be generated from model input example using `mlflow.models.convert_input_example_to_serving_input` function.
Got error: Invalid input. could not convert string to float: 'new hardware issue'


🏃 View run amazing-horse-967 at: http://localhost:8080/#/experiments/0/runs/3fd3653a2e7143afa88f340538b1db5a
🧪 View experiment at: http://localhost:8080/#/experiments/0



# ✅ Results

- Accuracy, F1, and CV accuracy are printed above.
- All metrics, parameters, and the trained model are logged in **MLflow**.
- Open MLflow UI to explore:
```bash
mlflow ui --port 8080
````

