In [1]:
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
df = pd.read_csv("sample_data.csv")

In [3]:
df['label'].value_counts()

label
ft     11226
pkg     9617
ct      5061
mr      5016
ch      3688
cnc     2587
Name: count, dtype: int64

In [4]:
def contains_non_german(text):
    pattern = r'[^a-zA-ZäöüÄÖÜß\s]'  
    return bool(pd.Series(text).str.contains(pattern, regex=True).any())

df['contains_non_german'] = df['text'].apply(contains_non_german)
df_filtered = df[~df['contains_non_german']]

print(df_filtered)

                                text label  contains_non_german
0                      zucker fabrik    ft                False
1       Lebensmittel kommssionierung    ft                False
2                    geländer biegen    mr                False
3       gebäudeausrüstung technische    ct                False
4              kürbiskernöl softgels    ft                False
...                              ...   ...                  ...
37290      spirituosen dienstleister    ft                False
37291         mini hydraulikzylinder    ct                False
37292  blockbodenbeutel verpackungen   pkg                False
37293            Drehteile verpacken    mr                False
37294                   bagger tanks    ct                False

[33512 rows x 3 columns]


In [5]:
df_filtered.tail(10)

Unnamed: 0,text,label,contains_non_german
37284,Zip Beutel,pkg,False
37285,wasserstoff druckregelventile,ch,False
37286,big bag entleeranlagen,pkg,False
37288,gas compressors,ch,False
37289,Werkzeugschleifereien Heckerstieg,mr,False
37290,spirituosen dienstleister,ft,False
37291,mini hydraulikzylinder,ct,False
37292,blockbodenbeutel verpackungen,pkg,False
37293,Drehteile verpacken,mr,False
37294,bagger tanks,ct,False


In [6]:
df_filtered = df_filtered.dropna()

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import joblib

#lower casing the data
df_filtered['text'] = df_filtered['text'].str.lower()

#Converting text to vectors
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(df_filtered['text'])

vectorizer_filename = "tfidf_vectorizer.pkl"
joblib.dump(vectorizer, vectorizer_filename)

#Encoding the labels to neumeric values
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(df_filtered['label'])
encoder_filename = "label_encoder.pkl"
joblib.dump(label_encoder, encoder_filename)

#Splitting the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
import joblib

# Defining different models to train
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'SVM': SVC(decision_function_shape='ovo'),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier()
}

# Training and evaluating each model
for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy for {name}: {accuracy}")
    print(classification_report(y_test, y_pred))
    print("="*50)

    # Save the trained model using joblib
    model_filename = f"{name}_model.pkl"
    joblib.dump(model, model_filename)
    print(f"Model saved as {model_filename}")

Training Logistic Regression...
Accuracy for Logistic Regression: 0.83828136655229
              precision    recall  f1-score   support

           0       0.95      0.74      0.83       665
           1       0.84      0.73      0.78       397
           2       0.93      0.79      0.85       890
           3       0.76      0.95      0.85      2061
           4       0.84      0.75      0.80       882
           5       0.87      0.84      0.86      1808

    accuracy                           0.84      6703
   macro avg       0.87      0.80      0.83      6703
weighted avg       0.85      0.84      0.84      6703

Model saved as Logistic Regression_model.pkl
Training SVM...
Accuracy for SVM: 0.8488736386692526
              precision    recall  f1-score   support

           0       0.95      0.74      0.84       665
           1       0.88      0.80      0.83       397
           2       0.93      0.80      0.86       890
           3       0.78      0.94      0.85      2061
     

In [27]:
import joblib

# Load your trained TF-IDF vectorizer
vectorizer = joblib.load('tfidf_vectorizer.pkl')

# Load your trained model
model = joblib.load('SVM_model.pkl')

# Get input text from the user
text = input("Enter text: ")

# Transform the input text using the fitted vectorizer
new_text_features = vectorizer.transform([text])

# Make predictions
y_pred = model.predict(new_text_features)
encoder = joblib.load("label_encoder.pkl")
y_pred_text = encoder.inverse_transform(y_pred)
y_pred_text[0]

'ft'

In [15]:
from sklearn.model_selection import GridSearchCV 
from sklearn.svm import SVC

# defining parameter range 
param_grid = {'C': [0.1, 1, 10, 100, 1000],  
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
              'kernel': ['rbf']}  
  
grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3) 
  
# fitting the model for grid search 
grid.fit(X_train, y_train) 

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.783 total time=  22.0s
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.785 total time=  21.5s
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.785 total time=  21.6s
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.769 total time=  21.3s
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.787 total time=  21.6s
[CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.586 total time=  26.5s
[CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.589 total time=  26.6s
[CV 3/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.588 total time=  26.6s
[CV 4/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.573 total time=  26.3s
[CV 5/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.581 total time=  26.5s
[CV 1/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.312 total time=  28.5s
[CV 2/5] END .....C=0.1, gamma=0.01, kernel=rbf

In [19]:
print(grid.best_params_) 
print(grid.best_estimator_) 

{'C': 10, 'gamma': 1, 'kernel': 'rbf'}
SVC(C=10, gamma=1)


In [20]:
from sklearn.metrics import accuracy_score, classification_report
grid_predictions = grid.predict(X_test)
print(classification_report(y_test, grid_predictions)) 

              precision    recall  f1-score   support

           0       0.95      0.75      0.84       665
           1       0.84      0.82      0.83       397
           2       0.92      0.80      0.86       890
           3       0.79      0.94      0.85      2061
           4       0.86      0.76      0.80       882
           5       0.88      0.87      0.87      1808

    accuracy                           0.85      6703
   macro avg       0.87      0.82      0.84      6703
weighted avg       0.86      0.85      0.85      6703



In [21]:
best_svm_model = grid.best_estimator_
joblib.dump(best_svm_model, 'best_svm_model.pkl')

['best_svm_model.pkl']