In [39]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
import joblib

# Pipeline

In [None]:
# Train and evaluate a TF-IDF + linear SVM polarity classifier.
# `vectorizer` and `svm` are notebook-level names reused by the cells below
# (custom-text check, vocabulary export, model saving), so they are kept as-is.

SEED = 67  # single seed shared by the train/test split and the classifier

df = pd.read_csv("./data/data_unnormalized.csv")

X = df["sentence"]
Y = df["polarity"]

# Hold out 20% of the rows for evaluation; the seed pins the split.
xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=SEED,
)

# Unigrams through trigrams; terms appearing in fewer than 2 training
# documents are dropped and the vocabulary is capped at 7000 features.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
    max_features=7000,
    min_df=2,
)

# Fit the vocabulary/IDF weights on the training split only, then apply the
# already-fitted vectorizer to the held-out split.
xtrainVec = vectorizer.fit_transform(xtrain)
xtestVec = vectorizer.transform(xtest)

# random_state makes the solver's data shuffling repeatable, so re-running the
# notebook yields the same fitted model (and the same saved artifact).
svm = LinearSVC(class_weight='balanced', random_state=SEED)
svm.fit(xtrainVec, ytrain)

ytestPred = svm.predict(xtestVec)
print(classification_report(ytest, ytestPred))

              precision    recall  f1-score   support

    negative       0.68      0.58      0.62       311
     neutral       0.73      0.78      0.76       254
    positive       0.77      0.81      0.79       556

    accuracy                           0.74      1121
   macro avg       0.73      0.72      0.72      1121
weighted avg       0.74      0.74      0.74      1121



# Test on custom text

In [38]:
# Spot-check the trained model on one hand-written sentence.
testTXT = "کارت گرافیکی که خریدم واقعا حرف نداشت"

# transform() expects an iterable of documents, so wrap the single sentence
# in a list before vectorizing with the already-fitted vectorizer.
singleDocBatch = [testTXT]
resVEC = vectorizer.transform(singleDocBatch)

# predict() returns one label per input document.
polarity = svm.predict(resVEC)
print(polarity)

['positive']


# Export vocab

In [None]:
# Dump the fitted vocabulary to vocab.txt, one "<feature index>: <term>" entry
# per line, in the order returned by get_feature_names_out().
with open("vocab.txt", mode="w", encoding='utf-8') as f:
    f.writelines(
        f"{i}: {word}\n"
        for i, word in enumerate(vectorizer.get_feature_names_out())
    )

# Save Model

In [40]:
# Persist the classifier and the fitted vectorizer as two separate files.
# Both must be loaded together at inference time: the model only understands
# feature vectors produced by this exact vectorizer.
joblib.dump(value=svm, filename="svm_model.pkl")
joblib.dump(value=vectorizer, filename="vectorizer.pkl")

['vectorizer.pkl']