In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [2]:
df_train = pd.read_csv("data/train.csv", header=None, names=["label", "text"])
df_dev = pd.read_csv("data/dev.csv", header=None, names=["label", "text"])
df_test = pd.read_csv("data/test.csv", header=None, names=["label", "text"])

df_train.head()  # Show first few rows

Unnamed: 0,label,text
0,1,"Nemmen li huwa baġit tajjeb, imma nies bħali t..."
1,0,Nippremjaw lil min ilu jikser il-liġi snin u j...
2,0,Xi cuc irrid jitfacca b'kritika ta' din ix-xorta.
3,0,"Ms. Metzola, ara jekk Ms. Mizzi jkollhiex ċans..."
4,1,Naf ċert li int qegħda hemm fejn il-Mulej tkom...


In [12]:
def predict_sentiment(sentence, model, vectorizer):
    sentence_transformed = vectorizer.transform([sentence])  # Convert sentence to TF-IDF features
    prediction = model.predict(sentence_transformed)  # Predict sentiment
    sentiment = "Positive 😊" if prediction[0] == 1 else "Negative 😠"
    return sentiment

In [9]:
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)

X_train = vectorizer.fit_transform(df_train["text"])
X_dev = vectorizer.transform(df_dev["text"])
X_test = vectorizer.transform(df_test["text"])

y_train = df_train["label"]
y_dev = df_dev["label"]
y_test = df_test["label"]

In [10]:
svm_model = SVC(kernel="linear")
svm_model.fit(X_train, y_train)

# Predict on dev set
y_pred_dev = svm_model.predict(X_dev)
accuracy = accuracy_score(y_dev, y_pred_dev)
print(f"Validation Accuracy: {accuracy:.4f}")

Validation Accuracy: 0.8000


In [11]:
y_pred_test = svm_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred_test)
print(f"Test Accuracy: {test_accuracy:.4f}")

Test Accuracy: 0.7368


In [19]:
test_sentence = "Il-film sabiħ ħafna!"
print(predict_sentiment(test_sentence, svm_model, vectorizer))  

Positive 😊
