In [47]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [48]:
# Load the train / dev / test splits. Each CSV is read with no header row and
# its columns are named "label" and "text".
# NOTE(review): the saved output of this cell shows an "Unnamed: 0" column, which
# this call would not be expected to produce — the output may be stale, or the
# files may carry a header/index column. Confirm against the actual CSVs.
df_train, df_dev, df_test = (
    pd.read_csv(csv_path, header=None, names=["label", "text"])
    for csv_path in ("data/train.csv", "data/dev.csv", "data/test.csv")
)

df_train.head()  # Preview the first rows of the training split

Unnamed: 0,label,text
0,1,"Nemmen li huwa baġit tajjeb, imma nies bħali t..."
1,0,Nippremjaw lil min ilu jikser il-liġi snin u j...
2,0,Xi cuc irrid jitfacca b'kritika ta' din ix-xorta.
3,0,"Ms. Metzola, ara jekk Ms. Mizzi jkollhiex ċans..."
4,1,Naf ċert li int qegħda hemm fejn il-Mulej tkom...


In [49]:
# Predict the sentiment of a single sentence
def predict_sentiment(sentence, model, vectorizer):
    """Classify one sentence and return a human-readable sentiment string.

    Args:
        sentence: The text to classify.
        model: Object exposing ``predict(features)``; the first element of its
            result is taken as the predicted label.
        vectorizer: Object exposing ``transform(list_of_texts)``; its output is
            passed to ``model.predict`` unchanged.

    Returns:
        ``"Positive 😊"`` when the predicted label equals 1, otherwise
        ``"Negative 😠"``.
    """
    # The vectorizer is given a one-element batch holding the sentence.
    features = vectorizer.transform([sentence])
    predicted_label = model.predict(features)[0]

    if predicted_label == 1:
        return "Positive 😊"
    return "Negative 😠"

In [None]:
# TF-IDF over unigrams and bigrams, capped at 20,000 features, with
# sublinear term-frequency scaling enabled
vectorizer = TfidfVectorizer(max_features=20000, ngram_range=(1, 2), sublinear_tf=True)

# Label column of each split
y_train = df_train["label"]
y_dev = df_dev["label"]
y_test = df_test["label"]

# Fit the vectorizer on the training text only, then apply the fitted
# vectorizer to the dev and test text
X_train = vectorizer.fit_transform(df_train["text"])
X_dev = vectorizer.transform(df_dev["text"])
X_test = vectorizer.transform(df_test["text"])

In [51]:
# Train a linear-kernel SVM (C=1.0) with balanced class weights on the
# training features; fit() returns the fitted estimator itself
svm_model = SVC(kernel="linear", C=1.0, class_weight="balanced").fit(X_train, y_train)

# Score the development split
y_pred_dev = svm_model.predict(X_dev)

dev_accuracy = accuracy_score(y_true=y_dev, y_pred=y_pred_dev)
dev_precision = precision_score(y_true=y_dev, y_pred=y_pred_dev)
dev_recall = recall_score(y_true=y_dev, y_pred=y_pred_dev)
dev_f1 = f1_score(y_true=y_dev, y_pred=y_pred_dev)

# One metric per output line, four decimal places each
print(
    f"Validation Accuracy: {dev_accuracy:.4f}",
    f"Validation Precision: {dev_precision:.4f}",
    f"Validation Recall: {dev_recall:.4f}",
    f"Validation F1 Score: {dev_f1:.4f}",
    sep="\n",
)


Validation Accuracy: 0.8118
Validation Precision: 0.8824
Validation Recall: 0.5172
Validation F1 Score: 0.6522


In [52]:
# Score the held-out test split with the already-fitted SVM
y_pred_test = svm_model.predict(X_test)

test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_test)
test_precision = precision_score(y_true=y_test, y_pred=y_pred_test)
test_recall = recall_score(y_true=y_test, y_pred=y_pred_test)
test_f1 = f1_score(y_true=y_test, y_pred=y_pred_test)

# One metric per output line, four decimal places each
print(
    f"Test Accuracy: {test_accuracy:.4f}",
    f"Test Precision: {test_precision:.4f}",
    f"Test Recall: {test_recall:.4f}",
    f"Test F1 Score: {test_f1:.4f}",
    sep="\n",
)

Test Accuracy: 0.7193
Test Precision: 0.6765
Test Recall: 0.3833
Test F1 Score: 0.4894


In [53]:
# List dev-set examples where the prediction disagrees with the gold label
# (positional indices into the dev split)
misclassified = np.flatnonzero(y_pred_dev != y_dev)
print("\nMisclassified Examples:")
for i in misclassified[:10]:  # Only the first 10 disagreements are printed
    # Text line, then the label line, then a blank separator line
    print(
        f"Text: {df_dev.iloc[i]['text']}",
        f"True Label: {y_dev.iloc[i]}, Predicted: {y_pred_dev[i]}\n",
        sep="\n",
    )


Misclassified Examples:
Text: Ser nimmisjawk għażiża nanna tal-poplu Malti u Għawdxi.
True Label: 1, Predicted: 0

Text: Punt importanti hu li l-website hi bil-Malti, xi ħaġa li aħna l-Maltin għandna nkunu kburin!
True Label: 1, Predicted: 0

Text: Ħajr kbir lill-Inews tal-ferħa li xerridtu fuqi dalgħodu!!!
True Label: 1, Predicted: 0

Text: Għamilli pjaċir, titkellimx f'ismi. Jien nivvota MLP imma il-kaċċa ma ddoqlix u se nivvota biex tieqaf imqar fir-rebbiegħa. Almenu nkunu nistgħu ingawdu l-kampanja u l-għasafar aktar fit-tul.
True Label: 1, Predicted: 0

Text: Ħadd m'għandu jimponi fuq l-ieħor, iżda mill-banda l-oħra  jekk int qiegħed f'pajjiż li mhuwiex tiegħek għandek tirrispetta l-liġijiet u l-kultura tiegħu, u xorta żżomm u jkollok id-dritt li tipprattika u tħaddan dak li trid.
True Label: 1, Predicted: 0

Text: Tallinja Card b'xejn għal dawk bejn 16 u 20 sena. #maltabudget2018
True Label: 1, Predicted: 0

Text: Jekk għandhom ikunu rranġati l-inġustizzji, għandhom ikunu rranġa

In [55]:
# Smoke-test the fitted model and vectorizer on a single hand-written sentence
test_sentence = "Veru kien sabiħ"
print(
    predict_sentiment(
        sentence=test_sentence,
        model=svm_model,
        vectorizer=vectorizer,
    )
)

Positive 😊
