In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

data = pd.read_csv("Q2Dataset.csv",encoding='latin1')


data.dropna(inplace=True,axis=1)

X_train, X_test, y_train, y_test = train_test_split(data['text'], data['sentiment'], test_size=0.2, random_state=42)


In [2]:
def train_and_evaluate_classifier(classifier, X_train, y_train, X_test, y_test, vectorizer):
    X_train_features = vectorizer.fit_transform(X_train)
    X_test_features = vectorizer.transform(X_test)

    classifier.fit(X_train_features, y_train)

    y_pred = classifier.predict(X_test_features)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')

    return accuracy, precision, recall, f1

def evaluate_classifiers(classifiers, X_train, y_train, X_test, y_test, vectorizers):
    results = []
    for classifier in classifiers:
        classifier_results = []
        for vectorizer in vectorizers:
            accuracy, precision, recall, f1 = train_and_evaluate_classifier(classifier, X_train, y_train, X_test, y_test, vectorizer)
            classifier_results.append((accuracy, precision, recall, f1))
        results.append(classifier_results)
    return results


In [3]:
classifiers = [
    MultinomialNB(),
    LogisticRegression(max_iter=1000),
    RandomForestClassifier(),
    SVC(),
    Perceptron()
]

vectorizers = [
    CountVectorizer(),
    CountVectorizer(ngram_range=(1, 2)),
    CountVectorizer(ngram_range=(1, 3)),
    TfidfVectorizer()
]


In [6]:
results = evaluate_classifiers(classifiers, X_train, y_train, X_test, y_test, vectorizers)

print("Classifier\t\tVectorizer\t\tAccuracy\tPrecision\tRecall\t\tF1 Score")
print("-------------------------------------------------------------------------")
for i, classifier in enumerate(classifiers):
    for j, vectorizer in enumerate(vectorizers):
        accuracy, precision, recall, f1 = results[i][j]
        print(f"{classifier.__class__.__name__}|\t{vectorizer.__class__.__name__}|\t\t{accuracy:.4f}|\t\t{precision:.4f}|\t\t{recall:.4f}|\t\t{f1:.4f}")


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classifier		Vectorizer		Accuracy	Precision	Recall		F1 Score
-------------------------------------------------------------------------
MultinomialNB|	CountVectorizer|		0.7262|		0.5256|		0.4297|		0.4258
MultinomialNB|	CountVectorizer|		0.7416|		0.5663|		0.4345|		0.4338
MultinomialNB|	CountVectorizer|		0.7429|		0.5229|		0.4347|		0.4337
MultinomialNB|	TfidfVectorizer|		0.7519|		0.6324|		0.4230|		0.4225
LogisticRegression|	CountVectorizer|		0.7481|		0.5099|		0.4519|		0.4626
LogisticRegression|	CountVectorizer|		0.7468|		0.5285|		0.4484|		0.4599
LogisticRegression|	CountVectorizer|		0.7391|		0.5335|		0.4403|		0.4517
LogisticRegression|	TfidfVectorizer|		0.7404|		0.5448|		0.4348|		0.4447
RandomForestClassifier|	CountVectorizer|		0.7494|		0.5853|		0.4404|		0.4505
RandomForestClassifier|	CountVectorizer|		0.7378|		0.5650|		0.4279|		0.4381
RandomForestClassifier|	CountVectorizer|		0.7314|		0.5686|		0.4258|		0.4316
RandomForestClassifier|	TfidfVectorizer|		0.7416|		0.5410|		0.4369|		0.4447
SVC|	C