In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Check the file encoding

In [18]:
!pip install chardet



In [19]:
import chardet

with open('/content/Q2 Sentiment Analysis Dataset.csv', 'rb') as f:
    encoding = chardet.detect(f.read())['encoding']

print(f"File encoding: {encoding}")

File encoding: ISO-8859-1


# Read the file with the correct encoding

In [20]:
df = pd.read_csv('/content/Q2 Sentiment Analysis Dataset.csv', encoding=encoding)
df.head(10)

Unnamed: 0,id,sentiment,date,text,Unnamed: 4,Unnamed: 5
0,623495523,1,Mon Dec 01 20:46:01 +0000 2014,WTF MY BATTERY WAS 31% ONE SECOND AGO AND NOW ...,,
1,623495527,1,Mon Dec 01 21:09:50 +0000 2014,@apple Contact sync between Yosemite and iOS8 ...,,
2,623495529,1,Mon Dec 01 21:35:14 +0000 2014,WARNING IF YOU BUY AN IPHONE 5S UNLOCKED FROM ...,,
3,623495536,1,Mon Dec 01 23:55:55 +0000 2014,"@Apple, For the love of GAWD, CENTER the '1'on...",,
4,623495537,1,Tue Dec 02 00:06:05 +0000 2014,i get the storage almost full notification lit...,,
5,623495538,1,Tue Dec 02 00:14:25 +0000 2014,I had to do made the #switch from iPhone 6 to ...,,
6,623495539,1,Tue Dec 02 00:15:11 +0000 2014,@ me RT @101Baemations: Can't stand those ppl ...,,
7,623495552,1,Tue Dec 02 00:24:47 +0000 2014,That flash crash really screwed with a lot of ...,,
8,623495554,1,Tue Dec 02 00:27:23 +0000 2014,Nigga update yall headphones @Apple,,
9,623495556,1,Tue Dec 02 00:28:38 +0000 2014,RT @thehill: Justice Department cites 18th cen...,,


# Drop Unnamed Columns

In [21]:
df = df.dropna(axis=1, how='all')
df.head(10)

Unnamed: 0,id,sentiment,date,text
0,623495523,1,Mon Dec 01 20:46:01 +0000 2014,WTF MY BATTERY WAS 31% ONE SECOND AGO AND NOW ...
1,623495527,1,Mon Dec 01 21:09:50 +0000 2014,@apple Contact sync between Yosemite and iOS8 ...
2,623495529,1,Mon Dec 01 21:35:14 +0000 2014,WARNING IF YOU BUY AN IPHONE 5S UNLOCKED FROM ...
3,623495536,1,Mon Dec 01 23:55:55 +0000 2014,"@Apple, For the love of GAWD, CENTER the '1'on..."
4,623495537,1,Tue Dec 02 00:06:05 +0000 2014,i get the storage almost full notification lit...
5,623495538,1,Tue Dec 02 00:14:25 +0000 2014,I had to do made the #switch from iPhone 6 to ...
6,623495539,1,Tue Dec 02 00:15:11 +0000 2014,@ me RT @101Baemations: Can't stand those ppl ...
7,623495552,1,Tue Dec 02 00:24:47 +0000 2014,That flash crash really screwed with a lot of ...
8,623495554,1,Tue Dec 02 00:27:23 +0000 2014,Nigga update yall headphones @Apple
9,623495556,1,Tue Dec 02 00:28:38 +0000 2014,RT @thehill: Justice Department cites 18th cen...


In [22]:
X = df['text']  # Review text
y = df['sentiment']  # Sentiment labels

# Split the dataset into training and testing sets

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Models to train

In [24]:
models = {
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(),
    "Perceptron": Perceptron()
}

In [25]:
def evaluate_models(X_train, X_test, y_train, y_test, feature_name):
    res = []

    # Iterate through models using a while loop
    model_names = list(models.keys())  # Get list of model names
    i = 0  # Initialize index for the while loop
    while i < len(model_names):
        name = model_names[i]
        model = models[name]

        # Fit the model to the training data
        model.fit(X_train, y_train)

        # Make predictions on the test data
        predictions = model.predict(X_test)

        # Calculate evaluation metrics
        accuracy = accuracy_score(y_test, predictions)
        precision, recall, fscore, _ = precision_recall_fscore_support(y_test, predictions, average='macro')
        precision_micro, recall_micro, fscore_micro, _ = precision_recall_fscore_support(y_test, predictions, average='micro')

        # Append res to the list
        res.append({
            'Model': name,
            'Feature Extraction': feature_name,
            'Accuracy': accuracy,
            'Precision (Macro)': precision,
            'Recall (Macro)': recall,
            'F1-Score (Macro)': fscore,
            'Precision (Micro)': precision_micro,
            'Recall (Micro)': recall_micro,
            'F1-Score (Micro)': fscore_micro
        })

        i += 1

    return pd.DataFrame(res)

In [26]:
def transform_and_evaluate(X_train, X_test, y_train, y_test):
    # Define the vectorizers
    vectorizers = {
        "BoW Raw Counts": CountVectorizer(),
        "BoW TfIDF": TfidfVectorizer(),
        "N-grams": CountVectorizer(ngram_range=(1, 3))
    }

    # Create an empty DataFrame to store results
    overall_result = pd.DataFrame()

    # Loop through the vectorizers using a while loop
    v = list(vectorizers.keys())
    i = 0
    while i < len(v):
        vect_name = v[i]
        vectorizer = vectorizers[vect_name]

        # Transform the data using the current vectorizer
        X_train_transformed = vectorizer.fit_transform(X_train)
        X_test_transformed = vectorizer.transform(X_test)

        # Evaluate models and get results
        results = evaluate_models(X_train_transformed, X_test_transformed, y_train, y_test, vect_name)

        # Concatenate results to the overall DataFrame
        overall_result = pd.concat([overall_result, results], ignore_index=True)

        i += 1

    return overall_result

In [27]:
final_results = transform_and_evaluate(X_train, X_test, y_train, y_test)
final_results

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Model,Feature Extraction,Accuracy,Precision (Macro),Recall (Macro),F1-Score (Macro),Precision (Micro),Recall (Micro),F1-Score (Micro)
0,Naive Bayes,BoW Raw Counts,0.726221,0.525596,0.429674,0.42585,0.726221,0.726221,0.726221
1,Logistic Regression,BoW Raw Counts,0.748072,0.509881,0.451867,0.462568,0.748072,0.748072,0.748072
2,Random Forest,BoW Raw Counts,0.745501,0.555182,0.442841,0.455846,0.745501,0.745501,0.745501
3,SVM,BoW Raw Counts,0.735219,0.586433,0.418947,0.42197,0.735219,0.735219,0.735219
4,Perceptron,BoW Raw Counts,0.733933,0.545199,0.463601,0.485623,0.733933,0.733933,0.733933
5,Naive Bayes,BoW TfIDF,0.751928,0.632412,0.423037,0.422528,0.751928,0.751928,0.751928
6,Logistic Regression,BoW TfIDF,0.74036,0.544822,0.434823,0.44473,0.74036,0.74036,0.74036
7,Random Forest,BoW TfIDF,0.733933,0.551638,0.426947,0.431757,0.733933,0.733933,0.733933
8,SVM,BoW TfIDF,0.750643,0.587932,0.440222,0.451234,0.750643,0.750643,0.750643
9,Perceptron,BoW TfIDF,0.700514,0.506361,0.473202,0.480116,0.700514,0.700514,0.700514


In [28]:
final_results.to_csv('final_results.csv', index=False)