In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import VotingClassifier

In [5]:
# Load the dataset
df = pd.read_csv('Tweets.csv')

In [6]:
# Data preprocessing
X = df['text']
y = df['airline_sentiment']


In [7]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
# Convert text data to numerical features using TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [9]:
# Ensemble model with Random Forest and Gradient Boosting
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
gb_classifier = GradientBoostingClassifier(n_estimators=100, random_state=42)

In [10]:
# Create a voting ensemble classifier
ensemble_classifier = VotingClassifier(estimators=[('rf', rf_classifier), ('gb', gb_classifier)], voting='hard')

In [11]:
# Fit the ensemble model
ensemble_classifier.fit(X_train_tfidf, y_train)

In [12]:
# Predict sentiment on the test set
y_pred = ensemble_classifier.predict(X_test_tfidf)

In [13]:
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)


In [14]:
# Print the accuracy
print(f'Ensemble Accuracy: {accuracy:.2f}')

Ensemble Accuracy: 0.75


In [15]:
# Classification report and confusion matrix
print(classification_report(y_test, y_pred))
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

              precision    recall  f1-score   support

    negative       0.76      0.97      0.85      1889
     neutral       0.66      0.30      0.41       580
    positive       0.82      0.43      0.57       459

    accuracy                           0.75      2928
   macro avg       0.75      0.57      0.61      2928
weighted avg       0.75      0.75      0.72      2928

[[1837   35   17]
 [ 383  172   25]
 [ 209   52  198]]
