# Tweets Dataset

In [None]:
# Colab helper for attaching Google Drive, plus pandas for reading the CSV.
from google.colab import drive
import pandas as pd

# Attach Google Drive so files under MyDrive are reachable at /content/drive.
drive.mount('/content/drive')

# Read the tweets CSV, decoding the file as ISO-8859-1.
tweet_df = pd.read_csv(
    '/content/drive/MyDrive/Tweets.csv',
    encoding="ISO-8859-1",
)

# Show the loaded frame; pandas abbreviates long frames when printed.
print(tweet_df)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
                 tweet_id airline_sentiment  airline_sentiment_confidence  \
0      570306133677760513           neutral                        1.0000   
1      570301130888122368          positive                        0.3486   
2      570301083672813571           neutral                        0.6837   
3      570301031407624196          negative                        1.0000   
4      570300817074462722          negative                        1.0000   
...                   ...               ...                           ...   
14635  569587686496825344          positive                        0.3487   
14636  569587371693355008          negative                        1.0000   
14637  569587242672398336           neutral                        1.0000   
14638  569587188687634433          negative                        1.0000   
14639  56958714049086668

# Naive Bayes Classifier


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the tweets CSV from the mounted Drive path (decoded as ISO-8859-1).
df = pd.read_csv(
    '/content/drive/MyDrive/Tweets.csv',
    encoding="ISO-8859-1",
)

# Features are the tweet text; the target is the sentiment label column.
X, y = df['text'], df['airline_sentiment']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the TF-IDF vocabulary (capped at 5000 terms) on the training text only,
# then reuse that fitted vocabulary to encode the held-out text.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# fit() returns the estimator itself, so construction and training are chained.
model = MultinomialNB().fit(X_train_tfidf, y_train)

# Predict sentiment labels for the held-out tweets.
y_pred = model.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Each heading is followed by its report on the next line.
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")


Accuracy: 0.74
Classification Report:
              precision    recall  f1-score   support

    negative       0.72      0.99      0.84      1889
     neutral       0.76      0.24      0.37       580
    positive       0.91      0.32      0.48       459

    accuracy                           0.74      2928
   macro avg       0.80      0.52      0.56      2928
weighted avg       0.76      0.74      0.69      2928

Confusion Matrix:
[[1869   17    3]
 [ 427  142   11]
 [ 283   27  149]]


# Random Forest Classifier

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the tweets CSV from the mounted Drive path (decoded as ISO-8859-1).
df = pd.read_csv('/content/drive/MyDrive/Tweets.csv', encoding="ISO-8859-1")

# Features are the tweet text; the target is the sentiment label column.
X = df['text']
y = df['airline_sentiment']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the TF-IDF vocabulary (capped at 5000 terms) on the training text only,
# then reuse that fitted vocabulary to encode the held-out text.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# n_jobs=-1 builds and queries the 100 trees in parallel on all available cores.
# The fixed random_state still seeds every tree the same way, so the fitted
# forest and its predictions do not depend on the degree of parallelism.
model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train_tfidf, y_train)

# Predict sentiment labels for the held-out tweets.
y_pred = model.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))



Accuracy: 0.77
Classification Report:
              precision    recall  f1-score   support

    negative       0.78      0.96      0.86      1889
     neutral       0.67      0.40      0.50       580
    positive       0.81      0.46      0.59       459

    accuracy                           0.77      2928
   macro avg       0.76      0.61      0.65      2928
weighted avg       0.76      0.77      0.75      2928

Confusion Matrix:
[[1815   56   18]
 [ 317  231   32]
 [ 190   56  213]]


# K-Nearest Neighbors (KNN) Classifier

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the tweets CSV from the mounted Drive path (decoded as ISO-8859-1).
df = pd.read_csv(
    '/content/drive/MyDrive/Tweets.csv',
    encoding="ISO-8859-1",
)

# Features are the tweet text; the target is the sentiment label column.
X, y = df['text'], df['airline_sentiment']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the TF-IDF vocabulary (capped at 5000 terms) on the training text only,
# then reuse that fitted vocabulary to encode the held-out text.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Each prediction is a vote among the 5 nearest training tweets.
# fit() returns the estimator itself, so construction and training are chained.
knn_model = KNeighborsClassifier(n_neighbors=5).fit(X_train_tfidf, y_train)

# Predict sentiment labels for the held-out tweets.
y_pred = knn_model.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Each heading is followed by its report on the next line.
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")


Accuracy: 0.72
Classification Report:
              precision    recall  f1-score   support

    negative       0.81      0.83      0.82      1889
     neutral       0.46      0.48      0.47       580
    positive       0.66      0.58      0.62       459

    accuracy                           0.72      2928
   macro avg       0.64      0.63      0.64      2928
weighted avg       0.72      0.72      0.72      2928

Confusion Matrix:
[[1559  251   79]
 [ 243  281   56]
 [ 112   83  264]]


# Logistic Regression Classifier

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the tweets CSV from the mounted Drive path (decoded as ISO-8859-1).
df = pd.read_csv(
    '/content/drive/MyDrive/Tweets.csv',
    encoding="ISO-8859-1",
)

# Features are the tweet text; the target is the sentiment label column.
X, y = df['text'], df['airline_sentiment']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the TF-IDF vocabulary (capped at 5000 terms) on the training text only,
# then reuse that fitted vocabulary to encode the held-out text.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Logistic regression with the library's default settings.
# fit() returns the estimator itself, so construction and training are chained.
logreg_model = LogisticRegression().fit(X_train_tfidf, y_train)

# Predict sentiment labels for the held-out tweets.
y_pred = logreg_model.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Each heading is followed by its report on the next line.
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")


Accuracy: 0.81
Classification Report:
              precision    recall  f1-score   support

    negative       0.84      0.94      0.88      1889
     neutral       0.67      0.53      0.59       580
    positive       0.80      0.62      0.70       459

    accuracy                           0.81      2928
   macro avg       0.77      0.70      0.72      2928
weighted avg       0.80      0.81      0.80      2928

Confusion Matrix:
[[1767   92   30]
 [ 231  306   43]
 [ 116   57  286]]
