<a href="https://colab.research.google.com/github/Nadeesha-D-Shalom/AI-ML-Engineer-Task-1-Task-2/blob/main/AI_ML_Engineer_Task_2_Airline_sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:
import pandas as pd

In [22]:
# Load the raw tweets dataset; the path is relative to the notebook's working directory
TWEETS_CSV = "Tweets.csv"
df = pd.read_csv(TWEETS_CSV)

In [23]:
# Keep only the sentiment label and the raw tweet text.
# The explicit .copy() makes df an independent frame rather than a selection
# taken from the loaded data, so assigning new columns to it later is
# unambiguous and cannot trigger pandas' SettingWithCopyWarning.
df = df[["airline_sentiment", "text"]].copy()

Display basic info


In [24]:
# Show the (rows, columns) tuple followed by the first five rows, one per line block
print(df.shape, df.head(), sep="\n")

(14640, 2)
  airline_sentiment                                               text
0           neutral                @VirginAmerica What @dhepburn said.
1          positive  @VirginAmerica plus you've added commercials t...
2           neutral  @VirginAmerica I didn't today... Must mean I n...
3          negative  @VirginAmerica it's really aggressive to blast...
4          negative  @VirginAmerica and it's a really big bad thing...


# Preprocess Text (Cleaning & Stemming)

In [25]:
import nltk
import string
import re

from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

# Fetch the NLTK resources this notebook relies on (stopword list and punkt tokenizer data)
for resource_name in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource_name)

# One stemmer instance, reused for every token in clean_text()
ps = PorterStemmer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [26]:
# Lazily-populated cache of the English stopword set; filled on first use.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, building it only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # stopwords.words() hands back a list; converting once gives O(1)
        # membership tests and avoids re-fetching the list for every token.
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def clean_text(text):
    """Normalise a single tweet for TF-IDF vectorisation.

    Steps, in order: lowercase, strip http/https URLs, tokenize with
    ``nltk.word_tokenize``, drop English stopwords, Porter-stem each
    remaining token, and join the stems with single spaces.

    Punctuation tokens and '@' mentions are kept as produced by the tokenizer.

    Parameters
    ----------
    text : str
        Raw tweet text. Any non-string value (e.g. NaN from a missing cell)
        is treated as an empty tweet.

    Returns
    -------
    str
        Space-separated stemmed tokens; ``""`` for non-string input.
    """
    # Missing cells arrive from pandas as float NaN / None; .lower() would raise on them.
    if not isinstance(text, str):
        return ""

    # Convert to lowercase
    text = text.lower()

    # Remove URLs (http:// or https://) together with one trailing whitespace character
    text = re.sub(r'https?://\S+\s?', '', text)

    # Tokenize
    tokens = nltk.word_tokenize(text)

    # Remove stopwords, then stem what is left
    stop_words = _english_stopwords()
    return " ".join(ps.stem(token) for token in tokens if token not in stop_words)


In [27]:
# Run the cleaning function over every raw tweet and store the result in a new column
raw_tweets = df["text"]
df["text_cleaned"] = raw_tweets.apply(clean_text)

In [28]:
# Preview the first five rows, including the new cleaned-text column
df.head(n=5)

Unnamed: 0,airline_sentiment,text,text_cleaned
0,neutral,@VirginAmerica What @dhepburn said.,@ virginamerica @ dhepburn said .
1,positive,@VirginAmerica plus you've added commercials t...,@ virginamerica plu 've ad commerci experi ......
2,neutral,@VirginAmerica I didn't today... Must mean I n...,@ virginamerica n't today ... must mean need t...
3,negative,@VirginAmerica it's really aggressive to blast...,@ virginamerica 's realli aggress blast obnoxi...
4,negative,@VirginAmerica and it's a really big bad thing...,@ virginamerica 's realli big bad thing


# Feature Extraction using TF-IDF

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

In [30]:
# TF-IDF vectorizer limited to a 3000-term vocabulary
tfidf = TfidfVectorizer(max_features=3000)

# Fit on the cleaned tweets and materialise the result as a dense array.
# NOTE(review): the vectorizer is fitted on all rows before the train/test
# split that follows, so vocabulary/IDF statistics also see the test tweets —
# consider fitting on the training portion only.
tfidf_matrix = tfidf.fit_transform(df["text_cleaned"])
X = tfidf_matrix.toarray()

# Sentiment labels as an array
Y = df["airline_sentiment"].values

# Sanity-check that features and labels have matching row counts
print(f"X shape: {X.shape}")
print(f"Y shape: {Y.shape}")


X shape: (14640, 3000)
Y shape: (14640,)


In [31]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [32]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
split_parts = train_test_split(X, Y, test_size=0.2, random_state=2)
X_train, X_test, y_train, y_test = split_parts

In [33]:
# Train a Multinomial Naive Bayes classifier on the training split
nb_model = MultinomialNB().fit(X_train, y_train)

# Predict the held-out tweets and measure the fraction labelled correctly
nb_pred = nb_model.predict(X_test)
nb_accuracy = accuracy_score(y_true=y_test, y_pred=nb_pred)

In [34]:
# Report the Naive Bayes test accuracy
print(f"Naive Bayes Accuracy: {nb_accuracy}")

Naive Bayes Accuracy: 0.7213114754098361


In [35]:
# Random Forest Model
# A fixed random_state makes the bootstrap sampling and feature selection
# repeatable, so the reported accuracy is the same on every Restart & Run All.
# The seed value matches the one used for the train/test split.
rf_model = RandomForestClassifier(random_state=2)
rf_model.fit(X_train, y_train)

# Predict the held-out tweets and measure the fraction labelled correctly
rf_pred = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_pred)

In [36]:
# Report the Random Forest test accuracy
print(f"Random Forest Accuracy: {rf_accuracy}")

Random Forest Accuracy: 0.7479508196721312


Print Accuracy

In [37]:
# Side-by-side summary of both models' test accuracy
for model_name, model_accuracy in (
    ("Naive Bayes", nb_accuracy),
    ("Random Forest", rf_accuracy),
):
    print(f"{model_name} Accuracy:", model_accuracy)

Naive Bayes Accuracy: 0.7213114754098361
Random Forest Accuracy: 0.7479508196721312
