In [1]:
import pandas as pd

# Load dataset
df = pd.read_csv("Tweets.csv")

# Encode sentiments
df['airline_sentiment'].replace({"positive": 1, "negative": -1, "neutral": 0}, inplace=True)

# Check encoding
df['airline_sentiment'].replace({1: "positive", -1: "negative", 0: "neutral"}, inplace=True)

# Import NLTK
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('stopwords')

# Clean text
import re
cleaned = []
for text in df['text']:  
    cleaned_text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()  # Remove extra spaces
    cleaned.append(cleaned_text)

# Tokenization
tokens = [word_tokenize(x) for x in cleaned]

# Remove stopwords
stop = set(stopwords.words('english'))
stpktn = []
for k in range(len(df['text'])):  
    p = [i for i in tokens[k] if i not in stop]  
    stpktn.append(p)

# Stemming
from nltk.stem import PorterStemmer
ps = PorterStemmer()
stemed_data = []
for Text in stpktn:
    stm = [ps.stem(word) for word in Text]
    stemed_data.append(stm)

# Convert stemmed data into strings
stm = [" ".join(i) for i in stemed_data]

# Feature extraction using CountVectorizer (Incorrect)
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2000)
vc = cv.fit(stm)

# Feature extraction using TF-IDF (Better choice)
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
x = vectorizer.fit_transform(stm).toarray()

# Save vectorizer
import pickle
with open("vectorizer.pickle", "wb") as vcfile:
    pickle.dump(vectorizer, vcfile)

# Target variable
y = df["airline_sentiment"]

# Train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Model training using Naïve Bayes
from sklearn.naive_bayes import MultinomialNB
mb = MultinomialNB()
mb.fit(X_train, y_train)

# Model evaluation
from sklearn.metrics import accuracy_score, classification_report
y_pred = mb.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Save the trained model if accuracy is good
if accuracy_score(y_test, y_pred) > 0.70:
    with open("model.pickle", "wb") as model_file:
        pickle.dump(mb, model_file)
else:
    print("Model needs retraining! Accuracy too low.")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\varun\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\varun\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Accuracy: 0.762636612021858
              precision    recall  f1-score   support

    negative       0.75      0.98      0.85      1889
     neutral       0.71      0.30      0.42       580
    positive       0.89      0.47      0.62       459

    accuracy                           0.76      2928
   macro avg       0.79      0.58      0.63      2928
weighted avg       0.77      0.76      0.73      2928

