# **SENTIMENT ANALYSIS**


##### **BUSINESS PROBLEM**

In [26]:
import pandas as pd
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('stopwords', download_dir='C:\\Users\\marcu\\AppData\\Roaming\\nltk_data')
nltk.download('punkt', download_dir='C:\\Users\\marcu\\AppData\\Roaming\\nltk_data')

df = pd.read_csv("Tweets.csv")

# Keep only relevant columns
df = df[['airline_sentiment', 'text']]

# Load stopwords once
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def clean_text(text):
    try:
        text = str(text).lower()  # Make sure it's a string
        text = re.sub(r"http\S+|www\S+|https\S+", '', text)
        text = re.sub(r'\@\w+|\#','', text)
        text = re.sub(r'[^\w\s]', '', text)
        tokens = text.split()  # Use split instead of nltk tokenize
        tokens = [word for word in tokens if word not in stop_words]
        tokens = [stemmer.stem(word) for word in tokens]
        return ' '.join(tokens)
    except Exception as e:
        print(f"Error cleaning text: {text} -> {e}")
        return ""

try:
    df['clean_text'] = df['text'].apply(clean_text)
except Exception as e:
    print("Error:", e)

# Encode labels
label_encoder = LabelEncoder()
df['sentiment_encoded'] = label_encoder.fit_transform(df['airline_sentiment'])

# Vectorize text using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
# Added bi-grams to help the model understand context better
# A bi-gram is a sequence of two adjacent elements from a string of tokens, which can help capture context better than unigrams alone.
X = vectorizer.fit_transform(df['clean_text']).toarray()

# Define features and labels
y = df['sentiment_encoded']

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)




[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\marcu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\marcu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


##### **LOGISTIC REGRESSION**


In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Initialize and train Logistic Regression model
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)

# Predictions
y_pred_logreg = logreg.predict(X_test)

# Evaluation
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_logreg))
print("Logistic Regression Classification Report:\n", classification_report(y_test, y_pred_logreg))


Logistic Regression Accuracy: 0.7978142076502732
Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.94      0.88      1889
           1       0.65      0.49      0.56       580
           2       0.83      0.61      0.70       459

    accuracy                           0.80      2928
   macro avg       0.77      0.68      0.71      2928
weighted avg       0.79      0.80      0.79      2928



##### **RANDOM FOREST**

In [28]:
# from sklearn.ensemble import RandomForestClassifier

# # Initialize and train Random Forest model
# rf = RandomForestClassifier(n_estimators=100, random_state=42)
# rf.fit(X_train, y_train)

# # Predictions
# y_pred_rf = rf.predict(X_test)

# # Evaluation
# print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
# print("Random Forest Classification Report:\n", classification_report(y_test, y_pred_rf))


##### **FINE-TUNING THE MODEL TO INCREASE PERFORMANCE**

##### **SAVING THE MODEL FILE**

In [None]:
best_model = LogisticRegression()
best_model.fit(X_train, y_train)

# Save the model and vectorizer
import pickle

with open('model.pkl', 'wb') as f:
    pickle.dump(best_model, f)

with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)