In [38]:
import pandas as pd
import numpy as np
import gensim.downloader as api
import nltk
import re
import string
import contractions
import emoji 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from nltk.stem import WordNetLemmatizer
nltk.download('punkt_tab', download_dir='./nltk_data')
nltk.download('stopwords', download_dir='./nltk_data')
nltk.download('wordnet')
nltk.download('omw-1.4')

nltk.data.path.append('./nltk_data')

[nltk_data] Downloading package punkt_tab to ./nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to ./nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/ab/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/ab/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [39]:
df= pd.read_csv("twitter.csv")
print(df)

                 tweet_id airline_sentiment  airline_sentiment_confidence  \
0      570306133677760513           neutral                        1.0000   
1      570301130888122368          positive                        0.3486   
2      570301083672813571           neutral                        0.6837   
3      570301031407624196          negative                        1.0000   
4      570300817074462722          negative                        1.0000   
...                   ...               ...                           ...   
14635  569587686496825344          positive                        0.3487   
14636  569587371693355008          negative                        1.0000   
14637  569587242672398336           neutral                        1.0000   
14638  569587188687634433          negative                        1.0000   
14639  569587140490866689           neutral                        0.6771   

               negativereason  negativereason_confidence         airline  \

In [40]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
 tokens = word_tokenize(text.lower())  
 filtered = [word for word in tokens if word.isalpha() and word not in stop_words]
 return filtered



In [41]:
def preprocess_tweet(text):
    text = text.lower()
    
    text = contractions.fix(text)

    text = re.sub(r"http\S+|www\S+|https\S+", '', text)  
    text = re.sub(r"@\w+", '', text)  
    text = re.sub(r"#", '', text) 
    text = emoji.replace_emoji(text, replace='')  

    text = text.translate(str.maketrans('', '', string.punctuation))

    tokens = word_tokenize(text)

    cleaned = [
        lemmatizer.lemmatize(word)
        for word in tokens
        if word.isalpha() and word not in stop_words
    ]

    return cleaned

In [42]:
df["tokens"] = df["text"].apply(preprocess_tweet)
print(df["tokens"])


0                                                   [said]
1             [plus, added, commercial, experience, tacky]
2           [today, must, mean, need, take, another, trip]
3        [really, aggressive, blast, obnoxious, enterta...
4                                [really, big, bad, thing]
                               ...                        
14635             [thank, got, different, flight, chicago]
14637                   [please, bring, american, airline]
14638    [money, change, flight, answer, phone, suggest...
14639    [people, need, know, many, seat, next, flight,...
Name: tokens, Length: 14640, dtype: object


In [45]:
w2v_model = api.load("word2vec-google-news-300")

In [46]:
def text_to_vector(tokens, model, dim=300):
    vectors = [model[word] for word in tokens if word in model]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(dim)


In [52]:
df["tokens"] = df["text"].apply(preprocess_tweet)
df["vector"] = df["tokens"].apply(lambda x: text_to_vector(x, w2v_model))

df = df[df["vector"].apply(lambda x: x.any())]


X = np.vstack(df["vector"].values)
y = df["airline_sentiment"].values 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

clf = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
clf.fit(X_train, y_train)


y_pred = clf.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_pred))



Test Accuracy: 0.768255056564964


In [49]:
def predict_tweet_sentiment(model, w2v_model, tweet):
    tokens = preprocess_tweet(tweet)
    vector = text_to_vector(tokens, w2v_model).reshape(1, -1)
    return model.predict(vector)[0]
test_tweet = "I'm not sure how I feel about this service."
print("Predicted Sentiment:", predict_tweet_sentiment(clf, w2v_model, test_tweet))
test_tweet_2= "best flight i ever took"
print("Predicted Sentiment:", predict_tweet_sentiment(clf, w2v_model, test_tweet_2))


Predicted Sentiment: negative
Predicted Sentiment: positive
