<img src="images/aaib.PNG" style="width:400px;height:250px;">

# In this notebook:

1. **Sentiment analysis** using bag of words
\
&nbsp;
2. **Topic modelling** using TF-IDF vectors

# Setup

In [1]:
# Commonly used libraries
import numpy as np
import pandas as pd

# From ScikitLearn
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.cluster import KMeans

from sklearn.feature_extraction.text import CountVectorizer


# Sentiment analysis

Dataset: [Twitter US Airline Sentiment](https://www.kaggle.com/datasets/crowdflower/twitter-airline-sentiment)

Each tweet is labelled with one of three sentiments: negative, neutral, or positive.

In [4]:
# Read the airline tweets from the local CSV file and preview the first ten rows
data = pd.read_csv('datasets/Tweets.csv')
data.head(n=10)

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)
5,570300767074181121,negative,1.0,Can't Tell,0.6842,Virgin America,,jnardino,,0,@VirginAmerica seriously would pay $30 a fligh...,,2015-02-24 11:14:33 -0800,,Pacific Time (US & Canada)
6,570300616901320704,positive,0.6745,,0.0,Virgin America,,cjmcginnis,,0,"@VirginAmerica yes, nearly every time I fly VX...",,2015-02-24 11:13:57 -0800,San Francisco CA,Pacific Time (US & Canada)
7,570300248553349120,neutral,0.634,,,Virgin America,,pilot,,0,@VirginAmerica Really missed a prime opportuni...,,2015-02-24 11:12:29 -0800,Los Angeles,Pacific Time (US & Canada)
8,570299953286942721,positive,0.6559,,,Virgin America,,dhepburn,,0,"@virginamerica Well, I didn't…but NOW I DO! :-D",,2015-02-24 11:11:19 -0800,San Diego,Pacific Time (US & Canada)
9,570295459631263746,positive,1.0,,,Virgin America,,YupitsTate,,0,"@VirginAmerica it was amazing, and arrived an ...",,2015-02-24 10:53:27 -0800,Los Angeles,Eastern Time (US & Canada)


In [6]:
# Keep only the tweet ID, the sentiment label and the tweet text;
# every other column is discarded.
data = data.loc[:, ['tweet_id', 'airline_sentiment', 'text']]
data.head(n=10)

Unnamed: 0,tweet_id,airline_sentiment,text
0,570306133677760513,neutral,@VirginAmerica What @dhepburn said.
1,570301130888122368,positive,@VirginAmerica plus you've added commercials t...
2,570301083672813571,neutral,@VirginAmerica I didn't today... Must mean I n...
3,570301031407624196,negative,@VirginAmerica it's really aggressive to blast...
4,570300817074462722,negative,@VirginAmerica and it's a really big bad thing...
5,570300767074181121,negative,@VirginAmerica seriously would pay $30 a fligh...
6,570300616901320704,positive,"@VirginAmerica yes, nearly every time I fly VX..."
7,570300248553349120,neutral,@VirginAmerica Really missed a prime opportuni...
8,570299953286942721,positive,"@virginamerica Well, I didn't…but NOW I DO! :-D"
9,570295459631263746,positive,"@VirginAmerica it was amazing, and arrived an ..."


In [11]:

# Bag-of-words vectorizer: turns each text into a vector of token counts,
# with one dimension per distinct token in the learned vocabulary.
vectorizer = CountVectorizer()

# Learn the vocabulary and build the document-term matrix in a single pass
# (same result as fit() followed by transform() on the same texts).
# NOTE: the vocabulary is learned from every tweet, including those that
# later end up in the test split.
X = vectorizer.fit_transform(data.text)

# Look the vocabulary up once and reuse it for both messages below
vocabulary = vectorizer.get_feature_names_out()

# How many words do we have in our corpus?
print("Dimensions of the vectors: " + str(vocabulary.size))

# Show an example word (index 2000 is an arbitrary, fixed pick)
print("\nDimension number 2000 represents the word: " + vocabulary[2000])

# Shape of the BoW matrix; the second entry equals the vocabulary size
print("\n" + str(X.shape))

Dimensions of the vectors: 11724

Dimension number 2000 represents the word: battling

(9155, 11724)


In [12]:
# Hold out 20% of the tweets for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    data.airline_sentiment,
    test_size=0.2,
    random_state=0,
)

# Fit an RBF-kernel SVM on the bag-of-words vectors.
# max_iter=1000 caps the solver iterations, so training may stop before
# it has fully converged.
clf_svm = svm.SVC(kernel="rbf", gamma="scale", max_iter=1000, random_state=0)
clf_svm.fit(X_train, y_train)



In [20]:
# Predict a sentiment label for every tweet in the test set
y_pred = clf_svm.predict(X_test)

# Score the predictions; both results are kept so they can be compared
# with other models later in the notebook.
bow_confusion_matrix = confusion_matrix(y_test, y_pred)
bow_accuracy = accuracy_score(y_test, y_pred)

# Report the confusion matrix, preceded by the class labels as a legend
# (presumably in the same order as the matrix rows/columns -- verify).
print("Confusion matrix")
print(clf_svm.classes_)
print(bow_confusion_matrix)

# Report the fraction of test tweets that were classified correctly
print("\nAccuracy")
print(bow_accuracy)

Confusion matrix
['negative' 'neutral' 'positive']
[[826 172  19]
 [104 336  28]
 [ 71  90 185]]

Accuracy
0.7356635718186784


# Topic Modelling

We use the [20 newsgroups text dataset](https://scikit-learn.org/stable/datasets/real_world.html#the-20-newsgroups-text-dataset), a standard dataset provided by scikit-learn that consists of 18,846 texts from 20 topics. Only the training subset is loaded below.

In [None]:
# Load only the training portion of the 20 newsgroups corpus
data_train = fetch_20newsgroups(subset="train")

In [None]:
# List the names of all topics (newsgroups) in the dataset
print("Topics: " + str(data_train.target_names))

# Show one example document together with its topic name.
# The topic labels are displayed for illustration only: the clustering
# approach used below is unsupervised and does not need them.
example_idx = 200
example_topic = data_train.target_names[data_train.target[example_idx]]

print("\nData example:")
print(example_topic)
print(data_train.data[example_idx])

In [None]:
# Turn every newsgroup text into a TF-IDF vector.
# NOTE: this rebinds `vectorizer` and `X`, which the sentiment-analysis
# section above also uses for its bag-of-words model.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(raw_documents=data_train.data)

In [None]:
# Which terms have the highest TF-IDF weight in document 200?
feature_array = np.array(vectorizer.get_feature_names_out())

# Term indices of document 200, ordered from highest to lowest weight
# (argsort sorts ascending, hence the reversal).
tfidf_sorting = np.argsort(X[200].toarray()).ravel()[::-1]

# Keep the n best-ranked indices and map them back to their terms
n = 5
top_n = feature_array[tfidf_sorting[:n]]
print(top_n)


In [None]:
# Group the TF-IDF vectors into six clusters with k-means.
# The fixed random_state keeps the clustering reproducible between runs.
kmeans = KMeans(
    n_clusters=6,
    n_init="auto",
    random_state=0,
).fit(X)


# TF-IDF for sentiment analysis (Task 2)

In [13]:
# Reload the airline tweets, keeping only the ID, sentiment label and text
tweet_columns = ['tweet_id', 'airline_sentiment', 'text']
data = pd.read_csv('datasets/Tweets.csv')[tweet_columns]

In [14]:
# Turn every tweet into a TF-IDF vector.
# fit_transform learns the vocabulary and IDF weights and vectorizes the
# tweets in one pass (same result as fit() followed by transform()).
# The fitted `tfidf_vectorizer` is kept so new sentences can be vectorized
# with the same vocabulary later on.
# NOTE: the vectorizer is fitted on all tweets before the train/test split,
# so text from the test set influences the vocabulary and IDF weights.
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(data.text)

In [15]:
# Hold out 20% of the TF-IDF vectors for testing, using the same test size
# and random_state as the bag-of-words experiment.
# NOTE: this rebinds `y_train` and `y_test`.
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(
    X_tfidf,
    data.airline_sentiment,
    test_size=0.2,
    random_state=0,
)

In [16]:
# Fit an SVM on the TF-IDF vectors, with the same hyper-parameters as the
# bag-of-words model so that the two are directly comparable.
clf_svm_tfidf = svm.SVC(kernel="rbf", gamma="scale", max_iter=1000, random_state=0)
clf_svm_tfidf.fit(X_train_tfidf, y_train)



In [38]:
# Try the TF-IDF model on a single hand-written sentence
specific_sentence = "meh"

# Vectorize it with the already-fitted vectorizer; transform() expects a
# collection of texts, so the sentence is wrapped in a list.
specific_sentence_tfidf = tfidf_vectorizer.transform([specific_sentence])

# predict() returns one label per input text; there is exactly one here
predicted_sentiment = clf_svm_tfidf.predict(specific_sentence_tfidf)
print("The predicted sentiment of the sentence is:", predicted_sentiment[0])


The predicted sentiment of the sentence is: neutral


In [17]:
# Predict the sentiment of the test tweets with the TF-IDF model
y_pred_tfidf = clf_svm_tfidf.predict(X_test_tfidf)

# Fraction of test tweets whose predicted label matches the true label
accuracy_tfidf = accuracy_score(y_true=y_test, y_pred=y_pred_tfidf)

In [18]:
# Report accuracy and confusion matrix of the TF-IDF model on the test set
tfidf_confusion_matrix = confusion_matrix(y_test, y_pred_tfidf)

print("TF-IDF Accuracy:", accuracy_tfidf)
print("TF-IDF Confusion Matrix:\n", tfidf_confusion_matrix)

TF-IDF Accuracy: 0.764609503003823
TF-IDF Confusion Matrix:
 [[906  95  16]
 [139 302  27]
 [ 93  61 192]]


In [21]:
# Print the bag-of-words results again for comparison with TF-IDF.
# Relies on `bow_accuracy` and `bow_confusion_matrix` from the bag-of-words
# evaluation cell, which must have been run first.
print(f"Bag of Words Accuracy: {bow_accuracy}")
print(f"Bag of Words Confusion Matrix:  {bow_confusion_matrix}")

Bag of Words Accuracy: 0.7356635718186784
Bag of Words Confusion Matrix:  [[826 172  19]
 [104 336  28]
 [ 71  90 185]]
