In [2]:
# Import libraries
import os
import re

import pandas as pd

In [3]:
# Import the dataset
# The CSV location can be overridden with the AIRLINE_SENTIMENT_CSV environment
# variable, so the notebook also runs on machines that do not have the original
# author's desktop folder. When the variable is unset the original path is used.
path = os.environ.get(
    'AIRLINE_SENTIMENT_CSV',
    r'C:\Users\ioann\Desktop\NLP Project\airline_sentiment.csv',
)
# The file is read as ISO-8859-1 (Latin-1) rather than pandas' default UTF-8.
df_full = pd.read_csv(path, encoding='ISO-8859-1')

In [4]:
# Keep only the columns of interest and rename them. Remove words preceded by @
# (i.e. @mentions such as airline handles).
# rename() returns a new DataFrame, so the assignment to 'tweet' below modifies
# df only and does not write into a slice of df_full (no SettingWithCopyWarning).
df = df_full[['text', 'airline_sentiment']].rename(
    columns={'text': 'tweet', 'airline_sentiment': 'sentiment'}
)
# Vectorised regex replace; unlike apply + re.sub it passes missing values
# through instead of raising, so the null check later in the notebook can run.
df['tweet'] = df['tweet'].str.replace(r'@\w+', '', regex=True)
df

Unnamed: 0,tweet,sentiment
0,What said.,neutral
1,plus you've added commercials to the experien...,positive
2,I didn't today... Must mean I need to take an...,neutral
3,"it's really aggressive to blast obnoxious ""en...",negative
4,and it's a really big bad thing about it,negative
...,...,...
14635,thank you we got on a different flight to Chi...,positive
14636,leaving over 20 minutes Late Flight. No warni...,negative
14637,Please bring American Airlines to #BlackBerry10,neutral
14638,"you have my money, you change my flight, and ...",negative


In [5]:
# 'negative' is the majority category
# Count how many tweets fall into each sentiment class to see how balanced
# the labels are; the classes are clearly imbalanced towards 'negative'.
df['sentiment'].value_counts()

negative    9178
neutral     3099
positive    2363
Name: sentiment, dtype: int64

In [6]:
# Check for missing values: for each column, print how many entries are
# null (True) and how many are present (False).
for column in ('tweet', 'sentiment'):
    print(df[column].isnull().value_counts())

False    14640
Name: tweet, dtype: int64
False    14640
Name: sentiment, dtype: int64


In [7]:
# Encode the y variable
from sklearn.preprocessing import LabelEncoder

# Replace each sentiment string with an integer code. The fitted encoder stays
# in `le`, so le.classes_ lists the original labels in code order and
# le.inverse_transform can map predictions back to label names.
le = LabelEncoder()
df['sentiment'] = le.fit_transform(df['sentiment'])

In [10]:
# Clean the text
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Build the stemmer and the stopword collection once, not once per tweet.
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
# Keep 'not' in the text so negations ("not good") survive stopword removal.
all_stopwords.remove('not')
# A set gives constant-time membership tests and is created a single time
# instead of being rebuilt for every word.
stopword_set = set(all_stopwords)


def clean_tweet(text):
    """Return the normalised form of one tweet.

    Steps: replace every non-letter character with a space, lower-case,
    split on whitespace, drop stopwords (except 'not'), Porter-stem the
    remaining words and join them back with single spaces.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        Space-separated stemmed tokens; empty string if nothing remains.
    """
    words = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    return ' '.join(ps.stem(word) for word in words if word not in stopword_set)


# Iterate over the column itself rather than a hard-coded row count (14640),
# so the cell keeps working if rows are added/removed or the index is not 0..n-1.
corpus = [clean_tweet(text) for text in df['tweet']]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ioann\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
# Create the bag of words model
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
# Learn the vocabulary from the cleaned corpus and count token occurrences per
# document, then convert the result to a dense array.
document_term_counts = cv.fit_transform(corpus)
X = document_term_counts.toarray()
# Targets: the values of the last column of df.
y = df.iloc[:, -1].values

In [None]:
# Print the text representation of the CountVectorizer instance `cv`.
print(cv)

In [12]:
# Split the dataset into the training and test set
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [13]:
# Train the Naive Bayes model on the training set
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with default settings, fitted on the training
# word-count features and their encoded sentiment labels.
classifier = MultinomialNB()
classifier.fit(X=X_train, y=y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [14]:
# Make the predictions
# Predict a class for every row of the held-out test features; y_pred is
# compared against y_test in the evaluation cell.
y_pred = classifier.predict(X_test)

In [29]:
# Create the confusion matrix
from sklearn.metrics import confusion_matrix, accuracy_score

# Rows are the true classes, columns are the predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
# Last expression of the cell, so the notebook displays it: the overall
# fraction of test tweets classified correctly.
accuracy_score(y_true=y_test, y_pred=y_pred)

[[1744   92   34]
 [ 327  242   45]
 [ 151   43  250]]


0.7636612021857924

In [21]:
# Classify a single new piece of text with the trained model.
# The text goes through the same cleaning steps as the training tweets:
# letters only, lower-case, stopwords removed (except 'not'), Porter stemming.
new_review = 'I hate this restaurant so much'
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')
# Build the lookup set once instead of once per word inside the comprehension.
stopword_set = set(all_stopwords)
words = re.sub('[^a-zA-Z]', ' ', new_review).lower().split()
new_review = ' '.join(ps.stem(word) for word in words if word not in stopword_set)
new_corpus = [new_review]
# transform (not fit_transform): reuse the vocabulary already fitted in `cv`.
new_X_test = cv.transform(new_corpus).toarray()
new_y_pred = classifier.predict(new_X_test)
# Prints the encoded class label(s) predicted for the new text.
print(new_y_pred)

[0]


In [16]:
# Classify a single new piece of text with the trained model.
# The text goes through the same cleaning steps as the training tweets:
# letters only, lower-case, stopwords removed (except 'not'), Porter stemming.
new_review = 'I love this restaurant so much'
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')
# Build the lookup set once instead of once per word inside the comprehension.
stopword_set = set(all_stopwords)
words = re.sub('[^a-zA-Z]', ' ', new_review).lower().split()
new_review = ' '.join(ps.stem(word) for word in words if word not in stopword_set)
new_corpus = [new_review]
# transform (not fit_transform): reuse the vocabulary already fitted in `cv`.
new_X_test = cv.transform(new_corpus).toarray()
new_y_pred = classifier.predict(new_X_test)
# Prints the encoded class label(s) predicted for the new text.
print(new_y_pred)

[2]
