In [25]:
import pandas as pd
import nltk
import string
from textblob import TextBlob
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('vader_lexicon')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mattl\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\mattl\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [26]:
# Read the labeled reviews (tab-separated file) into a DataFrame.
# NOTE(review): file_path is an absolute path on one specific machine, so the
# notebook cannot be re-run elsewhere without editing it.
file_path = 'C:/Users/mattl/OneDrive/Documents/GitHub/DSC550/labeledTrainData.tsv'
df = pd.read_csv(file_path, delimiter='\t')

# Preview the first rows to confirm the columns parsed as expected.
print(df.head())

       id  sentiment                                             review
0  5814_8          1  With all this stuff going down at the moment w...
1  2381_9          1  \The Classic War of the Worlds\" by Timothy Hi...
2  7759_3          0  The film starts with a manager (Nicholas Bell)...
3  3630_4          0  It must be assumed that those who praised this...
4  9495_8          1  Superbly trashy and wondrously unpretentious 8...


In [27]:
#Using Text Blob to get the number of Positive and Negative Reviews
# Score each review with TextBlob exactly once and reuse the scores for both
# filters; previously every review was parsed twice (once per filter).
textblob_polarity = df['review'].apply(lambda review: TextBlob(review).sentiment.polarity)

# A polarity of exactly 0 is counted as positive; only strictly negative
# scores are counted as negative.
positive_reviews = df[textblob_polarity >= 0]
negative_reviews = df[textblob_polarity < 0]

print(f"Number of Positive Reviews: {len(positive_reviews)}")
print(f"Number of Negative Reviews: {len(negative_reviews)}")


Number of Positive Reviews: 19017
Number of Negative Reviews: 5983


In [28]:
# Hold out 20% of the reviews for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['review'],
    df['sentiment'],
    test_size=0.2,
    random_state=42,
)

# Turn a TextBlob polarity score into a sentiment label
def classify_sentiment(text):
    """Return 'positive' if TextBlob's polarity for ``text`` is >= 0, otherwise 'negative'."""
    score = TextBlob(text).sentiment.polarity
    return 'positive' if score >= 0 else 'negative'

# Express the held-out labels as strings so they can be compared with the
# string labels produced by the classifier
y_test = y_test.map({0: 'negative', 1: 'positive'})

# Predict a label for every held-out review with TextBlob
y_pred = X_test.apply(classify_sentiment)

# Share of held-out reviews whose predicted label equals the true label
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy of TextBlob Model: {accuracy:.2f}")
print("This model is more accurate then randomly guessing")

Accuracy of TextBlob Model: 0.69
This model is more accurate then randomly guessing


In [34]:
# Define a function to classify using VADER and convert compound score to sentiment label
# Shared analyzer, created on first use. Building a SentimentIntensityAnalyzer
# loads the VADER lexicon, so doing it once instead of once per review avoids
# repeating that work for every row.
_VADER_ANALYZER = None

def classify_sentiment_vader(text, analyzer=None):
    """Label ``text`` as 'positive' or 'negative' from its VADER compound score.

    Parameters
    ----------
    text : str
        The review text to score.
    analyzer : optional
        Any object exposing ``polarity_scores(text)`` that returns a mapping
        with a ``'compound'`` entry. When omitted, a module-level
        ``SentimentIntensityAnalyzer`` is created on first use and reused.

    Returns
    -------
    str
        'positive' when the compound score is >= 0, otherwise 'negative'.
    """
    global _VADER_ANALYZER
    if analyzer is None:
        if _VADER_ANALYZER is None:
            _VADER_ANALYZER = SentimentIntensityAnalyzer()
        analyzer = _VADER_ANALYZER
    compound = analyzer.polarity_scores(text)['compound']
    return 'positive' if compound >= 0 else 'negative'

# Apply VADER's sentiment analysis to the entire dataset
df['predicted_sentiment_vader'] = df['review'].apply(classify_sentiment_vader)

# True labels for the entire dataset, as strings, for comparison with the
# predicted labels. They get their own name so the held-out `y_test` produced by
# the train/test split is not overwritten with a full-length Series (which
# would leave X_test / y_pred / y_test misaligned for any later use).
y_true_all = df['sentiment'].map({0: 'negative', 1: 'positive'})

# Calculate accuracy for VADER Model
# NOTE(review): this score is computed over every row of df, not over the 20%
# held-out split, so it is not measured on the same rows as a test-set score.
accuracy_vader = accuracy_score(y_true_all, df['predicted_sentiment_vader'])

print(f"Accuracy of the VADER model: {accuracy_vader:.2f}")
print(f"This model is more accurate then randomly guessing.")

Accuracy of the VADER model: 0.69
This model is more accurate then randomly guessing.


In [6]:
#Part 2
# Lowercase every review. This overwrites df['review'] in place, so any cell
# that reads the column after this one sees the lowercased text rather than the
# text as loaded from the file.
df['review'] = df['review'].str.lower()

In [7]:
#Removing Punctuation
# Deletion table built once: maps every character in string.punctuation to None.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

def remove_punctuation(text):
    """Return ``text`` with every character in ``string.punctuation`` removed.

    Characters are deleted, not replaced by a space, so tokens joined only by
    punctuation are merged (``"well-made"`` becomes ``"wellmade"``). Characters
    outside ``string.punctuation`` (for example non-ASCII dashes or quotes) are
    left untouched.

    Parameters
    ----------
    text : str
        The text to clean.

    Returns
    -------
    str
        ``text`` without ASCII punctuation characters.
    """
    # str.translate performs the deletion in a single pass instead of testing
    # each character against the punctuation string in a Python-level loop.
    return text.translate(_PUNCTUATION_TABLE)

# Strip punctuation from every review, replacing the column contents
df['review'] = df['review'].map(remove_punctuation)

In [8]:
#Utilizing Stop Words
# English stop-word list as a set, for fast membership checks.
# NOTE(review): in this notebook punctuation is stripped from the reviews before
# this step, so any stop-word entries that contain an apostrophe may no longer
# match the stripped tokens (e.g. "dont" in a review vs. a listed "don't") —
# confirm whether that is intended.
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Drop whitespace-separated tokens of ``text`` found in ``stop_words``.

    The surviving tokens are rejoined with single spaces.
    """
    kept_tokens = (token for token in text.split() if token not in stop_words)
    return ' '.join(kept_tokens)

# Remove stop words from every review, replacing the column contents
df['review'] = df['review'].map(remove_stopwords)

In [9]:
from nltk.stem import PorterStemmer

#Apply Porter Stemmer
# One stemmer instance shared by every call to apply_stemming
stemmer = PorterStemmer()

def apply_stemming(text):
    """Stem each whitespace-separated token of ``text`` with ``stemmer``.

    The stemmed tokens are rejoined with single spaces.
    """
    return ' '.join(map(stemmer.stem, text.split()))

# Stem every review, replacing the column contents
df['review'] = df['review'].map(apply_stemming)

In [21]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Bag of Words (Count Vectorizer)
# Fit a CountVectorizer (default settings) on the current contents of
# df['review'] and transform the same column in one step.
# NOTE(review): the input is whatever df['review'] holds when this cell runs;
# the result depends on which preprocessing cells were executed beforehand.
count_vectorizer = CountVectorizer()
bow_matrix = count_vectorizer.fit_transform(df['review'])
# Report the shape of the resulting matrix
print(f"Bag of Words Matrix Dimensions: {bow_matrix.shape}")



Bag of Words Matrix Dimensions: (25000, 74849)


In [22]:
# TF-IDF Vectorizer
# Fit a TfidfVectorizer (default settings) on the current contents of
# df['review'] and transform the same column in one step.
# NOTE(review): the input is whatever df['review'] holds when this cell runs;
# the result depends on which preprocessing cells were executed beforehand.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['review'])
# Report the shape of the resulting matrix
print(f"TF-IDF Matrix Dimensions: {tfidf_matrix.shape}")

TF-IDF Matrix Dimensions: (25000, 74849)
