In [None]:
# Part 1: Using the TextBlob Sentiment Analyzer

In [64]:
import pandas as pd
df = pd.read_csv("labeledTrainData.tsv", delimiter='\t') 
# delimiter \t tells the compiler i am dealing with tab based values and not commas based; tsv vs csv
print(df)

            id  sentiment                                             review
0       5814_8          1  With all this stuff going down at the moment w...
1       2381_9          1  \The Classic War of the Worlds\" by Timothy Hi...
2       7759_3          0  The film starts with a manager (Nicholas Bell)...
3       3630_4          0  It must be assumed that those who praised this...
4       9495_8          1  Superbly trashy and wondrously unpretentious 8...
...        ...        ...                                                ...
24995   3453_3          0  It seems like more consideration has gone into...
24996   5064_1          0  I don't believe they made this film. Completel...
24997  10905_3          0  Guy is a loser. Can't get girls, needs to buil...
24998  10194_3          0  This 30 minute documentary Buñuel made in the ...
24999   8478_8          1  I saw this movie as a child and it broke my he...

[25000 rows x 3 columns]


In [91]:
# positive and negative reviews
positive_reviews = df[df['sentiment'] == 1].shape[0]
negative_reviews = df[df['sentiment'] == 0].shape[0]

print("Positive reviews:", positive_reviews)
print("Negative reviews:", negative_reviews)

Positive reviews: 12500
Negative reviews: 12500


In [101]:
# function that classify sentiment; uses TextBlob
from textblob import TextBlob
def classify_sentiment(review):
    polarity = TextBlob(review).sentiment.polarity
    if polarity >= 0:
        return 'positive'
    else:
        return 'negative'

# apply sentiment classification to each review
df['predicted_sentiment'] = df['review'].apply(classify_sentiment)

# checks to see if sentiment is added correctly
print(df.head())

       id  sentiment                                             review  \
0  5814_8          1  with all this stuff going down at the moment w...   
1  2381_9          1  \the classic war of the worlds\" by timothy hi...   
2  7759_3          0  the film starts with a manager (nicholas bell)...   
3  3630_4          0  it must be assumed that those who praised this...   
4  9495_8          1  superbly trashy and wondrously unpretentious 8...   

  predicted_sentiment predicted_sentiment_textblob  
0            positive                     positive  
1            positive                     positive  
2            negative                     negative  
3            positive                     positive  
4            negative                     negative  


In [119]:
# calculate accuracy
correct_predictions = (df['sentiment'] == df['predicted_sentiment']).sum()
total_predictions = len(df)
accuracy = correct_predictions / total_predictions

print("Accuracy:", accuracy)

Accuracy: 0.0


In [98]:
print(df[['sentiment', 'predicted_sentiment']].head())
# checks to see which sentiment are posi and neg

   sentiment predicted_sentiment
0          1            positive
1          1            positive
2          0            negative
3          0            positive
4          1            negative


In [68]:
# Part 2: Prepping Text for a Custom Model

In [123]:
# convert text to lowercase
df['review'] = df['review'].str.lower()
df

Unnamed: 0,id,sentiment,review,predicted_sentiment,predicted_sentiment_textblob
0,5814_8,1,with all this stuff going down at the moment w...,positive,positive
1,2381_9,1,"\the classic war of the worlds\"" by timothy hi...",positive,positive
2,7759_3,0,the film starts with a manager (nicholas bell)...,negative,negative
3,3630_4,0,it must be assumed that those who praised this...,positive,positive
4,9495_8,1,superbly trashy and wondrously unpretentious 8...,negative,negative
...,...,...,...,...,...
24995,3453_3,0,it seems like more consideration has gone into...,positive,positive
24996,5064_1,0,i don't believe they made this film. completel...,positive,positive
24997,10905_3,0,"guy is a loser. can't get girls, needs to buil...",positive,positive
24998,10194_3,0,this 30 minute documentary buñuel made in the ...,positive,positive


In [128]:
import re
# import re library 
df['review'] = df['review'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x))
# removes all of the non-alphebetical, non-digit, and whitespace values
df
# displays what it looks like

Unnamed: 0,id,sentiment,review,predicted_sentiment,predicted_sentiment_textblob
0,5814_8,1,with all this stuff going down at the moment w...,positive,positive
1,2381_9,1,the classic war of the worlds by timothy hines...,positive,positive
2,7759_3,0,the film starts with a manager nicholas bell g...,negative,negative
3,3630_4,0,it must be assumed that those who praised this...,positive,positive
4,9495_8,1,superbly trashy and wondrously unpretentious 8...,negative,negative
...,...,...,...,...,...
24995,3453_3,0,it seems like more consideration has gone into...,positive,positive
24996,5064_1,0,i dont believe they made this film completely ...,positive,positive
24997,10905_3,0,guy is a loser cant get girls needs to build u...,positive,positive
24998,10194_3,0,this 30 minute documentary buuel made in the e...,positive,positive


In [132]:
import nltk
nltk.download('punkt')
# download necessary libraries and packets recieves from punkt

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jared\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [133]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# initalize Porter Stemmer
porter = PorterStemmer()

# defines function to apply porter stemmer
def apply_stemming(text):
    # tokenize text
    tokens = word_tokenize(text)
    # aplly stemming to each token
    stemmed_tokens = [porter.stem(token) for token in tokens]
    # join the stemmed tokens back into a string
    stemmed_text = ' '.join(stemmed_tokens)
    return stemmed_text

# apply Porter Stemmer to 'review' column
df['review'] = df['review'].apply(apply_stemming)


In [134]:
print(df.head())

       id  sentiment                                             review  \
0  5814_8          1  with all thi stuff go down at the moment with ...   
1  2381_9          1  the classic war of the world by timothi hine i...   
2  7759_3          0  the film start with a manag nichola bell give ...   
3  3630_4          0  it must be assum that those who prai thi film ...   
4  9495_8          1  superbl trashi and wondrou unpretenti 80 explo...   

  predicted_sentiment predicted_sentiment_textblob  
0            positive                     positive  
1            positive                     positive  
2            negative                     negative  
3            positive                     positive  
4            negative                     negative  


In [135]:
from sklearn.feature_extraction.text import CountVectorizer

# initialize CountVectorizer
vectorizer = CountVectorizer()

# fit transform the stemmed text to create matrix for bag-of-words
bow_matrix = vectorizer.fit_transform(df['review'])

# display the dimensions
print("Dimensions of the bag-of-words matrix:", bow_matrix.shape)


Dimensions of the bag-of-words matrix: (25000, 91347)


In [136]:
from sklearn.feature_extraction.text import TfidfVectorizer

# initialize TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# fit-transform the stemmed text to create the tf-idf matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(df['review'])

# Display the dimensions of the matrix
print("Dimensions of the tf-idf matrix:", tfidf_matrix.shape)


Dimensions of the tf-idf matrix: (25000, 91347)
