In [7]:
import nltk
import pandas as pd
import random
from nltk.corpus import movie_reviews

nltk.download('movie_reviews')
nltk.download('punkt')
nltk.download('stopwords')

documents = []

for category in movie_reviews.categories():
    fileids = movie_reviews.fileids(category)
    for fileid in fileids:
        review_text = movie_reviews.raw(fileid)
        documents.append((review_text, category))

random.shuffle(documents)


df = pd.DataFrame(documents, columns=['review_text', 'label'])
print(df.sample(5),"\n")



from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))
def preprocess(text):
    tokens = word_tokenize(text.lower())
    cleaned = [word for word in tokens if word.isalpha() and word not in stop_words]
    return ' '.join(cleaned)


cleaned_review = preprocess(df['review_text'][0])
print(cleaned_review[:500],"\n")





[nltk_data] Downloading package movie_reviews to
[nltk_data]     /home/itachi_uchiha/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/itachi_uchiha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/itachi_uchiha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


('plot : two teen couples go to a church party , drink and then drive . \nthey get into an accident . \none of the guys dies , but his girlfriend continues to see him in her life , and has nightmares . \nwhat\'s the deal ? \nwatch the movie and " sorta " find out . . . \ncritique : a mind-fuck movie for the teen generation that touches on a very cool idea , but presents it in a very bad package . \nwhich is what makes this review an even harder one to write , since i generally applaud films which attempt to break the mold , mess with your head and such ( lost highway & memento ) , but there are good and bad ways of making all types of films , and these folks just didn\'t snag this one correctly . \nthey seem to have taken this pretty neat concept , but executed it terribly . \nso what are the problems with the movie ? \nwell , its main problem is that it\'s simply too jumbled . \nit starts off " normal " but then downshifts into this " fantasy " world in which you , as an audience memb

In [10]:
# Bag of Words
from sklearn.feature_extraction.text import CountVectorizer

bow_vectorizer = CountVectorizer(max_features=50)
bow_matrix = bow_vectorizer.fit_transform(df['review_text'])
bow_df = pd.DataFrame(bow_matrix.toarray(), columns=bow_vectorizer.get_feature_names_out())

print(bow_df.head())

# Top 10 most frequent words
print(bow_df.sum().sort_values(ascending=False).head(10),"\n")



   about  all  an  and  are  as  at  be  but  by  ...  this  to  up  was  \
0      1    0   6   24    4  11   1   3    2   4  ...     0  14   2    4   
1      1    3   7   25    3   5   3   1    6   5  ...     1  17   4    4   
2      1    0   2    9    3   2   2   0    5   1  ...     7  11   0    7   
3      3    1   4   18    1   6   0   3    4   2  ...     1  11   3    0   
4      2    2   4   21    0   7   1   2    6   1  ...     5  14   0   10   

   what  when  which  who  with  you  
0     1     2      0    3     0    0  
1     2     2      5    5     7    1  
2     1     1      0    0     4    4  
3     3     1      2    3     5    0  
4     1     0      0    3     3    3  

[5 rows x 50 columns]
the     76529
and     35576
of      34123
to      31937
is      25195
in      21822
it      16107
that    15924
as      11378
with    10792
dtype: int64 



In [9]:
# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_features=50)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['review_text'])
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())



# Top 5 TF-IDF words in first review
print(tfidf_df.iloc[0].sort_values(ascending=False).head(5))

the    0.536203
and    0.495204
to     0.289013
as     0.235993
of     0.206335
Name: 0, dtype: float64


In [11]:
print(tfidf_df.head(),"\n",df.head())

      about       all        an       and       are        as        at  \
0  0.027082  0.000000  0.137736  0.495204  0.092009  0.235993  0.023720   
1  0.017031  0.047488  0.101054  0.324393  0.043396  0.067458  0.044750   
2  0.026971  0.000000  0.045724  0.184941  0.068724  0.042732  0.047246   
3  0.088612  0.027453  0.100149  0.405075  0.025088  0.140394  0.000000   
4  0.046543  0.043260  0.078904  0.372338  0.000000  0.129048  0.020383   

         be       but        by  ...      this        to        up       was  \
0  0.069250  0.043274  0.091962  ...  0.000000  0.289013  0.054164  0.106378   
1  0.014516  0.081641  0.072290  ...  0.013655  0.220698  0.068124  0.066897   
2  0.000000  0.107743  0.022896  ...  0.151369  0.226152  0.000000  0.185398   
3  0.075528  0.094395  0.050150  ...  0.023682  0.247670  0.088612  0.000000   
4  0.039671  0.111557  0.019756  ...  0.093290  0.248350  0.000000  0.228526   

       what      when     which       who      with       you  
0  0