In [43]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

### Cleaning

In [2]:
row_data = pd.read_csv('Datasets/1-SentimentAna-movie_review.csv')
row_data.head()

Unnamed: 0,fold_id,cv_tag,html_id,sent_id,text,tag
0,0,cv000,29590,0,films adapted from comic books have had plenty...,pos
1,0,cv000,29590,1,"for starters , it was created by alan moore ( ...",pos
2,0,cv000,29590,2,to say moore and campbell thoroughly researche...,pos
3,0,cv000,29590,3,"the book ( or "" graphic novel , "" if you will ...",pos
4,0,cv000,29590,4,"in other words , don't dismiss this film becau...",pos


In [3]:
# remove unwanted cols
row_data.drop(['fold_id', 'cv_tag', 'html_id', 'sent_id'], axis=1, inplace=True)
row_data.head()

Unnamed: 0,text,tag
0,films adapted from comic books have had plenty...,pos
1,"for starters , it was created by alan moore ( ...",pos
2,to say moore and campbell thoroughly researche...,pos
3,"the book ( or "" graphic novel , "" if you will ...",pos
4,"in other words , don't dismiss this film becau...",pos


In [4]:
# one hot
onehot = CountVectorizer()
Y = onehot.fit_transform(row_data.tag)


In [5]:
Y = Y.toarray()

In [6]:
onehot.get_feature_names_out()

array(['neg', 'pos'], dtype=object)

In [7]:
Y = pd.DataFrame(Y, columns=['Neg', 'Pos'])

In [8]:
Y.head()

Unnamed: 0,Neg,Pos
0,0,1
1,0,1
2,0,1
3,0,1
4,0,1


In [9]:
# merge to Dataframes
data = pd.concat([row_data, Y], axis=1)
data.head()

Unnamed: 0,text,tag,Neg,Pos
0,films adapted from comic books have had plenty...,pos,0,1
1,"for starters , it was created by alan moore ( ...",pos,0,1
2,to say moore and campbell thoroughly researche...,pos,0,1
3,"the book ( or "" graphic novel , "" if you will ...",pos,0,1
4,"in other words , don't dismiss this film becau...",pos,0,1


### Stopword Removing & Stemming

In [10]:
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import SnowballStemmer
import string

In [11]:
stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer("english")
puch = list(string.punctuation)

In [12]:
def stop_words_stemming(text):
    if isinstance(text, str):
        words = word_tokenize(text)
        # Remove stopwords, punctuation, and apply stemming
        review_clean = [stemmer.stem(wd) for wd in words if wd.lower() not in stop_words and wd not in puch]
        return ' '.join(review_clean)
    else:
        return None

In [13]:
data['process_text'] = data.text.apply(stop_words_stemming)

In [14]:
data[['process_text', 'Neg', 'Pos']]

Unnamed: 0,process_text,Neg,Pos
0,film adapt comic book plenti success whether r...,0,1
1,starter creat alan moor eddi campbel brought m...,0,1
2,say moor campbel thorough research subject jac...,0,1
3,book `` graphic novel `` 500 page long includ ...,0,1
4,word n't dismiss film sourc,0,1
...,...,...,...
64715,lack inspir trace back insipid charact,1,0
64716,like mani skit current incarn _saturday_night_...,1,0
64717,watch one `` roxburi `` skit snl come away cha...,1,0
64718,bump unsuspect women 's,1,0


### TF-IDF

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [16]:
tfIDF = TfidfVectorizer()

X = tfIDF.fit_transform(data['process_text'])

X.shape

(64720, 26806)

In [17]:
text_TFIDF = pd.DataFrame(X.toarray(), columns=tfIDF.get_feature_names_out())

text_TFIDF

Unnamed: 0,00,000,0009f,007,00s,03,04,05,05425,10,...,zukovski,zulu,zundel,zurg,zus,zweibel,zwick,zwigoff,zyci,zzzzzzz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64715,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
64716,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
64717,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
64718,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# combain X and Y
# Concatenate TF-IDF features with label columns
final_df = pd.concat([text_TFIDF, data[['Neg', 'Pos']].reset_index(drop=True)], axis=1)

# Display the first few rows of the final DataFrame
final_df

Unnamed: 0,00,000,0009f,007,00s,03,04,05,05425,10,...,zundel,zurg,zus,zweibel,zwick,zwigoff,zyci,zzzzzzz,Neg,Pos
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64715,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0
64716,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0
64717,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0
64718,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0


### Data Prepare and ML Model

In [23]:
from sklearn.model_selection import train_test_split

# Separate features and target variables
X = final_df.drop(columns=['Neg', 'Pos'])  # Features (TF-IDF matrix)
# y = final_df[['Neg', 'Pos']]  # Target variables
y = (final_df['Neg'] * 1) + (final_df['Pos'] * 2)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [25]:
y_test.shape

(12944,)

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Initialize and train the model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)


In [31]:
y_pred


array([2, 1, 2, ..., 2, 2, 1])

In [32]:
# Evaluate the model performance
print(classification_report(y_test, y_pred, target_names=['Negative', 'Positive']))

              precision    recall  f1-score   support

    Negative       0.69      0.67      0.68      6371
    Positive       0.69      0.70      0.70      6573

    accuracy                           0.69     12944
   macro avg       0.69      0.69      0.69     12944
weighted avg       0.69      0.69      0.69     12944



### Test on new Reviews

In [41]:
new_reviews = [
    "The movie was absolutely fantastic and exceeded my expectations!",
    "I found the film quite boring and didn't enjoy it at all.",
    "The acting was brilliant, but the plot was a bit predictable.",
    "An unforgettable experience with stunning visuals and a compelling story.",
    "I was disappointed with the movie; it didn't live up to the hype.",
    "A remarkable film with an incredible soundtrack and superb performances.",
    "The film was okay, but I felt the pacing was a bit slow.",
    "I loved every moment of it; definitely one of the best movies of the year!",
    "The plot was convoluted, and the characters were not well-developed.",
    "A fantastic movie with a great mix of action and emotional depth."
]

# Target variables: 1 for 'Neg', 2 for 'Pos'
target_variable = [2, 1, 2, 2, 1, 2, 1, 2, 1, 2]

In [34]:
processed_reviews = [stop_words_stemming(review) for review in new_reviews]

In [35]:
X_new_review = tfIDF.transform(processed_reviews)

In [39]:
new_pred = model.predict(X_new_review.toarray())



In [40]:
new_pred

array([2, 1, 2, 2, 1, 2, 2, 2, 1, 2])

In [44]:
y_true = np.array(target_variable)

# Print the classification report
print(classification_report(y_true, new_pred, target_names=['Negative', 'Positive']))

              precision    recall  f1-score   support

    Negative       1.00      0.75      0.86         4
    Positive       0.86      1.00      0.92         6

    accuracy                           0.90        10
   macro avg       0.93      0.88      0.89        10
weighted avg       0.91      0.90      0.90        10

