Sentiment Classification: IMDb Mini-Project

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Read the full IMDb review file into a DataFrame and preview the first rows.
raw_csv_path = 'Dataset/IMDB Dataset.csv'
data = pd.read_csv(raw_csv_path)

data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
data.shape

(50000, 2)

In [5]:
# Tally how many reviews carry each sentiment value to check class balance.
sentiment_column = data['sentiment']
sentiment_counts = sentiment_column.value_counts()

print(sentiment_counts)

sentiment
positive    25000
negative    25000
Name: count, dtype: int64


In [6]:
# Draw the same number of random reviews from each class so the reduced
# dataset stays balanced; the fixed seed makes the draw repeatable.
is_positive = data['sentiment'] == 'positive'
is_negative = data['sentiment'] == 'negative'
positive_reviews = data.loc[is_positive].sample(n=2500, random_state=42)
negative_reviews = data.loc[is_negative].sample(n=2500, random_state=42)

# Stack both subsets, shuffle the rows, then renumber the index from 0.
stacked_reviews = pd.concat([positive_reviews, negative_reviews])
shuffled_reviews = stacked_reviews.sample(frac=1, random_state=42)
balanced_data = shuffled_reviews.reset_index(drop=True)


print(balanced_data['sentiment'].value_counts())

sentiment
positive    2500
negative    2500
Name: count, dtype: int64


In [8]:
# Write the balanced subset to disk; the row index is left out of the file.
balanced_csv_path = 'Dataset/imdb_balanced_5000.csv'
balanced_data.to_csv(balanced_csv_path, index=False)

In [9]:
# Reload the balanced subset from disk and preview it.
# NOTE: this rebinds `data`, so the full 50,000-row frame loaded earlier is no
# longer reachable under that name.
balanced_csv_path = 'Dataset/imdb_balanced_5000.csv'
data = pd.read_csv(balanced_csv_path)

data.head()

Unnamed: 0,review,sentiment
0,"Why would this film be so good, but only gross...",positive
1,"The name ""cult movie"" is often given to films ...",negative
2,Another stinker from the PM Entertainment grou...,negative
3,It really isn't hard to understand this movie!...,positive
4,"Make no mistake, Maureen O'Sullivan is easily ...",positive


In [10]:
# Encode labels: positive → 1, negative → 0.
# Guarded so the cell can be re-run after the 'sentiment' column has already
# been dropped (in which case the existing 'label' column is kept as is).
if 'sentiment' in data.columns:
    data['label'] = data['sentiment'].map({'positive': 1, 'negative': 0})

# Series.map() turns every value missing from the mapping into NaN; fail loudly
# here instead of passing missing labels on to the train/test split and model.
assert data['label'].notna().all(), "unexpected value in 'sentiment' column"

In [12]:
# Remove the text label now that the numeric 'label' column exists.
# errors='ignore' keeps the cell idempotent: re-running it after the column is
# already gone is a no-op instead of a KeyError.
data = data.drop(columns=['sentiment'], errors='ignore')

data.head()

Unnamed: 0,review,label
0,"Why would this film be so good, but only gross...",1
1,"The name ""cult movie"" is often given to films ...",0
2,Another stinker from the PM Entertainment grou...,0
3,It really isn't hard to understand this movie!...,1
4,"Make no mistake, Maureen O'Sullivan is easily ...",1


In [13]:
from sklearn.model_selection import train_test_split

# Features are the raw review strings; targets are the numeric labels.
X, y = data['review'], data['label']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

def vectorize_text(input_text):
    """Fit a TF-IDF vectorizer on ``input_text`` and return the encoded corpus.

    English stop words are removed by the vectorizer.

    Parameters
    ----------
    input_text : iterable of str
        Documents used both to fit the vectorizer and to be transformed.

    Returns
    -------
    tuple
        ``(tfidf_matrix, fitted_vectorizer)``: the TF-IDF features for
        ``input_text`` and the vectorizer that produced them, so the same
        fitted vectorizer can be applied to other text.
    """
    tfidf = TfidfVectorizer(stop_words='english')
    return tfidf.fit_transform(input_text), tfidf

In [15]:
# Fit the TF-IDF vectorizer on the training reviews only and encode them.
vectorized_X_train, vectorizer = vectorize_text(X_train)
# The test reviews are only transformed with the already-fitted vectorizer,
# so nothing is learned from the held-out split.
vectorized_X_test = vectorizer.transform(X_test)

In [16]:
# Wrap the sparse TF-IDF matrix in a DataFrame for inspection: rows keep the
# index of the training reviews, columns are the vectorizer's feature names.
df_tfidf_train = pd.DataFrame.sparse.from_spmatrix(
    vectorized_X_train,
    index=X_train.index,
    columns=vectorizer.get_feature_names_out(),
)

print(df_tfidf_train.head())

      00       000       001  007  00am  00o  00s  01  02  05  ...  zurlini  \
4227   0         0         0    0     0    0    0   0   0   0  ...        0   
4676   0         0         0    0     0    0    0   0   0   0  ...        0   
800    0         0         0    0     0    0    0   0   0   0  ...        0   
3671   0  0.064594  0.095827    0     0    0    0   0   0   0  ...        0   
4193   0         0         0    0     0    0    0   0   0   0  ...        0   

      zwrite  zzzzzzzzzzzz  zzzzzzzzzzzzzzzzzz  zázvorková  émigré  était  \
4227       0             0                   0           0       0      0   
4676       0             0                   0           0       0      0   
800        0             0                   0           0       0      0   
3671       0             0                   0           0       0      0   
4193       0             0                   0           0       0      0   

      óli  über  übermensch  
4227    0     0           0  
46

In [17]:
from sklearn.linear_model import LogisticRegression

# Train a logistic-regression classifier with default settings on the TF-IDF features.
model = LogisticRegression()
model.fit(X=vectorized_X_train, y=y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [18]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Predict labels for the held-out reviews and report plain accuracy.
pred = model.predict(vectorized_X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=pred)

print(f'Accuracy: {accuracy:.4f}')

Accuracy: 0.8300


In [19]:
def predict_sentiment(text, model, vectorizer):
    """Print the predicted sentiment of ``text`` with the model's confidence.

    Parameters
    ----------
    text : str
        A single review to classify.
    model : fitted classifier
        Must provide ``predict``, ``predict_proba`` and ``classes_``.
    vectorizer : fitted vectorizer
        Must provide ``transform``; it is applied to ``[text]``.

    Returns
    -------
    None
        The result is printed as ``"<sentiment> (<prob> confidence)"``, where
        the sentiment is "positive" when the predicted label equals 1 and
        "negative" otherwise.
    """
    vec = vectorizer.transform([text])
    pred = model.predict(vec)[0]
    proba = model.predict_proba(vec)[0]
    # Columns of predict_proba follow the order of model.classes_, so locate
    # the predicted label there instead of using the label value itself as a
    # column index (which is only correct when classes_ is exactly [0, 1]).
    class_index = list(model.classes_).index(pred)
    prob = proba[class_index]
    sentiment = "positive" if pred == 1 else "negative"
    print(f"{sentiment} ({prob:.2f} confidence)")

In [20]:
# Try the helper on one short example sentence.
review = "I loved this movie!"
predict_sentiment(text=review, model=model, vectorizer=vectorizer)

positive (0.86 confidence)


In [21]:
import joblib


# Write the fitted model and the fitted vectorizer to disk with joblib.
for artifact, filename in ((model, 'model.pkl'), (vectorizer, 'vectorizer.pkl')):
    joblib.dump(artifact, filename)

print("Model and vectorizer saved!")

Model and vectorizer saved!


In [22]:
!pip install flask

Collecting flask
  Downloading flask-3.1.1-py3-none-any.whl.metadata (3.0 kB)
Collecting blinker>=1.9.0 (from flask)
  Using cached blinker-1.9.0-py3-none-any.whl.metadata (1.6 kB)
Collecting click>=8.1.3 (from flask)
  Using cached click-8.2.1-py3-none-any.whl.metadata (2.5 kB)
Collecting itsdangerous>=2.2.0 (from flask)
  Downloading itsdangerous-2.2.0-py3-none-any.whl.metadata (1.9 kB)
Collecting jinja2>=3.1.2 (from flask)
  Using cached jinja2-3.1.6-py3-none-any.whl.metadata (2.9 kB)
Collecting markupsafe>=2.1.1 (from flask)
  Using cached MarkupSafe-3.0.2-cp312-cp312-win_amd64.whl.metadata (4.1 kB)
Collecting werkzeug>=3.1.0 (from flask)
  Using cached werkzeug-3.1.3-py3-none-any.whl.metadata (3.7 kB)
Downloading flask-3.1.1-py3-none-any.whl (103 kB)
Using cached blinker-1.9.0-py3-none-any.whl (8.5 kB)
Using cached click-8.2.1-py3-none-any.whl (102 kB)
Downloading itsdangerous-2.2.0-py3-none-any.whl (16 kB)
Using cached jinja2-3.1.6-py3-none-any.whl (134 kB)
Using cached MarkupSaf


[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [23]:
!pip install requests

Collecting requests
  Downloading requests-2.32.4-py3-none-any.whl.metadata (4.9 kB)
Collecting charset_normalizer<4,>=2 (from requests)
  Using cached charset_normalizer-3.4.2-cp312-cp312-win_amd64.whl.metadata (36 kB)
Collecting idna<4,>=2.5 (from requests)
  Using cached idna-3.10-py3-none-any.whl.metadata (10 kB)
Collecting urllib3<3,>=1.21.1 (from requests)
  Downloading urllib3-2.5.0-py3-none-any.whl.metadata (6.5 kB)
Collecting certifi>=2017.4.17 (from requests)
  Downloading certifi-2025.6.15-py3-none-any.whl.metadata (2.4 kB)
Downloading requests-2.32.4-py3-none-any.whl (64 kB)
Downloading certifi-2025.6.15-py3-none-any.whl (157 kB)
Using cached charset_normalizer-3.4.2-cp312-cp312-win_amd64.whl (105 kB)
Using cached idna-3.10-py3-none-any.whl (70 kB)
Downloading urllib3-2.5.0-py3-none-any.whl (129 kB)
Installing collected packages: urllib3, idna, charset_normalizer, certifi, requests
Successfully installed certifi-2025.6.15 charset_normalizer-3.4.2 idna-3.10 requests-2.32.4 u


[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [24]:
pip install pipreqs

Collecting pipreqs
  Downloading pipreqs-0.5.0-py3-none-any.whl.metadata (7.9 kB)
Collecting docopt==0.6.2 (from pipreqs)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting ipython==8.12.3 (from pipreqs)
  Downloading ipython-8.12.3-py3-none-any.whl.metadata (5.7 kB)
Collecting nbconvert<8.0.0,>=7.11.0 (from pipreqs)
  Downloading nbconvert-7.16.6-py3-none-any.whl.metadata (8.5 kB)
Collecting yarg==0.1.9 (from pipreqs)
  Downloading yarg-0.1.9-py2.py3-none-any.whl.metadata (4.6 kB)
Collecting backcall (from ipython==8.12.3->pipreqs)
  Downloading backcall-0.2.0-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting pickleshare (from ipython==8.12.3->pipreqs)
  Down


[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [25]:
pip install streamlit

Collecting streamlit
  Downloading streamlit-1.46.1-py3-none-any.whl.metadata (9.0 kB)
Collecting altair<6,>=4.0 (from streamlit)
  Using cached altair-5.5.0-py3-none-any.whl.metadata (11 kB)
Collecting cachetools<7,>=4.0 (from streamlit)
  Downloading cachetools-6.1.0-py3-none-any.whl.metadata (5.4 kB)
Collecting protobuf<7,>=3.20 (from streamlit)
  Using cached protobuf-6.31.1-cp310-abi3-win_amd64.whl.metadata (593 bytes)
Collecting pyarrow>=7.0 (from streamlit)
  Using cached pyarrow-20.0.0-cp312-cp312-win_amd64.whl.metadata (3.4 kB)
Collecting tenacity<10,>=8.1.0 (from streamlit)
  Using cached tenacity-9.1.2-py3-none-any.whl.metadata (1.2 kB)
Collecting toml<2,>=0.10.1 (from streamlit)
  Using cached toml-0.10.2-py2.py3-none-any.whl.metadata (7.1 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Using cached watchdog-6.0.0-py3-none-win_amd64.whl.metadata (44 kB)
Collecting gitpython!=3.1.19,<4,>=3.0.7 (from streamlit)
  Using cached GitPython-3.1.44-py3-none-any.whl.metadata (1


[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [26]:
pip install joblib

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip
