In [1]:
import pandas as pd 
import numpy as np 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
import string
import nltk

In [2]:
!pip install nltk 
nltk.download('punkt')



[nltk_data] Downloading package punkt to /home/kiddi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/kiddi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
data=pd.read_csv("Data/IMDB Dataset.csv")

In [5]:
data.head

<bound method NDFrame.head of                                                   review sentiment
0      One of the other reviewers has mentioned that ...  positive
1      A wonderful little production. <br /><br />The...  positive
2      I thought this was a wonderful way to spend ti...  positive
3      Basically there's a family where a little boy ...  negative
4      Petter Mattei's "Love in the Time of Money" is...  positive
...                                                  ...       ...
49995  I thought this movie did a down right good job...  positive
49996  Bad plot, bad dialogue, bad acting, idiotic di...  negative
49997  I am a Catholic taught in parochial elementary...  negative
49998  I'm going to have to disagree with the previou...  negative
49999  No one expects the Star Trek movies to be high...  negative

[50000 rows x 2 columns]>

In [6]:
def preprocessing_data(text):
    text =re.sub(r'<.*?>', '', text)
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    
    
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]
    
    # Join the words back into one string
    text = ' '.join(tokens)
    
    return text

   

In [7]:
# Apply the cleaning function to the review column
data['cleaned_review'] = data['review'].apply(preprocessing_data)

# Display the first few rows of the cleaned dataframe
data.head()

Unnamed: 0,review,sentiment,cleaned_review
0,One of the other reviewers has mentioned that ...,positive,one review mention watch 1 oz episod youll hoo...
1,A wonderful little production. <br /><br />The...,positive,wonder littl product film techniqu unassum old...
2,I thought this was a wonderful way to spend ti...,positive,thought wonder way spend time hot summer weeke...
3,Basically there's a family where a little boy ...,negative,basic there famili littl boy jake think there ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei love time money visual stun film...


## ML models

In [16]:
import mlflow


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data['cleaned_review'], data['sentiment'], test_size=0.2, random_state=42)
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [10]:
import joblib
joblib.dump(vectorizer, 'Models/fitted_vectorizer.pkl')

['Models/fitted_vectorizer.pkl']

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC


In [14]:
models={
    "Logistic Regression": LogisticRegression(),
    "Naive Bayes": MultinomialNB(),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(),
    "Decision tree": DecisionTreeClassifier()
}

In [24]:
mlflow.set_tracking_uri("http://127.0.0.1:5000")


In [25]:
# Function to train and log model
def train_and_log_model(model_name, model, X_train, X_test, y_train, y_test):
    with mlflow.start_run():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        report = classification_report(y_test, y_pred, output_dict=True)

        # Log model and params
        mlflow.log_param("model_name", model_name)
        mlflow.log_metrics({"accuracy": report['accuracy']})
        mlflow.sklearn.log_model(model, "model")

        return report['accuracy']

# Train and log each model
for name, model in models.items():
    accuracy = train_and_log_model(name, model, X_train_vec, X_test_vec, y_train, y_test)
    print(f"Model: {name}, Accuracy: {accuracy}")



Model: Logistic Regression, Accuracy: 0.885




Model: Naive Bayes, Accuracy: 0.849




Model: Random Forest, Accuracy: 0.8484




Model: SVM, Accuracy: 0.8911
Model: Dicision tree, Accuracy: 0.7164




### Logistic regression

In [8]:

from sklearn.svm import SVC
from sklearn.metrics import classification_report


In [17]:
lr_model = LogisticRegression()
lr_model.fit(X_train_vec, y_train)

# Predict and evaluate the model
predictions = lr_model.predict(X_test_vec)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

    negative       0.90      0.87      0.88      4961
    positive       0.87      0.90      0.89      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000


### SVM

In [None]:
#take a long of timee 

svm_model = SVC(kernel='linear')
svm_model.fit(X_train_vec, y_train)
predictions = svm_model.predict(X_test_vec)
print(classification_report(y_test, predictions))

 ### Random forest 

In [25]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=100)
rf_model.fit(X_train_vec, y_train)
rf_predictions = rf_model.predict(X_test_vec)
print(classification_report(y_test, rf_predictions))


              precision    recall  f1-score   support

    negative       0.84      0.86      0.85      4961
    positive       0.86      0.84      0.85      5039

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000


### Naive

In [19]:
from sklearn.naive_bayes import MultinomialNB

nb_model = MultinomialNB()
nb_model.fit(X_train_vec, y_train)
wwnb_predictions = nb_model.predict(X_test_vec)


NameError: name 'nb_predictions' is not defined

In [20]:
print(classification_report(y_test, wwnb_predictions))


              precision    recall  f1-score   support

    negative       0.85      0.84      0.85      4961
    positive       0.85      0.86      0.85      5039

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000


### Desicion tree

In [21]:
from sklearn.tree import DecisionTreeClassifier

dt_model = DecisionTreeClassifier()
dt_model.fit(X_train_vec, y_train)
dt_predictions = dt_model.predict(X_test_vec)
print(classification_report(y_test, dt_predictions))


              precision    recall  f1-score   support

    negative       0.71      0.71      0.71      4961
    positive       0.71      0.71      0.71      5039

    accuracy                           0.71     10000
   macro avg       0.71      0.71      0.71     10000
weighted avg       0.71      0.71      0.71     10000


In [26]:
!pip install onnx skl2onnx


Collecting onnx
  Downloading onnx-1.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Collecting skl2onnx
  Downloading skl2onnx-1.15.0-py2.py3-none-any.whl.metadata (2.3 kB)
Collecting onnxconverter-common>=1.7.0 (from skl2onnx)
  Downloading onnxconverter_common-1.14.0-py2.py3-none-any.whl.metadata (4.2 kB)
Collecting protobuf>=3.20.2 (from onnx)
  Downloading protobuf-3.20.2-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m152.3 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Downloading onnx-1.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (15.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.7/15.7 MB[0m [31m136.2 kB/s[0m eta [36m0:00:00[0m00:01[0m00:03[0m
[?25hDownloading skl2onnx-1.15.0-py2.py3-none-any.whl (294 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.7/294.7 k

In [44]:
pip install --upgrade skl2onnx onnx protobuf


Defaulting to user installation because normal site-packages is not writeable
Collecting protobuf
  Using cached protobuf-4.25.1-cp37-abi3-manylinux2014_x86_64.whl (294 kB)
Note: you may need to restart the kernel to use updated packages.


In [26]:
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

initial_type = [('float_input', FloatTensorType([None, X_train.shape[0]]))]

onnx_model = convert_sklearn(rf_model, initial_types=initial_type)

# Save the ONNX model to a file
onnx_file_path = "rf_model.onnx"
with open(onnx_file_path, "wb") as f:
    f.write(onnx_model.SerializeToString())

print("Model saved as ONNX:", onnx_file_path)

Model saved as ONNX: rf_model.onnx


In [34]:
import pickle

with open('preprocessing.pkl', 'wb') as f:
    pickle.dump(data, f)

### Fast API 

In [37]:
!pip install fastapi

Defaulting to user installation because normal site-packages is not writeable
Collecting fastapi
  Downloading fastapi-0.104.1-py3-none-any.whl (92 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.9/92.9 KB[0m [31m752.5 kB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:01[0m
[?25hCollecting pydantic!=1.8,!=1.8.1,!=2.0.0,!=2.0.1,!=2.1.0,<3.0.0,>=1.7.4
  Downloading pydantic-2.5.2-py3-none-any.whl (381 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m381.9/381.9 KB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m:01[0m
[?25hCollecting starlette<0.28.0,>=0.27.0
  Downloading starlette-0.27.0-py3-none-any.whl (66 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 KB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting anyio<4.0.0,>=3.7.1
  Downloading anyio-3.7.1-py3-none-any.whl (80 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━