In [1]:
import json
import pandas as pd

In [2]:
files = 'Books_small_10000.json'
with open (files,'r') as file:
    data  = [json.loads(line) for line in file]

df = pd.DataFrame(data)
df.head()
df[['reviewText', 'overall']].head()

Unnamed: 0,reviewText,overall
0,"I bought both boxed sets, books 1-5. Really a...",5.0
1,I enjoyed this short book. But it was way way ...,3.0
2,I love Nicholas Sparks. I&#8217;ve read everyt...,4.0
3,I really enjoyed this adventure and look forwa...,4.0
4,It was a decent read.. typical story line. Not...,3.0


In [3]:
df['Sentiment'] = df['overall'].apply(lambda x: 1 if x>=4 else 0)
df_reviews = df[['reviewText', 'Sentiment']]
df_reviews.head()

Unnamed: 0,reviewText,Sentiment
0,"I bought both boxed sets, books 1-5. Really a...",1
1,I enjoyed this short book. But it was way way ...,0
2,I love Nicholas Sparks. I&#8217;ve read everyt...,1
3,I really enjoyed this adventure and look forwa...,1
4,It was a decent read.. typical story line. Not...,0


In [4]:
from sklearn.model_selection import train_test_split

In [5]:
X_train, X_test, y_train, y_test = train_test_split(df_reviews['reviewText'],df_reviews['Sentiment'], test_size=0.33, random_state=42)
X_train.shape,y_train.shape, X_test.shape, y_test.shape

((6700,), (6700,), (3300,), (3300,))

In [6]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

In [7]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [8]:
def preprocess_text(text):
    words = word_tokenize(text.lower())
    processed_words = [lemmatizer.lemmatize(word) for word in words if word.isalpha() and word not in stop_words]
    return ' '.join(processed_words)

In [9]:
X_train_processed = X_train.apply(preprocess_text)
X_test_processed = X_test.apply(preprocess_text)

In [10]:
print(X_train.head(3))
print(X_train_processed.head(3))

8371    Olivia Hampton arrives at the Dunraven family ...
5027    Perhaps one of the funniest, yet saddest stori...
9234            One of Francine Rivers best series books!
Name: reviewText, dtype: object
8371    olivia hampton arrives dunraven family home ca...
5027    perhaps one funniest yet saddest story ever re...
9234                  one francine river best series book
Name: reviewText, dtype: object


In [11]:
#confimred that stopwords/lemmatized worked in above output.

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [13]:
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train_processed)
X_test_tfidf = vectorizer.transform(X_test_processed)

In [14]:
log_reg_model = LogisticRegression(random_state=42, max_iter=1000)
log_reg_model.fit(X_train_tfidf, y_train)

In [15]:
y_train

8371    1
5027    1
9234    1
3944    0
6862    1
       ..
5734    1
5191    1
5390    1
860     1
7270    1
Name: Sentiment, Length: 6700, dtype: int64

In [16]:
log_reg_pred = log_reg_model.predict(X_test_tfidf)
log_reg_accuracy = accuracy_score(y_test, log_reg_pred)

In [17]:
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train_tfidf, y_train)

In [18]:
svm_pred = svm_model.predict(X_test_tfidf)
svm_accuracy = accuracy_score(y_test, svm_pred)

In [19]:
from sklearn.tree import DecisionTreeClassifier
dtr = DecisionTreeClassifier(random_state=42)
dtr.fit(X_train_tfidf, y_train)

In [20]:
decision_tree_pred = dtr.predict(X_test_tfidf)
decision_tree_accuracy = accuracy_score(y_test, decision_tree_pred)

In [21]:
log_reg_accuracy, svm_accuracy, decision_tree_accuracy

(0.8593939393939394, 0.8721212121212121, 0.7806060606060606)

In [22]:
#creating a function to predict sentiment of a string
def predict_sentiment(text):
    processed_text = preprocess_text(text)
    text_vectorizer = vectorizer.transform([processed_text])
    sentiment = svm_model.predict(text_vectorizer)
    return 'Positive' if sentiment == 1 else 'Negative'

In [23]:
predict_sentiment('the story was too long')

'Positive'

In [24]:
import pickle

In [26]:
model_vectorizer = {'model': svm_model, 'vectorizer': vectorizer}
with open ('model_vectorizer','wb') as file:
    pickle.dump(model_vectorizer, file)

In [29]:
with open('model_vectorizer', 'rb') as file:
    loaded_objects = pickle.load(file)

loaded_svm_model = loaded_objects['model']
loaded_vectorizer = loaded_objects['vectorizer']

In [38]:
def predict_sentiment_pickle(text, vectorizer, model):
    # Preprocess the input text
    processed_text = preprocess_text(text)
    text_tfidf = vectorizer.transform([processed_text])  
    sentiment = model.predict(text_tfidf)
    return "Positive" if sentiment == 1 else "Negative"

In [39]:
text = "The story was too long."
prediction = predict_sentiment_pickle(text,loaded_vectorizer,loaded_svm_model)
print(f"Sentiment: {prediction}")

Sentiment: Positive
