In [1]:
# Importing necessary libraries
import json

import joblib
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [17]:
# Preprocessing inputs/outputs: the CSV that will receive the labelled reviews
# and an open text handle on the raw Yelp review dump (one JSON object per line).
csv_file = 'pre_processed_reviews_svm.csv'
json_file = open(
    file='yelp_dataset/yelp_academic_dataset_review.json',
    mode='r',
    encoding='utf-8',
)

# Step 1: Preprocessing JSON data and saving to CSV
def pre_process_json(source=None, output_path=None, max_records=10000):
    """Label Yelp reviews by reaction type and save them to a CSV file.

    Every line of ``source`` is parsed as one JSON review object. Reviews in
    which any of ``useful``, ``funny`` or ``cool`` is zero are skipped. Each
    remaining review is labelled from two facts:

    * star rating: ``stars >= 3`` is "satisfied", otherwise "unsatisfied";
    * votes: ``useful >= funny + cool`` is "considerable", otherwise "neutral".

    Lines that are not valid JSON, or whose fields are missing or cannot be
    converted, are reported with ``print`` and skipped.

    Parameters
    ----------
    source : iterable of str, optional
        Lines of JSON, one review per line. Defaults to the module-level
        ``json_file`` handle.
    output_path : str or path-like, optional
        Destination CSV. Defaults to the module-level ``csv_file``.
    max_records : int, optional
        Stop reading once this many labelled reviews have been collected.

    Returns
    -------
    None
        The labelled reviews are written to ``output_path`` with the columns
        ``text`` and ``Sentiment``. The file is (re)written even when no
        review qualifies, so it can always be read back.
    """
    if source is None:
        source = json_file
    if output_path is None:
        output_path = csv_file

    # Collect rows in memory and write once at the end, instead of re-opening
    # the CSV and building a one-row DataFrame for every accepted review.
    records = []
    for line in source:
        if len(records) >= max_records:
            break

        try:
            review = json.loads(line)
            stars = int(review['stars'])
            useful = int(review['useful'])
            funny = int(review['funny'])
            cool = int(review['cool'])
            text = str(review['text'])
        except json.JSONDecodeError as e:
            # Must precede the ValueError handler: JSONDecodeError subclasses it.
            print(f"Error decoding JSON: {e}")
            continue
        except (KeyError, TypeError, ValueError) as e:
            print(f"Skipping malformed review: {e!r}")
            continue

        # NOTE(review): this drops every review lacking any one of the three
        # vote types -- confirm that `or` (rather than `and`) is intended.
        if useful == 0 or funny == 0 or cool == 0:
            continue

        satisfaction = 'satisfied' if stars >= 3 else 'unsatisfied'
        # A tie between useful and funny + cool counts as "considerable".
        weight = 'considerable' if useful >= funny + cool else 'neutral'
        records.append({'text': text, 'Sentiment': f"{satisfaction} {weight} comment"})

    # Explicit columns keep the header stable even when `records` is empty.
    pd.DataFrame(records, columns=['text', 'Sentiment']).to_csv(output_path, index=False)

# Run the preprocessing step. The input handle is released in `finally` so it
# is closed even when preprocessing raises part-way through the file.
try:
    pre_process_json()
finally:
    json_file.close()

# Load the processed data to verify
processed_data = pd.read_csv(csv_file)
print(processed_data.head())


                                                text  \
0  I am a long term frequent customer of this est...   
1  HOLY SMOKES!\n\nactual pumpkin pie mixed in wi...   
2  I thoroughly enjoyed the show.  Chill way to s...   
3  On a scale of one to things that are awesome, ...   
4  I've only had the cannolis here but they are a...   

                        Sentiment  
0     unsatisfied neutral comment  
1       satisfied neutral comment  
2       satisfied neutral comment  
3  satisfied considerable comment  
4       satisfied neutral comment  


In [18]:
# Step 2: Read the labelled reviews back from disk and separate the review
# text (model input) from the reaction label (classification target)
data = pd.read_csv(csv_file)
X, y = data['text'], data['Sentiment']

data

Unnamed: 0,text,Sentiment
0,I am a long term frequent customer of this est...,unsatisfied neutral comment
1,HOLY SMOKES!\n\nactual pumpkin pie mixed in wi...,satisfied neutral comment
2,I thoroughly enjoyed the show. Chill way to s...,satisfied neutral comment
3,"On a scale of one to things that are awesome, ...",satisfied considerable comment
4,I've only had the cannolis here but they are a...,satisfied neutral comment
...,...,...
995,I think 312 refers to the number of minutes yo...,unsatisfied neutral comment
996,"They have a few locations, and we picked the o...",satisfied considerable comment
997,Had dinner here on a recent Friday evening. G...,unsatisfied neutral comment
998,I have tried all the big named and $$$ dry cle...,satisfied considerable comment


In [19]:
# Preview the review texts (the `text` column) used as the model's input
X

0      I am a long term frequent customer of this est...
1      HOLY SMOKES!\n\nactual pumpkin pie mixed in wi...
2      I thoroughly enjoyed the show.  Chill way to s...
3      On a scale of one to things that are awesome, ...
4      I've only had the cannolis here but they are a...
                             ...                        
995    I think 312 refers to the number of minutes yo...
996    They have a few locations, and we picked the o...
997    Had dinner here on a recent Friday evening.  G...
998    I have tried all the big named and $$$ dry cle...
999    I've only been to Lex one time but I had the t...
Name: text, Length: 1000, dtype: object

In [20]:
# Preview the reaction labels (the `Sentiment` column) used as the target
y

0         unsatisfied neutral comment
1           satisfied neutral comment
2           satisfied neutral comment
3      satisfied considerable comment
4           satisfied neutral comment
                    ...              
995       unsatisfied neutral comment
996    satisfied considerable comment
997       unsatisfied neutral comment
998    satisfied considerable comment
999    satisfied considerable comment
Name: Sentiment, Length: 1000, dtype: object

In [21]:
# Step 3: Train-test split (80/20, fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Feature extraction using TfidfVectorizer
# The vocabulary and IDF weights are learned from the training split only and
# then reused, unchanged, to transform the test split.
vectorizer = TfidfVectorizer(max_features=5000)  # You can adjust max_features for better performance
# Keep the TF-IDF matrices sparse: SVC accepts scipy sparse input directly, and
# .toarray() would allocate a dense array with up to 5000 mostly-zero columns
# per review.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Step 5: Train SVM model
svm_model = SVC(kernel='linear', C=1.0, random_state=42)  # Linear kernel for text classification
svm_model.fit(X_train_tfidf, y_train)

In [22]:
# Step 6: Evaluate the SVM model on the held-out test split
y_pred = svm_model.predict(X_test_tfidf)

# Metrics
accuracy = accuracy_score(y_test, y_pred)
print(f"SVM Model Accuracy: {accuracy * 100:.2f}%")
# Some classes are never predicted, which leaves their precision undefined.
# zero_division=0 reports those scores as 0.0 explicitly instead of emitting
# an undefined-metric warning for each affected average.
print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=0))


SVM Model Accuracy: 52.50%

Classification Report:
                                   precision    recall  f1-score   support

  satisfied considerable comment       0.00      0.00      0.00        51
       satisfied neutral comment       0.54      0.95      0.69       110
unsatisfied considerable comment       0.00      0.00      0.00        14
     unsatisfied neutral comment       1.00      0.04      0.08        25

                        accuracy                           0.53       200
                       macro avg       0.39      0.25      0.19       200
                    weighted avg       0.42      0.53      0.39       200



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Step 7: Save vectorizer and model for future use
# Both artefacts are persisted together: the fitted vectorizer is needed to map
# new text into the same feature space the SVM was trained on.
# (joblib is imported with the other dependencies in the notebook's import cell.)
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')
joblib.dump(svm_model, 'svm_sentiment_model.pkl')

print("Model and vectorizer saved successfully.")