In [1]:
import pandas as pd 
import re 
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
stop_words_en = stopwords.words('English')
from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE
import plotly_express as px
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns

In [2]:

# Read the CSV file
df = pd.read_csv('DisneylandReviews.csv', encoding='ISO-8859-1')

# Filter out reviews with a rating of 5
df = df[df['Rating'] != 5]

# Define the sentiment function
def sentiment(score):
    if score == 4:
        return 'Good'
    elif score <= 3:
        return 'Bad'

# Apply the sentiment function to create a 'Sentiment' column
df['Sentiment'] = df['Rating'].apply(sentiment)

# Select relevant columns
df = df[['Review_Text', 'Branch', 'Sentiment']]

# Define the limpiar function
def limpiar(texto):
    res = texto.lower()
    res = re.sub(r'[^a-zA-Z0-9\s]', '', res)
    res = word_tokenize(res)
    res = [lemmatizer.lemmatize(token) for token in res]
    res = [token for token in res if token not in stop_words_en]
    res = ' '.join(res)
    return res

# Apply the limpiar function to create a 'Texto_limpio' column
df['Texto_limpio'] = df['Review_Text'].apply(limpiar)


In [3]:
df.head()

Unnamed: 0,Review_Text,Branch,Sentiment,Texto_limpio
0,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong,Good,youve ever disneyland anywhere youll find disn...
1,Its been a while since d last time we visit HK...,Disneyland_HongKong,Good,since last time visit hk disneyland yet time s...
2,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong,Good,thanks god hot humid wa visiting park otherwis...
3,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong,Good,hk disneyland great compact park unfortunately...
4,"the location is not in the city, took around 1...",Disneyland_HongKong,Good,location city took around 1 hour kowlon kid li...


In [4]:

from sklearn.model_selection import train_test_split
X = df['Texto_limpio']
y = df['Sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.70, random_state=101)


In [5]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

# Define the pipeline with TF-IDF vectorizer and SVM classifier
model_SVM = Pipeline([
    ("tfidf", TfidfVectorizer(ngram_range=(1,1))),
    ("model", SVC(C=2, kernel='rbf', probability=True))
])

# Fit the TF-IDF vectorizer on the training data and transform the training data
X_train_tfidf = model_SVM.named_steps['tfidf'].fit_transform(X_train)


In [6]:
# Fit the model on the transformed training data
model_SVM.fit(X_train, y_train)

In [7]:
# Make predictions on the test set by transforming it with the fitted vectorizer
X_test_tfidf = model_SVM.named_steps['tfidf'].transform(X_test)
y_pred_model = model_SVM.named_steps['model'].predict(X_test_tfidf)


In [8]:
# Print the classification report
print("Classification Report for model_SVM:")
print(classification_report(y_test, y_pred_model))

Classification Report for model_SVM:
              precision    recall  f1-score   support

         Bad       0.78      0.71      0.74      2598
        Good       0.78      0.84      0.81      3255

    accuracy                           0.78      5853
   macro avg       0.78      0.77      0.78      5853
weighted avg       0.78      0.78      0.78      5853



In [9]:
import joblib

joblib.dump(model_SVM, 'ModeloSVC.joblib')

['ModeloSVC.joblib']