In [12]:
import pandas as pd 
import re 
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import joblib

# Load dataset (the CSV is read with ISO-8859-1 encoding)
df = pd.read_csv('DisneylandReviews.csv', encoding='ISO-8859-1')
# Drop the 5-star reviews. The explicit .copy() turns the filtered result into an
# independent frame, so adding columns to it later does not raise pandas'
# SettingWithCopyWarning.
df = df[df['Rating'] != 5].copy()

# Define sentiment function
def sentiment(score):
    """Translate a numeric rating into a sentiment label.

    Args:
        score: Numeric rating of a review.

    Returns:
        'Good' when the rating is exactly 4, 'Bad' when it is 3 or lower,
        and None for any other value (for example a rating above 4).
    """
    if score == 4:
        return 'Good'
    return 'Bad' if score <= 3 else None

# Label every review from its rating. assign() builds a new frame instead of
# writing into the filtered one, which avoids pandas' SettingWithCopyWarning.
df = df.assign(Sentiment=df['Rating'].apply(sentiment))

# Select relevant columns (copied so the columns added below go to an independent frame)
df = df[['Review_Text', 'Branch', 'Sentiment', 'Reviewer_Location', 'Year_Month']].copy()

# Split "Year_Month" at the first "-" into separate Year and Month columns
new = df["Year_Month"].str.split("-", n=1, expand=True)
df["Year"] = new[0]
df["Month"] = new[1]

# Preprocess text
# NLTK stopword file ids are lowercase; 'English' only resolves on
# case-insensitive filesystems and fails elsewhere.
stop_words_en = stopwords.words('english')
lemmatizer = WordNetLemmatizer()
# Extra tokens to discard. NOTE(review): 'wa'/'ha' look like lemmatizer artifacts of
# "was"/"has" and 'br' like leftover HTML line breaks; confirm against the data.
stop_words_en.extend(['wa', 'br', 'ha'])


In [13]:

def limpiar(texto):
    """Normalize a review into a space-joined string of lemmatized tokens.

    Steps: lowercase the text, remove every character that is not a letter,
    a digit or whitespace, tokenize, lemmatize each token, and finally drop
    the tokens found in ``stop_words_en``. Stopwords are removed after
    lemmatization, so the stopword list is matched against lemmas.

    Args:
        texto: Raw review text. Non-string values (None, or NaN coming from
            a missing cell) are treated as empty text.

    Returns:
        The cleaned text as a single string; '' when the input is missing.
    """
    if not isinstance(texto, str):
        # Missing cells arrive as None/NaN; return empty text instead of
        # failing on .lower() with an AttributeError.
        return ''

    res = texto.lower()  # Lowercase everything
    res = re.sub(r'[^a-zA-Z0-9\s]', '', res)  # Strip punctuation and other non-alphanumeric symbols
    tokens = word_tokenize(res)  # Split the text into tokens
    lemmas = [lemmatizer.lemmatize(token) for token in tokens]  # Lemmatize every token
    kept = [lemma for lemma in lemmas if lemma not in stop_words_en]  # Drop stopwords
    # The pipeline works on a list of tokens, so join them back into one string
    return ' '.join(kept)

# Run the cleaner over every review, keep the result in a new column,
# and write the whole frame (without the index) to disk.
df['Texto_limpio'] = df['Review_Text'].map(limpiar)
df.to_csv(path_or_buf='df_Binario.csv', index=False)

In [14]:
# Preview the first five rows of the prepared frame (head() defaults to 5 rows)
df.head()

Unnamed: 0,Review_Text,Branch,Sentiment,Reviewer_Location,Year_Month,Year,Month,Texto_limpio
0,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong,Good,Australia,2019-4,2019,4,youve ever disneyland anywhere youll find disn...
1,Its been a while since d last time we visit HK...,Disneyland_HongKong,Good,Philippines,2019-5,2019,5,since last time visit hk disneyland yet time s...
2,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong,Good,United Arab Emirates,2019-4,2019,4,thanks god hot humid visiting park otherwise w...
3,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong,Good,Australia,2019-4,2019,4,hk disneyland great compact park unfortunately...
4,"the location is not in the city, took around 1...",Disneyland_HongKong,Good,United Kingdom,2019-4,2019,4,location city took around 1 hour kowlon kid li...


In [None]:
# Features are the cleaned review text; the target is the sentiment label
X, y = df['Texto_limpio'], df['Sentiment']

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.70,
    random_state=101,
)

# Build TF-IDF features from unigrams, bigrams and trigrams, fitted on the training text only
tfidf = TfidfVectorizer(ngram_range=(1, 3))
X_train_vectorized = tfidf.fit_transform(X_train)


In [None]:

# Metrics helpers are imported first so the cell's dependencies are visible up front
from sklearn.metrics import classification_report, confusion_matrix

# Train the model
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

# Evaluate on the held-out split, reusing the vectorizer fitted on the training data
X_test_vectorized = tfidf.transform(X_test)
y_pred = model.predict(X_test_vectorized)

# Print the classification report. The header names the estimator that is
# actually trained in this cell (logistic regression, not an SVM).
print("Classification Report for LogisticRegression:")
print(classification_report(y_test, y_pred))




Classification Report for model_SVM:
              precision    recall  f1-score   support

         Bad       0.77      0.70      0.74      2598
        Good       0.78      0.83      0.80      3255

    accuracy                           0.78      5853
   macro avg       0.77      0.77      0.77      5853
weighted avg       0.78      0.78      0.77      5853



In [None]:
# Save the model and vectorizer
# Both objects are written to the working directory: the fitted TF-IDF vectorizer
# is stored alongside the classifier so new text can be transformed with the same
# vocabulary before calling model.predict.
# NOTE: joblib files are pickle-based; only load them from trusted sources.
joblib.dump(model, 'RegresionL3.joblib')
joblib.dump(tfidf, 'tfidf_vectorizer3.joblib')


['tfidf_vectorizer3.joblib']