In [46]:
#Import Dependencies
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, mean_squared_error, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn import metrics



#Tokenized & Lemmatized data
data = pd.read_csv('../Trip Advisor Preprocessing/data_tokenized_Isak.csv')

In [47]:
data['ratings.overall'] = data['ratings.overall'].replace(range(0, 3), 'aBad')
data['ratings.overall'] = data['ratings.overall'].replace(3, 'bNeutral')
data['ratings.overall'] = data['ratings.overall'].replace(range(4, 6), 'cGood')

result = data.groupby('ratings.overall').size()

result

ratings.overall
aBad         1581
bNeutral     1946
cGood       19945
dtype: int64

In [48]:
# Extract preprocessed data and labels
preprocessed_data = data['text'].tolist()
labels = data['ratings.overall'].tolist()


In [49]:
X_train, X_rem, y_train, y_rem = train_test_split(preprocessed_data,labels,train_size=0.75,stratify=labels,random_state=42) #75% train data
X_valid, X_test, y_valid, y_test = train_test_split(X_rem,y_rem, test_size=0.5) #12.5% in test and 12.5% in validation

In [50]:
# Define pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('scaler', StandardScaler(with_mean=False)),  # Set with_mean=False for sparse matrices
    ('mlp', MLPClassifier()),
    
])

In [51]:
# Define parameters for grid search
parameters = {
    'mlp__activation': ['relu', 'tanh'],
    'mlp__alpha': [0.0001, 0.001, 0.01],
        'mlp__hidden_layer_sizes': [(50,), (100,), (200,), (300,)]
}

In [52]:
# Create grid search object
grid_search = GridSearchCV(pipeline, parameters, scoring='f1_weighted', cv=5, n_jobs=-1)

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                       ('scaler',
                                        StandardScaler(with_mean=False)),
                                       ('mlp', MLPClassifier())]),
             n_jobs=-1,
             param_grid={'mlp__activation': ['relu', 'tanh'],
                         'mlp__alpha': [0.0001, 0.001, 0.01],
                         'mlp__hidden_layer_sizes': [(50,), (100,), (200,),
                                                     (300,)]},
             scoring='f1_weighted')

In [53]:
# Evaluate the best model on the val data
best_model = grid_search.best_estimator_
y_pred_val = best_model.predict(X_valid)

#target_names = ['Rating 1', 'Rating 2', 'Rating 3', 'Rating 4', 'Rating 5']
target_names = ['Negative', 'Netural', 'Positive']
print(classification_report(y_valid, y_pred_val, target_names=target_names))

              precision    recall  f1-score   support

    Negative       0.64      0.53      0.58       199
     Netural       0.25      0.22      0.23       241
    Positive       0.91      0.94      0.93      2494

    accuracy                           0.85      2934
   macro avg       0.60      0.56      0.58      2934
weighted avg       0.84      0.85      0.85      2934



In [54]:
# Evaluate the best model on the val data
best_model = grid_search.best_estimator_
y_pred_val = best_model.predict(X_valid)


print(classification_report(y_valid, y_pred_val, target_names=target_names))

              precision    recall  f1-score   support

    Negative       0.64      0.53      0.58       199
     Netural       0.25      0.22      0.23       241
    Positive       0.91      0.94      0.93      2494

    accuracy                           0.85      2934
   macro avg       0.60      0.56      0.58      2934
weighted avg       0.84      0.85      0.85      2934



In [55]:
print(grid_search.best_params_)

{'mlp__activation': 'relu', 'mlp__alpha': 0.01, 'mlp__hidden_layer_sizes': (300,)}


In [56]:
#FINAL SCORING ON TEST DATA

# Evaluate the best model on the val data
best_model = grid_search.best_estimator_
y_pred_test = best_model.predict(X_test)

print("Best model used: ", best_model)

# Train accuracy
y_pred_train = best_model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_pred_train)
print("Training Accuracy:", train_accuracy)

# Test accuracy
y_pred_test = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred_test)
print("Test Accuracy:", test_accuracy)

print(classification_report(y_test, y_pred_test, target_names=target_names))

Best model used:  Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('scaler', StandardScaler(with_mean=False)),
                ('mlp', MLPClassifier(alpha=0.01, hidden_layer_sizes=(300,)))])
Training Accuracy: 0.9994887525562373
Test Accuracy: 0.854464894342195
              precision    recall  f1-score   support

    Negative       0.60      0.51      0.55       196
     Netural       0.27      0.22      0.24       246
    Positive       0.92      0.94      0.93      2492

    accuracy                           0.85      2934
   macro avg       0.59      0.56      0.57      2934
weighted avg       0.84      0.85      0.85      2934



# Manual Tuning for Hidden Layers

In [57]:
from sklearn.feature_extraction.text import CountVectorizer

# Count vectorize the text data
count_vectorizer = CountVectorizer()
text_vectorized = count_vectorizer.fit_transform(data['text'])

In [58]:
# Split the count vectorized data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(text_vectorized, data['ratings.overall'], train_size=0.75, stratify=data['ratings.overall'])

In [59]:
#Nested For-loop from Lab 2.
for h1 in range(10, 210, 50):
    for h2 in range(10, 210, 50):
        mlp_clf = MLPClassifier(hidden_layer_sizes=(h1, h2), max_iter=1000).fit(X_train, y_train)
        print(f"Hidden layer sizes={h1, h2}, Train accuracy={metrics.accuracy_score(y_train, mlp_clf.predict(X_train))}")
        print(f"Hidden layer sizes={h1, h2}, Test accuracy={metrics.accuracy_score(y_test, mlp_clf.predict(X_test))}\n")

Hidden layer sizes=(10, 10), Train accuracy=0.9997727789138832
Hidden layer sizes=(10, 10), Test accuracy=0.8505453306066802

Hidden layer sizes=(10, 60), Train accuracy=0.9991479209270621
Hidden layer sizes=(10, 60), Test accuracy=0.8563394683026585

Hidden layer sizes=(10, 110), Train accuracy=0.9999431947284708
Hidden layer sizes=(10, 110), Test accuracy=0.8602590320381731

Hidden layer sizes=(10, 160), Train accuracy=0.9996591683708248
Hidden layer sizes=(10, 160), Test accuracy=0.8573619631901841

Hidden layer sizes=(60, 10), Train accuracy=0.9999431947284708
Hidden layer sizes=(60, 10), Test accuracy=0.8698023176550784

Hidden layer sizes=(60, 60), Train accuracy=0.9999431947284708
Hidden layer sizes=(60, 60), Test accuracy=0.878323108384458

Hidden layer sizes=(60, 110), Train accuracy=0.9999431947284708
Hidden layer sizes=(60, 110), Test accuracy=0.8740627130197682

Hidden layer sizes=(60, 160), Train accuracy=0.9998863894569416
Hidden layer sizes=(60, 160), Test accuracy=0.884