In [17]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

#from sklearn.feature_extraction.text import HashingVectorizer
#from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import svm
from scipy.sparse import hstack
from sklearn.metrics import f1_score, classification_report, accuracy_score


# Read the training reviews; the file is decoded as latin1.
data = pd.read_csv(
    "b_train.csv",
    encoding="latin1",
)
# Plain-text preview of the frame (pandas elides the middle rows).
print(data)



       S.No.                                             Review  Rating
0          1  nice hotel expensive parking got good deal sta...       4
1          2  ok nothing special charge diamond member hilto...       2
2          3  nice rooms not 4* experience hotel monaco seat...       3
3          4  unique \tgreat stay \twonderful time hotel mon...       5
4          5  great stay great stay \twent seahawk game awes...       5
...      ...                                                ...     ...
18434  18435  gem middle venice corte grimani travel advisor...       5
18435  18436  terrific value residence corte grimani happy a...       5
18436  18437  true venice gem family just returned trip incl...       5
18437  18438  really best \tthanks trip advisor \tthanks fel...       5
18438  18439  true gem \tclean modern apartment brand new ba...       5

[18439 rows x 3 columns]


In [2]:

# Expose the loaded CSV under the name the rest of the notebook uses.
hotel_df = pd.DataFrame(data)

# Bag-of-words vectorizer. The token pattern r'\w{1,}' matches any run of one
# or more word characters, so one-character tokens are kept as features.
# fit() learns the vocabulary from the distinct review texts and returns the
# vectorizer itself, which is why construction and fitting are chained here.
# NOTE(review): the vocabulary is learned from every row of b_train.csv, i.e.
# before the later train_test_split, so validation rows influence the feature
# space - confirm this is acceptable for the reported validation score.
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}').fit(
    hotel_df["Review"].unique()
)

# Encode every review as a sparse row of token counts.
trainq1_trans = count_vect.transform(hotel_df["Review"].to_numpy())

# Target vector: the rating attached to each review.
labels = hotel_df["Rating"].to_numpy()

# Feature matrix and targets under the names the later cells expect.
X, y = trainq1_trans, labels


In [3]:
# Show the sparse feature matrix followed by the target vector, one per line.
print(X, y, sep="\n")

  (0, 1259)	1
  (0, 2743)	1
  (0, 2783)	1
  (0, 3838)	1
  (0, 4513)	1
  (0, 5103)	1
  (0, 5577)	1
  (0, 5877)	1
  (0, 6187)	1
  (0, 9686)	1
  (0, 10309)	1
  (0, 10459)	1
  (0, 10856)	1
  (0, 13036)	1
  (0, 13894)	2
  (0, 14141)	1
  (0, 14441)	1
  (0, 14764)	1
  (0, 15400)	1
  (0, 16603)	1
  (0, 16968)	1
  (0, 17039)	1
  (0, 17064)	1
  (0, 20077)	1
  (0, 20112)	1
  :	:
  (18438, 24104)	1
  (18438, 24929)	1
  (18438, 25071)	1
  (18438, 26377)	2
  (18438, 27498)	1
  (18438, 28710)	1
  (18438, 29852)	1
  (18438, 29946)	1
  (18438, 29994)	1
  (18438, 30259)	1
  (18438, 31711)	1
  (18438, 32189)	1
  (18438, 32389)	1
  (18438, 34217)	1
  (18438, 37080)	1
  (18438, 37464)	1
  (18438, 41304)	1
  (18438, 41585)	1
  (18438, 42305)	1
  (18438, 44419)	1
  (18438, 45107)	1
  (18438, 45225)	2
  (18438, 45400)	1
  (18438, 46822)	1
  (18438, 47837)	1
[4 2 3 ... 5 5 5]


In [4]:
# Densify the sparse count matrix and display it as the cell result.
# NOTE(review): at the shape and dtype reported in this notebook
# (18439 x 49280, int64) the dense copy is roughly 7.3 GB; slicing first
# (e.g. X[:5].toarray()) would show sample rows without that allocation.
X.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [5]:
# Number of distinct tokens (feature columns) in the fitted vocabulary.
len(count_vect.vocabulary_)

49280

In [6]:
# Report (n_reviews, n_features). The sparse matrix already carries its
# shape, so there is no need to build the multi-gigabyte dense copy that
# X.toarray() would allocate just to read it; the printed tuple is the same.
print(X.shape)

(18439, 49280)


In [7]:
# vocabulary_ maps each token to its feature (column) index. Printing the
# whole mapping emits one entry per feature (tens of thousands), so show its
# size and a small sample instead.
print(
    len(count_vect.vocabulary_),
    "vocabulary entries; first 20:",
    dict(list(count_vect.vocabulary_.items())[:20]),
)



In [8]:
# Look up the feature (column) index that the fitted vocabulary assigns to
# the token 'do'; raises KeyError if the token is not in the vocabulary.
count_vect.vocabulary_['do']

14560

In [19]:
# Hold out 20% of the rows for validation; the fixed random_state makes the
# shuffled split repeatable.
# NOTE(review): X was vectorized with a vocabulary fitted on all rows before
# this split, so validation rows influenced the feature space - confirm that
# is acceptable for the validation score reported below.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)
#classifier = LogisticRegression(solver='lbfgs', max_iter=10000)    #change the parameters, try other classifiers

# RBF-kernel support vector classifier.
classifier = svm.SVC(kernel='rbf', C=1.0, gamma='scale')

# train the model
classifier.fit(X_train, y_train)

# Predict each split exactly once and reuse the arrays below; the original
# cell ran predict() on the validation split twice, which is pure repeated
# work and does not change any printed number.
train_prediction = classifier.predict(X_train)
prediction = classifier.predict(X_valid)

print('training score:', f1_score(y_train, train_prediction, average='macro'))
print('validation score:', f1_score(y_valid, prediction, average='macro'))
print(classification_report(y_valid, prediction))

training score: 0.837185191180461
validation score: 0.5056255355185133
              precision    recall  f1-score   support

           1       0.70      0.53      0.60       249
           2       0.44      0.39      0.41       333
           3       0.47      0.17      0.25       385
           4       0.52      0.50      0.51      1143
           5       0.68      0.84      0.75      1578

    accuracy                           0.60      3688
   macro avg       0.56      0.49      0.51      3688
weighted avg       0.59      0.60      0.58      3688



In [20]:
# Load the test file with the same encoding used for the training file.
df_test = pd.read_csv('b_test.csv', encoding='latin1')
hotel_df_test = pd.DataFrame(df_test)

# Encode the test reviews with the vectorizer fitted earlier, so the columns
# line up with the training matrix (only tokens already in the vocabulary
# are counted).
testq1_trans = count_vect.transform(hotel_df_test['Review'].to_numpy())

# Test ratings. This rebinds `labels`, which previously held the training
# ratings.
labels = hotel_df_test['Rating'].to_numpy()

# Test features and targets under the names the scoring cell expects.
X_test, y_test = testq1_trans, labels

In [21]:
# Predict the test split once; the same array feeds both the macro-F1 score
# and the per-class report (the original cell predicted X_test twice).
prediction1 = classifier.predict(X_test)
print('training score:', f1_score(y_train, classifier.predict(X_train), average='macro'))
print('validation score:', f1_score(y_valid, classifier.predict(X_valid), average='macro'))
print('test score:', f1_score(y_test, prediction1, average='macro'))
print(classification_report(y_test, prediction1))

training score: 0.837185191180461
validation score: 0.5056255355185133
test score: 0.4726856919482471
              precision    recall  f1-score   support

           1       0.71      0.48      0.57       114
           2       0.42      0.37      0.39       166
           3       0.37      0.10      0.16       204
           4       0.48      0.47      0.47       586
           5       0.70      0.85      0.77       981

    accuracy                           0.61      2051
   macro avg       0.53      0.45      0.47      2051
weighted avg       0.58      0.61      0.58      2051



In [22]:
#classifier = LogisticRegression(solver='lbfgs', max_iter=10000, penalty='l2', C=0.5)    #change the parameters, try other classifiers 
# declare the random forest classifier with hyperparameters
#classifier = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)

#classifier = LogisticRegression(solver='lbfgs', max_iter=10000) 


#classifier = svm.SVC(kernel='rbf', C=1.0, gamma='scale')

#classifier = svm.SVC(kernel='linear', C=1.0)

#classifier = svm.SVC(kernel='poly', degree=3, C=1.0)

# Increase regularization strength (C=0.1)
#classifier = svm.SVC(kernel='rbf', C=0.1)

# Decrease regularization strength (C=10)
#classifier = svm.SVC(kernel='rbf', C=10)

#classifier = svm.SVC(kernel='rbf', class_weight='balanced', gamma=0.1, C = 10)

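# NOTE: DecisionTreeClassifier, GaussianNB, KNeighborsClassifier and
# GradientBoostingClassifier are not imported in the import cell above; import
# them (sklearn.tree, sklearn.naive_bayes, sklearn.neighbors, sklearn.ensemble)
# before uncommenting the corresponding line.
#classifier = DecisionTreeClassifier(criterion='gini', max_depth=None)

#classifier = GaussianNB()
#classifier = KNeighborsClassifier(n_neighbors=5, weights='uniform')
#classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1)
#classifier = RandomForestClassifier(n_estimators=100, criterion='gini')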


#classifier = svm.SVC(kernel='rbf', C=1.0, gamma='scale')

#classifier = LogisticRegression(penalty='l1', solver='saga', max_iter=10000)  # variant a

#classifier = LogisticRegression(penalty='l2', solver='lbfgs', max_iter=10000)  # variant b
#classifier = LogisticRegression(penalty='elasticnet', l1_ratio=0.5, solver='saga', max_iter=10000)  # variant c

#classifier = LogisticRegression(penalty='l2', C=0.1, solver='lbfgs', max_iter=10000)