In [1]:
import pandas as pd
import numpy as np
import re
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [3]:
import torch
# Report whether PyTorch can see a CUDA-capable GPU in this runtime.
print("GPU Available :",torch.cuda.is_available())

GPU Available : True


In [6]:
# Colab-only: mount Google Drive at /content/drive; the CSV files read in
# Step 1 are located under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Step 1 :- Import Data file

In [7]:
# Both CSV files live in the same Drive folder; keep that folder in a single
# constant so it only has to be edited once if the data is moved.
DATA_DIR = "/content/drive/My Drive/Colab Notebooks/datasets/game_review"

train_data = pd.read_csv(f"{DATA_DIR}/train.csv")
test_data = pd.read_csv(f"{DATA_DIR}/test.csv")

In [None]:
# Show the column names of the training frame.
train_data.columns

Index(['review_id', 'title', 'year', 'user_review', 'user_suggestion'], dtype='object')

In [None]:
# Preview the first rows of the training frame.
train_data.head()

Unnamed: 0,review_id,title,year,user_review,user_suggestion
0,460,Black Squad,2018.0,"Early Access ReviewVery great shooter, that ha...",1
1,2166,Tree of Savior (English Ver.),2016.0,I love love love playing this game!Super 100%!...,1
2,17242,Eternal Card Game,2016.0,Early Access ReviewAs a fan of MTG and Hearths...,1
3,6959,Tactical Monsters Rumble Arena,2018.0,Turn based strategy game similiar to FF Tactic...,1
4,8807,Yu-Gi-Oh! Duel Links,2017.0,This game has an insanely huge download for be...,0


# Step 2 :- Text preprocessing
Explicit text preprocessing (tokenization, lemmatization, stopword removal, etc., for example with spaCy) is NOT required when using Sentence Transformers (SBERT), so only light cleaning is applied here: lower-casing and removing non-alphanumeric characters.

In [8]:
def clean_text(text):
    """Normalize one review string before it is embedded.

    The text is lower-cased, every character that is not an ASCII letter,
    a digit or whitespace is deleted, and leading/trailing whitespace is
    stripped.

    Args:
        text: The raw review. Missing values (``None``, ``NaN``, ``pd.NA``)
            are treated as an empty review; any other non-string scalar is
            converted with ``str()`` first.

    Returns:
        The cleaned string (possibly empty).
    """
    if not isinstance(text, str):
        # A blank CSV cell is read by pandas as NaN (a float), which has no
        # ``.lower()``; map missing values to "" instead of raising.
        if text is None or pd.isna(text):
            return ""
        text = str(text)
    text = text.lower()
    # NOTE(review): matched characters are deleted rather than replaced by a
    # space, so "game!Super" becomes "gamesuper". Left unchanged so results
    # computed with the existing cleaning stay comparable.
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    return text.strip()

In [9]:
# Clean the review text of both splits; the cleaned text overwrites the
# original "user_review" column.
train_data["user_review"] = train_data["user_review"].apply(clean_text)
test_data["user_review"] = test_data["user_review"].apply(clean_text)

In [None]:
# Inspect the cleaned review column.
train_data['user_review']

0        early access reviewvery great shooter that hav...
1        i love love love playing this gamesuper 100it ...
2        early access reviewas a fan of mtg and hearths...
3        turn based strategy game similiar to ff tactic...
4        this game has an insanely huge download for be...
                               ...                        
17872    early access reviewan interesting game but wha...
17873    early access review spend 10 minutes queueing ...
17874    product received for freeearly access reviewga...
17875    for three days i was completely entranced by t...
17876    i hate clickers such a waste of timethis one h...
Name: user_review, Length: 17877, dtype: object

# Step 3 :- Text Embedding

In [None]:
# Load the pretrained "all-MiniLM-L6-v2" Sentence-Transformers model
# (its files are downloaded on first use).
emd_model= SentenceTransformer("all-MiniLM-L6-v2");

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Embed every cleaned review of each split. The reviews are passed as plain
# Python lists of strings.
train_embeddings=emd_model.encode(train_data["user_review"].tolist())
test_embeddings=emd_model.encode(test_data["user_review"].tolist())

In [None]:
# Wrap the training embeddings in a DataFrame purely for inspection and
# print its first five rows.
embedding_df = pd.DataFrame(data=train_embeddings)
print(embedding_df.head(n=5))

        0         1         2    ...       381       382       383
0 -0.077395  0.003952 -0.016869  ... -0.084005 -0.038629  0.025998
1 -0.088768  0.040498  0.018548  ...  0.041017 -0.052115 -0.026479
2 -0.048850  0.040540  0.037724  ... -0.001646 -0.063688  0.137934
3 -0.013939 -0.041074 -0.021469  ... -0.000680 -0.056809  0.100176
4 -0.027199  0.024948  0.034624  ...  0.016889  0.021933  0.070286

[5 rows x 384 columns]


# Step 4 :- Train the classification model

In [None]:
# train.csv and test.csv are already separate data frames, so no
# train_test_split is needed here.
# max_iter is raised to 1000 to give the solver more iterations to converge.
classifier_model=LogisticRegression(max_iter=1000)

In [None]:
# Fit the classifier on the training embeddings with the "user_suggestion"
# column as the target.
classifier_model.fit(train_embeddings,train_data["user_suggestion"])

In [None]:
# Predict labels for the test embeddings and report the share of reviews
# whose predicted label matches "user_suggestion".
y_pred = classifier_model.predict(test_embeddings)
accuracy = accuracy_score(y_true=test_data["user_suggestion"], y_pred=y_pred)
print(f"Classification Accuracy : {accuracy:.4f}")

Classification Accuracy : 0.8222


# Step 5 :- Find the most similar user review

In [None]:
from sentence_transformers import util

In [None]:
# Build the query review and run it through the same cleaning as the
# training data, then display the cleaned text.
new_review = clean_text("This game is really amazing! Great experience.")
new_review

'this game is really amazing great experience'

In [None]:
# Embed the single cleaned query review with the same model that produced
# train_embeddings, then display the resulting vector.
new_embedding = emd_model.encode(new_review)
new_embedding

array([-3.32151569e-04,  7.05640689e-02,  3.40685770e-02, -2.78030839e-02,
       -4.05711085e-02,  1.70998182e-02,  1.47868099e-03, -1.13110626e-02,
       -9.57221165e-03,  3.96620594e-02, -2.00001821e-02, -1.56284333e-03,
        3.20565887e-02, -1.65587384e-02,  1.12410439e-02,  3.86196151e-02,
        2.03488395e-02, -3.56495231e-02, -1.49202335e-03, -8.41762722e-02,
        1.70920920e-02, -6.23048358e-02,  1.45161161e-02, -2.94645745e-02,
       -4.66698967e-02,  5.97898401e-02,  4.34178207e-03,  3.38528939e-02,
        3.97578180e-02, -7.75188580e-02, -2.18299106e-02,  7.09127486e-02,
        2.09059119e-02,  3.72006325e-03, -4.02477607e-02,  9.17856991e-02,
       -2.15168651e-02, -7.51098916e-02, -4.45931628e-02, -2.09373478e-02,
       -4.13586162e-02, -2.16985587e-02,  3.50636952e-02,  3.65318428e-03,
        6.16212282e-03,  4.08808924e-02, -3.03303655e-02, -1.01668630e-02,
        1.08255580e-01, -2.81518395e-03, -6.37972378e-04, -6.16515540e-02,
        4.98988852e-02, -

In [None]:
# Cosine similarity between the query embedding and every training
# embedding; the result is displayed as a torch tensor with one row.
similarities = util.cos_sim(new_embedding,train_embeddings)
similarities

tensor([[0.3125, 0.5333, 0.4555,  ..., 0.3198, 0.4772, 0.2238]])

In [None]:
# `similarities` is a torch tensor, so use the tensor's own argmax rather
# than routing it through NumPy. With no dim given, the returned index
# refers to the flattened tensor, i.e. the position of the best match.
most_similar_idx = similarities.argmax()
most_similar_idx

tensor(13727)

In [None]:
# Fetch the cleaned text of the best-matching training review by position
# and print it next to the query for comparison.
most_similar_review = train_data["user_review"].iloc[most_similar_idx.item()]
print(f"New Review : {new_review}")
print(f"Most Similar Review : {most_similar_review}")

New Review : this game is really amazing great experience
Most Similar Review : i have been playing this game for a little over 2 years now and just watching it progress is amazing this game is very fun and interactive especially when you use curse with the game it creates that much more of an experience


# Step 6 :- More powerful embedding tool and classification model

In [10]:
from xgboost import XGBClassifier

__The XGBoost (Extreme Gradient Boosting) Classifier is one of the most powerful machine learning algorithms for classification tasks. It is an optimized version of Gradient Boosting, widely used in Kaggle competitions, industry applications, and AI research.__

In [11]:
# Load a second pretrained Sentence-Transformers model,
# "paraphrase-mpnet-base-v2", to compare against the first one.
emb_model2=SentenceTransformer("paraphrase-mpnet-base-v2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/594 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [12]:
# Pass plain Python lists, exactly as the first embedding step does, so the
# result does not depend on how encode() indexes a pandas Series (which is
# only safe while the frame keeps its default 0..n-1 index).
train_embeddings2 = emb_model2.encode(train_data["user_review"].tolist())
# NOTE: the name "text_embeddings2" (rather than "test_...") is kept because
# the prediction cell further down reads it under this name.
text_embeddings2 = emb_model2.encode(test_data["user_review"].tolist())

In [13]:
# Gradient-boosted tree classifier: 100 trees, each at most 5 levels deep,
# with a learning rate of 0.1.
classifier_model2=XGBClassifier(n_estimators=100, max_depth=5, learning_rate=0.1)

In [16]:
# Fit the XGBoost classifier on the second model's training embeddings with
# the "user_suggestion" column as the target.
classifier_model2.fit(train_embeddings2,train_data["user_suggestion"])

In [17]:
# Predict labels for the second model's test embeddings and report the
# accuracy against the "user_suggestion" labels.
y_pred2 = classifier_model2.predict(text_embeddings2)
accuracy2 = accuracy_score(y_true=test_data["user_suggestion"], y_pred=y_pred2)
print(f"Classification Accuracy : {accuracy2:.4f}")

Classification Accuracy : 0.8700
