In [1]:
import numpy as np 
import pandas as pd 

import nltk
import re
import string

from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

# Loading the dataset

In [2]:
# loading the data set
dataset = pd.read_csv("/kaggle/input/americantrucksimulator/AmericanTruckSimulator.csv")


dataset.count()

Unnamed: 0                        24302
recommendationid                  24302
language                          24302
review                            24236
timestamp_created                 24302
timestamp_updated                 24302
voted_up                          24302
votes_up                          24302
votes_funny                       24302
weighted_vote_score               24302
comment_count                     24302
steam_purchase                    24302
received_for_free                 24302
written_during_early_access       24302
author.steamid                    24302
author.num_games_owned            24302
author.num_reviews                24302
author.playtime_forever           24302
author.playtime_last_two_weeks    24302
author.playtime_at_review         24277
author.last_played                24302
timestamp_dev_responded               2
developer_response                    2
dtype: int64

# Only use the 6000+ rows out of 24000+

In [3]:
# Count how may true and false row in voted_up col

dataset['voted_up'] = dataset['voted_up'].astype(str)

print("True: ",sum(dataset['voted_up'] == 'True'))
print("False: ",sum(dataset['voted_up'] == 'False'))

True:  23343
False:  959


In [4]:
# Only use 6K+ col with 5500+ for true and 900+ for false based on voted_up col


true = dataset[dataset['voted_up'] == 'True'].sample(n=5500, random_state=2)
false = dataset[dataset['voted_up'] == 'False'].sample(n=959)

data_6K= pd.concat([true, false])

print("Number of rows: ", data_6K.shape[0])


Number of rows:  6459


#                                                           TEXT NORMALIZATION

## Data Cleaning

In [5]:
# dropping those with empty reviews
data_6K = data_6K.dropna(subset=['review'])

# Convert the 'review' column to string
data_6K.loc[:, 'review'] = data_6K['review'].astype(str)


# Convert 'review' column to lowercase
data_6K.loc[:, 'review'] = data_6K.loc[:, 'review'].str.lower()



print(data_6K.loc[:, 'review'].head(60))

5159                               it is a relly good game
20828    i have quite a bit of history with truck drivi...
12206                                                   ;]
516                                                  great
754      this game is amazing, easily moddable, awesome...
16425              looks amazing and it feels like oregon.
11846                               best game for relaxing
11656                            awesome game and is cheap
3364     i absolutely love the game. from customization...
2158                             very fun to drive around.
1411     if you like simulators then you will definitel...
18028    it surprises me how soothing a game like this ...
13313                                         amazing game
9608     for those people who loves the thought of bein...
5521                       it is fun ! and always updates.
21294    played as an afro-american but didn´t get shot...
22374    fantastic  \n\nsure theres alot missing but it.

In [6]:
# No of rows after removing the rows with empty reviews
print("Number of rows: ", data_6K.shape[0])

Number of rows:  6446


### Expanding Contractions

In [7]:
# Expanding contractions

CONTRACTION_MAP = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "I would",
    "i'd've": "I would have",
    "i'll": "I will",
    "i'll've": "I will have",
    "i'm": "I am",
    "i've": "I have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so is",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
}

def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), 
                                      flags=re.IGNORECASE|re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match)\
                                if contraction_mapping.get(match)\
                                else contraction_mapping.get(match.lower())                       
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction
        
    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub('"', '', expanded_text)
    return expanded_text

# Expand contractions
data_6K.loc[:, 'review'] = data_6K.loc[:, 'review'].apply(expand_contractions)

print(data_6K.loc[:, 'review'].head(60))

5159                               it is a relly good game
20828    i have quite a bit of history with truck drivi...
12206                                                   ;]
516                                                  great
754      this game is amazing, easily moddable, awesome...
16425              looks amazing and it feels like oregon.
11846                               best game for relaxing
11656                            awesome game and is cheap
3364     i absolutely love the game. from customization...
2158                             very fun to drive around.
1411     if you like simulators then you will definitel...
18028    it surprises me how soothing a game like this ...
13313                                         amazing game
9608     for those people who loves the thought of bein...
5521                       it is fun ! and always updates.
21294    played as an afro-american but didn´t get shot...
22374    fantastic  \n\nsure theres alot missing but it.

### Removing Punctations

In [8]:
# Removing punctations

def remove_punct(my_str):
    # define punctuation
    punctuations = '''!()-[]{};:'.'"\,<>./?@#$%^-&_0123456789~*'''
    
    # remove punctuation from the string
    no_punct = ""  
    for char in my_str:
        if char not in punctuations:
            no_punct = no_punct + char
        
    return no_punct


# Apply strip to remove leading and trailing spaces from each string in the ‘review’ column
data_6K.loc[:, 'review'] = data_6K.loc[:, 'review'].str.strip()


# Remove punctuation
data_6K.loc[:, 'review'] = data_6K['review'].apply(remove_punct)





print(data_6K.loc[:, 'review'].head(60))

5159                               it is a relly good game
20828    i have quite a bit of history with truck drivi...
12206                                                     
516                                                  great
754      this game is amazing easily moddable awesome t...
16425               looks amazing and it feels like oregon
11846                               best game for relaxing
11656                            awesome game and is cheap
3364     i absolutely love the game from customization ...
2158                              very fun to drive around
1411     if you like simulators then you will definitel...
18028    it surprises me how soothing a game like this ...
13313                                         amazing game
9608     for those people who loves the thought of bein...
5521                         it is fun  and always updates
21294    played as an afroamerican but didn´t get shot ...
22374    fantastic  \n\nsure theres alot missing but it.

### Remove Repeating Characters

In [9]:
# Removing repeating characters
    
def remove_repeating_chars(s):
    return re.sub(r'(.)\1+', r'\1', s)

# Assuming df is your DataFrame and 'column' is the name of the column containing the strings
data_6K.loc[:, 'review'] = data_6K.loc[:, 'review'].apply(remove_repeating_chars)

print(data_6K.loc[:, 'review'].head(60))

5159                                 it is a rely god game
20828    i have quite a bit of history with truck drivi...
12206                                                     
516                                                  great
754      this game is amazing easily modable awesome to...
16425                 loks amazing and it fels like oregon
11846                               best game for relaxing
11656                            awesome game and is cheap
3364     i absolutely love the game from customization ...
2158                              very fun to drive around
1411     if you like simulators then you wil definitely...
18028    it surprises me how sothing a game like this c...
13313                                         amazing game
9608     for those people who loves the thought of bein...
5521                          it is fun and always updates
21294    played as an afroamerican but didn´t get shot ...
22374    fantastic \n\nsure theres alot mising but it j.

### Removing Stopwords

In [10]:
# Stopwords file
file_path = '/kaggle/input/stopwords/stopwords.txt'

# Open the file and read the lines
with open(file_path, 'r') as f:
    stop_words = f.readlines()

# Remove any newline characters
stop_words = [word.strip() for word in stop_words]


def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word not in stop_words]
    return " ".join(filtered_words)


# Apply the function to the data set
data_6K.loc[:, 'review'] = data_6K.loc[:, 'review'].apply(remove_stopwords)


print(data_6K.loc[:, 'review'].head(60))

5159                                         rely god game
20828    bit history truck driving simulators days scs ...
12206                                                     
516                                                  great
754               game amazing easily modable awesome play
16425                        loks amazing fels like oregon
11846                                   best game relaxing
11656                                   awesome game cheap
3364     absolutely love game customization options tru...
2158                                             fun drive
1411              like simulators wil definitely enjoy sim
18028                surprises sothing game like can times
13313                                         amazing game
9608     people who loves thought being truck driver so...
5521                                           fun updates
21294    played afroamerican didn´t shot police lack re...
22374    fantastic sure theres alot mising just came be.

## Tokenization

In [11]:
def tokenizeText (text) :
    word_tokens = nltk.word_tokenize(text)
    return word_tokens

# Tokenize the dataset
data_6K.loc[:, 'tokens'] = data_6K['review'].apply(tokenizeText)

print(data_6K.loc[:, 'tokens'].head(60))

5159                                     [rely, god, game]
20828    [bit, history, truck, driving, simulators, day...
12206                                                   []
516                                                [great]
754        [game, amazing, easily, modable, awesome, play]
16425                  [loks, amazing, fels, like, oregon]
11846                               [best, game, relaxing]
11656                               [awesome, game, cheap]
3364     [absolutely, love, game, customization, option...
2158                                          [fun, drive]
1411       [like, simulators, wil, definitely, enjoy, sim]
18028         [surprises, sothing, game, like, can, times]
13313                                      [amazing, game]
9608     [people, who, loves, thought, being, truck, dr...
5521                                        [fun, updates]
21294    [played, afroamerican, didn´t, shot, police, l...
22374    [fantastic, sure, theres, alot, mising, just, .

### Stemming

In [12]:
# Initialize the stemmer
porter_stemmer = PorterStemmer()


# Stem each word in the tokenized sentence
data_6K['stemmed_tokens'] = data_6K['tokens'].apply(lambda x : [porter_stemmer.stem(y) for y in x])

print(data_6K['stemmed_tokens'].head(60))


5159                                     [reli, god, game]
20828    [bit, histori, truck, drive, simul, day, sc, s...
12206                                                   []
516                                                [great]
754             [game, amaz, easili, modabl, awesom, play]
16425                       [lok, amaz, fel, like, oregon]
11846                                  [best, game, relax]
11656                                [awesom, game, cheap]
3364     [absolut, love, game, custom, option, truck, s...
2158                                          [fun, drive]
1411               [like, simul, wil, definit, enjoy, sim]
18028               [surpris, soth, game, like, can, time]
13313                                         [amaz, game]
9608     [peopl, who, love, thought, be, truck, driver,...
5521                                          [fun, updat]
21294    [play, afroamerican, didn´t, shot, polic, lack...
22374    [fantast, sure, there, alot, mise, just, came,.

# TEXT CLASSIFICATION

In [13]:
# Splitting the dataset into train and test

reviews = data_6K['stemmed_tokens'].values
y = data_6K['voted_up'].values


# Split 30% for test set, use stratified for same proportion of true and false from train and test
reviews_train, reviews_test, y_train, y_test = train_test_split(reviews,y,test_size=0.30,stratify=y,random_state=101)

In [14]:
# Join the tokens into a single string for each document
reviews_train = [' '.join(doc) for doc in reviews_train]
reviews_test = [' '.join(doc) for doc in reviews_test]


In [15]:
# Count Vectorizer
vectorizer = CountVectorizer(min_df=3)
vectorizer.fit(reviews_train)

# Make a matrix of words and their counts
X_train = vectorizer.transform(reviews_train)
X_test = vectorizer.transform(reviews_test)

In [16]:
# Number of rows for traing and test sets
num_rows = pd.DataFrame({
    'Data Set': ['Training Data', 'Testing Data'],
    'Number of Rows': [X_train.shape[0], X_test.shape[0]]
})

num_rows

Unnamed: 0,Data Set,Number of Rows
0,Training Data,4512
1,Testing Data,1934


In [17]:
# Get the features and count the words

word_list = vectorizer.get_feature_names_out()
count_list = np.asarray(X_train.sum(axis=0))[0]

# Create a DataFrame from the feature(reviews) and counts
counts = pd.DataFrame({'Word': word_list, 'Count': count_list})

# Sort the DataFrame by 'Count' in descending order
counts = counts.sort_values('Count', ascending=False)

counts

Unnamed: 0,Word,Count
882,game,5626
2226,truck,2837
1234,like,1294
1577,play,1228
616,drive,1180
...,...,...
1965,skipe,3
733,expert,3
1425,nich,3
743,extremli,3


In [18]:
# Using RandomUnderSampler to handle imbalance of class
# Using Random forest to train the data with balanced class weight

classifier = Pipeline([
    ('RandomUnderSampler', RandomUnderSampler()),
    ('clf', RandomForestClassifier(class_weight='balanced'))
])

# Fit the model to your training data
classifier.fit(X_train, y_train)

In [19]:
# Prediction & Performance
y_pred = classifier.predict(X_test)
confusion_matrix(y_test, y_pred)
pd.crosstab(y_test, y_pred, rownames = ['Actual'],colnames =['Predicted'], margins = True)

Predicted,False,True,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,241,47,288
True,388,1258,1646
All,629,1305,1934


In [20]:
# Accuracy

score = classifier.score(X_test, y_test)
print("Accuracy:", score)

Accuracy: 0.7750775594622544


In [21]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.38      0.84      0.53       288
        True       0.96      0.76      0.85      1646

    accuracy                           0.78      1934
   macro avg       0.67      0.80      0.69      1934
weighted avg       0.88      0.78      0.80      1934



# End of codes...