In [1]:
import numpy as np 
import pandas as pd 
import nltk
import re
import string
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

# Loading the dataset

In [2]:
# loading the data set
dataset = pd.read_csv("/kaggle/input/americantrucksimulator/AmericanTruckSimulator.csv")


dataset.count()

Unnamed: 0                        24302
recommendationid                  24302
language                          24302
review                            24236
timestamp_created                 24302
timestamp_updated                 24302
voted_up                          24302
votes_up                          24302
votes_funny                       24302
weighted_vote_score               24302
comment_count                     24302
steam_purchase                    24302
received_for_free                 24302
written_during_early_access       24302
author.steamid                    24302
author.num_games_owned            24302
author.num_reviews                24302
author.playtime_forever           24302
author.playtime_last_two_weeks    24302
author.playtime_at_review         24277
author.last_played                24302
timestamp_dev_responded               2
developer_response                    2
dtype: int64

# Only use the first 8000 rows out of 2400+

In [3]:
#Using only the first 8000 rows
data_8K = dataset.head(8000)


data_8K.count()

Unnamed: 0                        8000
recommendationid                  8000
language                          8000
review                            7968
timestamp_created                 8000
timestamp_updated                 8000
voted_up                          8000
votes_up                          8000
votes_funny                       8000
weighted_vote_score               8000
comment_count                     8000
steam_purchase                    8000
received_for_free                 8000
written_during_early_access       8000
author.steamid                    8000
author.num_games_owned            8000
author.num_reviews                8000
author.playtime_forever           8000
author.playtime_last_two_weeks    8000
author.playtime_at_review         8000
author.last_played                8000
timestamp_dev_responded              0
developer_response                   0
dtype: int64

#                                                           TEXT NORMALIZATION

## Data Cleaning

In [4]:
# dropping those with empty reviews
data_8K = data_8K.dropna(subset=['review'])

# Convert the 'review' column to string
data_8K.loc[:, 'review'] = data_8K['review'].astype(str)


# Convert 'review' column to lowercase
data_8K.loc[:, 'review'] = data_8K.loc[:, 'review'].str.lower()



print(data_8K.loc[:, 'review'].head(60))

0                                                   yes
1     very fun. have a playlist of music to enhance ...
2                                           grat game\n
3     no matter what you choose, enabled/disabled, o...
4     so much fun. sounds boring, but i have found i...
5     very good game but dont play it too much or it...
6     i love the game its fun to play and i love rig...
7                                                   ehh
8     great sim game, there are a few little dislike...
9     relax gameplay, pretty fun and gives you a sen...
10    its a great game but much better with a steeri...
11    love the detail and realism in this game a  mu...
12    ats is one of those games where you can escape...
13    awesome game my only question is where are the...
14         love this game want to drive a truck one day
15              amazing game! buy the heavy cargo pack.
16                            it is a good game to play
17       alright, needs more macks and overall a

### Expanding Contractions

In [5]:
# Expanding contractions

CONTRACTION_MAP = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "I would",
    "i'd've": "I would have",
    "i'll": "I will",
    "i'll've": "I will have",
    "i'm": "I am",
    "i've": "I have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so is",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
}

def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), 
                                      flags=re.IGNORECASE|re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match)\
                                if contraction_mapping.get(match)\
                                else contraction_mapping.get(match.lower())                       
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction
        
    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub('"', '', expanded_text)
    return expanded_text

# Expand contractions
data_8K.loc[:, 'review'] = data_8K.loc[:, 'review'].apply(expand_contractions)

print(data_8K.loc[:, 'review'].head(60))

0                                                   yes
1     very fun. have a playlist of music to enhance ...
2                                           grat game\n
3     no matter what you choose, enabled/disabled, o...
4     so much fun. sounds boring, but i have found i...
5     very good game but dont play it too much or it...
6     i love the game its fun to play and i love rig...
7                                                   ehh
8     great sim game, there are a few little dislike...
9     relax gameplay, pretty fun and gives you a sen...
10    its a great game but much better with a steeri...
11    love the detail and realism in this game a  mu...
12    ats is one of those games where you can escape...
13    awesome game my only question is where are the...
14         love this game want to drive a truck one day
15              amazing game! buy the heavy cargo pack.
16                            it is a good game to play
17       alright, needs more macks and overall a

### Removing Punctations

In [6]:
# Removing punctations

def remove_punct(my_str):
    # define punctuation
    punctuations = '''!()-[]{};:'.'"\,<>./?@#$%^-&_0123456789~*'''
    
    # remove punctuation from the string
    no_punct = ""  
    for char in my_str:
        if char not in punctuations:
            no_punct = no_punct + char
        
    return no_punct


# Apply strip to remove leading and trailing spaces from each string in the ‘review’ column
data_8K.loc[:, 'review'] = data_8K.loc[:, 'review'].str.strip()


# Remove punctuation
data_8K.loc[:, 'review'] = data_8K['review'].apply(remove_punct)





print(data_8K.loc[:, 'review'].head(60))

0                                                   yes
1     very fun have a playlist of music to enhance e...
2                                             grat game
3     no matter what you choose enableddisabled onof...
4     so much fun sounds boring but i have found it ...
5     very good game but dont play it too much or it...
6     i love the game its fun to play and i love rig...
7                                                   ehh
8     great sim game there are a few little dislike ...
9     relax gameplay pretty fun and gives you a sens...
10    its a great game but much better with a steeri...
11    love the detail and realism in this game a  mu...
12    ats is one of those games where you can escape...
13    awesome game my only question is where are the...
14         love this game want to drive a truck one day
15                amazing game buy the heavy cargo pack
16                            it is a good game to play
17          alright needs more macks and overall

### Remove Repeating Characters

In [7]:
# Removing repeating characters
    
def remove_repeating_chars(s):
    return re.sub(r'(.)\1+', r'\1', s)

# Assuming df is your DataFrame and 'column' is the name of the column containing the strings
data_8K.loc[:, 'review'] = data_8K.loc[:, 'review'].apply(remove_repeating_chars)

print(data_8K.loc[:, 'review'].head(60))

0                                                   yes
1     very fun have a playlist of music to enhance e...
2                                             grat game
3     no mater what you chose enabledisabled onof vo...
4     so much fun sounds boring but i have found it ...
5     very god game but dont play it to much or it g...
6     i love the game its fun to play and i love rig...
7                                                    eh
8     great sim game there are a few litle dislike b...
9     relax gameplay prety fun and gives you a sense...
10    its a great game but much beter with a stering...
11    love the detail and realism in this game a mus...
12    ats is one of those games where you can escape...
13    awesome game my only question is where are the...
14         love this game want to drive a truck one day
15                amazing game buy the heavy cargo pack
16                             it is a god game to play
17            alright neds more macks and overal

### Removing Stopwords

In [8]:
# Stopwords file
file_path = '/kaggle/input/stopwords/stopwords.txt'

# Open the file and read the lines
with open(file_path, 'r') as f:
    stop_words = f.readlines()

# Remove any newline characters
stop_words = [word.strip() for word in stop_words]


def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word not in stop_words]
    return " ".join(filtered_words)


# Apply the function to the data set
data_8K.loc[:, 'review'] = data_8K.loc[:, 'review'].apply(remove_stopwords)


print(data_8K.loc[:, 'review'].head(60))

0                                                   yes
1                 fun playlist music enhance experience
2                                             grat game
3     mater chose enabledisabled onof volume wil lis...
4     fun sounds boring found relaxing play using st...
5      god game dont play boring play smal amounts time
6            love game fun play love rigs tel play game
7                                                    eh
8     great sim game litle dislike al love game grap...
9     relax gameplay prety fun gives sense purpose m...
10          great game beter stering whels skrs shifter
11    love detail realism game buy truck simulation ...
12    ats games can escape world imersed can prety s...
13             awesome game only question freightliners
14                       love game want drive truck day
15                    amazing game buy heavy cargo pack
16                                        god game play
17                     alright neds macks overal

## Tokenization

In [9]:
def tokenizeText (text) :
    word_tokens = nltk.word_tokenize(text)
    return word_tokens

# Tokenize the dataset
data_8K.loc[:, 'tokens'] = data_8K['review'].apply(tokenizeText)

print(data_8K.loc[:, 'tokens'].head(60))

0                                                 [yes]
1           [fun, playlist, music, enhance, experience]
2                                          [grat, game]
3     [mater, chose, enabledisabled, onof, volume, w...
4     [fun, sounds, boring, found, relaxing, play, u...
5     [god, game, dont, play, boring, play, smal, am...
6     [love, game, fun, play, love, rigs, tel, play,...
7                                                  [eh]
8     [great, sim, game, litle, dislike, al, love, g...
9     [relax, gameplay, prety, fun, gives, sense, pu...
10    [great, game, beter, stering, whels, skrs, shi...
11    [love, detail, realism, game, buy, truck, simu...
12    [ats, games, can, escape, world, imersed, can,...
13       [awesome, game, only, question, freightliners]
14                [love, game, want, drive, truck, day]
15             [amazing, game, buy, heavy, cargo, pack]
16                                    [god, game, play]
17               [alright, neds, macks, overal, 

### Stemming

In [10]:
# Initialize the stemmer
porter_stemmer = PorterStemmer()


# Stem each word in the tokenized sentence
data_8K['stemmed_tokens'] = data_8K['tokens'].apply(lambda x : [porter_stemmer.stem(y) for y in x])

print(data_8K['stemmed_tokens'].head(60))


0                                                  [ye]
1                [fun, playlist, music, enhanc, experi]
2                                          [grat, game]
3     [mater, chose, enabledis, onof, volum, wil, li...
4     [fun, sound, bore, found, relax, play, use, st...
5     [god, game, dont, play, bore, play, smal, amou...
6     [love, game, fun, play, love, rig, tel, play, ...
7                                                  [eh]
8     [great, sim, game, litl, dislik, al, love, gam...
9     [relax, gameplay, preti, fun, give, sens, purp...
10      [great, game, beter, stere, whel, skr, shifter]
11    [love, detail, realism, game, buy, truck, simu...
12    [at, game, can, escap, world, imers, can, pret...
13           [awesom, game, onli, question, freightlin]
14                [love, game, want, drive, truck, day]
15                [amaz, game, buy, heavi, cargo, pack]
16                                    [god, game, play]
17                    [alright, ned, mack, over,

# TEXT CLASSIFICATION

In [11]:
# Splitting the dataset into train and test

reviews = data_8K['stemmed_tokens'].values
y = data_8K['voted_up'].values

reviews_train, reviews_test, y_train, y_test = train_test_split(reviews,y,test_size=0.30,random_state=101)

In [12]:
# Join the tokens into a single string for each document
reviews_train = [' '.join(doc) for doc in reviews_train]
reviews_test = [' '.join(doc) for doc in reviews_test]


In [13]:
# Count Vectorizer
vectorizer = CountVectorizer()
vectorizer.fit(reviews_train)

X_train = vectorizer.transform(reviews_train)
X_test = vectorizer.transform(reviews_test)

In [14]:
# Create a DataFrame
num_rows = pd.DataFrame({
    'Data Set': ['Training Data', 'Testing Data'],
    'Number of Rows': [X_train.shape[0], X_test.shape[0]]
})

num_rows

Unnamed: 0,Data Set,Number of Rows
0,Training Data,5577
1,Testing Data,2391


In [15]:
# Count of words

word_list = vectorizer.get_feature_names_out()
count_list = np.asarray(X_train.sum(axis=0))[0]

# Create a DataFrame from the feature(reviews) and counts
counts = pd.DataFrame({'Word': word_list, 'Count': count_list})

# Sort the DataFrame by 'Count' in descending order
counts = counts.sort_values('Count', ascending=False)

print(counts)

           Word  Count
2126       game   4842
5451      truck   1938
2243        god   1097
1528      drive   1095
3921       play   1081
...         ...    ...
3314   miniatur      1
843       chate      1
3316    minimum      1
3317  miniworld      1
0         aband      1

[6208 rows x 2 columns]


In [16]:
# Logistic Regression
classifier = LogisticRegression()
classifier.fit(X_train,y_train)

In [17]:
# Prediction & Performance
y_pred = classifier.predict(X_test)
confusion_matrix(y_test, y_pred)
pd.crosstab(y_test, y_pred, rownames = ['Actual'],colnames =['Predicted'], margins = True)

Predicted,False,True,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,2,51,53
True,9,2329,2338
All,11,2380,2391


In [18]:
# Accuracy

score = classifier.score(X_test, y_test)
print("Accuracy:", score)

Accuracy: 0.9749058971141782


In [19]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.18      0.04      0.06        53
        True       0.98      1.00      0.99      2338

    accuracy                           0.97      2391
   macro avg       0.58      0.52      0.52      2391
weighted avg       0.96      0.97      0.97      2391



# End of codes...