In [4]:
# PY libraries
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords

In [None]:
# Load the reviews from SteamReviews.csv (path is relative to the working directory)
steam = pd.read_csv(r'SteamReviews.csv')
# Show the first rows as the cell output
steam.head()

In [None]:
# Cast the id / score / vote columns to int
steam = steam.astype({'app_id': 'int', 'review_score': 'int', 'review_votes': 'int'})
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" to the output.
steam.info()

In [None]:
# Drop nulls
# Per-column null counts before the cleanup
print(steam.isnull().sum(axis=0))
# Reassign instead of mutating in place so the step reads as "old frame -> new frame"
steam = steam.dropna()
# info() prints its own report and returns None, so it is not wrapped in print()
steam.info()
# Per-column null counts after the cleanup
print(steam.isnull().sum(axis=0))

In [None]:
# Change -1 to 0 in 'review_score'
# A boolean mask replaces the row-by-row Python loop: one vectorised
# assignment instead of one scalar .loc lookup/write per row.
negative_mask = steam["review_score"] == -1
steam.loc[negative_mask, "review_score"] = 0
# Number of rows that were recoded (same value the loop used to count)
counter = int(negative_mask.sum())
print(counter)

In [None]:
# Add 'letters_count': the length (len) of each review text
steam['letters_count'] = steam['review_text'].map(len)
steam.head()

In [None]:
# New dataframe with filtered data by comment size:
# keep rows with letters_count > 10 that are also either >= 25 characters
# long or have review_votes == 1, then order them by length.
longer_than_ten = steam['letters_count'] > 10
long_or_voted = (steam['letters_count'] >= 25) | (steam['review_votes'] == 1)
newSteam = steam[longer_than_ten & long_or_voted].sort_values('letters_count')
newSteam.head()

In [2]:
# NEW FILE

# newSteam.to_csv('steam1.csv')
# Read steam1.csv (presumably the file produced by the commented-out to_csv
# above), discard the 'Unnamed: 0' column and order the rows by app_id.
steam = (
    pd.read_csv(r'steam1.csv')
    .drop('Unnamed: 0', axis=1)
    .sort_values("app_id")
)
steam.head()

Unnamed: 0,app_id,app_name,review_text,review_score,review_votes,letters_count
3899230,10,Counter-Strike,The classic. Although the graphics are an eyes...,1,0,439
218503,10,Counter-Strike,one of the best fps game!,1,0,25
884988,10,Counter-Strike,"The game that started it all, good for slower ...",1,0,50
218492,10,Counter-Strike,Full hacks everywheres ..,0,1,25
368001,10,Counter-Strike,THE BEST GAME ON THE WORLD !!!,1,0,30


In [3]:
# Set lower case
# Remove links
# Remove nums and symbols

def cleanSymbols(text):
    """Return `text` with every run of characters that are neither ASCII
    letters (a-z, A-Z) nor whitespace removed."""
    return re.sub(r"[^a-zA-Z\s]+", "", text)

def cleanLinks(text):
    """Remove URLs from `text`.

    Two passes are applied:
      1. anything starting with "http" and running up to the next whitespace
         (covers http:// and https:// links);
      2. any remaining "<scheme>://..." token (e.g. ftp://host/file).

    The surrounding whitespace is left untouched, so a removed link leaves
    the spaces that were around it.

    Parameters:
        text (str): input string.

    Returns:
        str: `text` with the matched links deleted.
    """
    # \S+ (non-whitespace) is required here: the previous pattern used \s+,
    # which matched "http" followed by spaces and never matched a real URL.
    res = re.sub(r"http\S+", "", text)
    # \w+ / \S+ with backslashes: without them the pattern only matched
    # literal 'w' and 's' characters around "://".
    res = re.sub(r"\w+://\S+", "", res)
    return res

# Lower-case the review text, then strip links first and symbols second
# (links must go first, while their punctuation is still present).
lowered_reviews = steam['review_text'].astype(str).str.lower()
steam['review_text'] = lowered_reviews.apply(cleanLinks).apply(cleanSymbols)

steam.head()

Unnamed: 0,app_id,app_name,review_text,review_score,review_votes,letters_count
3899230,10,Counter-Strike,the classic although the graphics are an eyeso...,1,0,439
218503,10,Counter-Strike,one of the best fps game,1,0,25
884988,10,Counter-Strike,the game that started it all good for slower pcs,1,0,50
218492,10,Counter-Strike,full hacks everywheres,0,1,25
368001,10,Counter-Strike,the best game on the world,1,0,30


In [21]:
# Download the NLTK 'stopwords' resource (`stopwords` is imported from nltk.corpus in the imports cell)
nltk.download('stopwords')

Unnamed: 0,app_id,app_name,review_text,review_score,review_votes,letters_count
0,111900,Guardians of Middle-earth,smooth game,1,1,11
1,384190,ABZÛ,great game,1,1,11
2,298260,Only If,its a meme,0,1,11
3,218620,PAYDAY 2,nice game,1,1,11
4,287290,Resident Evil Revelations 2,is rly good,1,1,11


In [None]:
# Visualise the pairwise correlations of `dataset` as an annotated heatmap
fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(dataset.corr(), annot=True, cmap='Greens', ax=ax)
plt.show()

In [None]:
# Drop the multicollinear feature ('share') together with the features
# with very low correlation ('age', 'months') in a single call
dataset = dataset.drop(columns=['share', 'age', 'months'])

# Plot heatmap of the remaining columns
fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(dataset.corr(), annot=True, cmap='Greens', ax=ax)
plt.show()

In [None]:
# Split dataset: 'card' is the target, every other column is a feature
y = dataset['card']
X = dataset.drop(columns='card')

# 80 % train / 20 % test, with a fixed random_state
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=100,
)

X_train.head()

In [None]:
# Scaling: the scaler is fitted on the training split only and then
# applied to both splits
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Try different parameters
# Candidate values for the regularisation parameter C
params = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
}

# Grid search over `params` with 5-fold cross-validation
logistic_model = GridSearchCV(
    estimator=LogisticRegression(random_state=0, multi_class='ovr'),
    param_grid=params,
    cv=5,
    n_jobs=-1,
    verbose=1
)

# Train Logistic Regression
logistic_model.fit(X_train, y_train)

# Making predictions
logistic_predictions = logistic_model.predict(X_test)

# Calculate accuracy and print result.
# accuracy_score is declared as (y_true, y_pred); the arguments are passed in
# that order, matching the confusion_matrix call below (the value is the same
# either way, since accuracy is symmetric).
accuracy = accuracy_score(y_test, logistic_predictions) * 100
print('Accuracy of the model is {:.2f}'.format(accuracy))
print(logistic_model.best_params_)

# Display confusion matrix
cm = confusion_matrix(y_test, logistic_predictions)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()

# Save the plotted confusion matrix to disk
plt.savefig('logistic_confusion.png', dpi=300)

In [None]:
# Try different parameters
# Candidate values for the regularisation parameter C
params = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
}

# Grid search over `params` with 5-fold cross-validation
svm_model = GridSearchCV(
    estimator=svm.LinearSVC(random_state=100, max_iter=1000000),
    param_grid=params,
    cv=5,
    n_jobs=-1,
    verbose=1
)

# Train SVM
svm_model.fit(X_train, y_train)

# Make predictions
svm_prediction = svm_model.predict(X_test)

# Calculate accuracy and print result.
# accuracy_score is declared as (y_true, y_pred); the arguments are passed in
# that order, matching the confusion_matrix call below (the value is the same
# either way, since accuracy is symmetric).
accuracy = accuracy_score(y_test, svm_prediction) * 100
print('Accuracy of the model is {:.2f}'.format(accuracy))
print(svm_model.best_params_)

# Display confusion matrix
cm = confusion_matrix(y_test, svm_prediction)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()

# Save the plotted confusion matrix to disk
plt.savefig('svm_confusion.png', dpi=300)

In [None]:
# Try different parameters
# Odd neighbour counts 1..13, two values of p and both weighting schemes
params = {
    'n_neighbors':  range(1, 15, 2),
    'p': [1, 2],
    'weights': ['uniform', 'distance']
}


# Grid search over `params` with 5-fold cross-validation
knn_model = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=params,
    cv=5,
    n_jobs=-1,
    verbose=1
)

# Train KNN
knn_model.fit(X_train, y_train)

# Make prediction
knn_prediction = knn_model.predict(X_test)

# Calculate accuracy and print result.
# accuracy_score is declared as (y_true, y_pred); the arguments are passed in
# that order, matching the confusion_matrix call below (the value is the same
# either way, since accuracy is symmetric).
accuracy = accuracy_score(y_test, knn_prediction) * 100
print('Accuracy of the model is {:.2f}'.format(accuracy))
print(knn_model.best_params_)

# Display confusion matrix
cm = confusion_matrix(y_test, knn_prediction)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()

# Save the plotted confusion matrix to disk
plt.savefig('knn_confusion.png', dpi=300)