<a href="https://colab.research.google.com/github/Mert-Keskin/Makine-renmesi-ve-Derin-renme-Y-ntemleriyle-Metin-S-n-fland-rma-ve-LIME-ile-Yorumlanabilirlik/blob/main/ML_ve_DL_ile_Metin_S%C4%B1n%C4%B1fland%C4%B1rma_ve_LIME_ile_Yorumlanabilirlik.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries

In [None]:
# --- Import Libraries ---
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re
import string
import nltk
import matplotlib.pyplot as plt
import seaborn as sns

from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

nltk.download('stopwords')

# Scraping Functions

In [None]:
# --- Scraping Functions ---

def get_reviews(appid, params=None):
    """Fetch one page of reviews for a Steam app from the store's appreviews endpoint.

    Args:
        appid: Steam application id; it is converted with ``str`` and appended
            to the endpoint URL.
        params: Optional dict of query parameters. Defaults to ``{'json': 1}``.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the request does not complete within 30 seconds.
    """
    # Build the default per call so no mutable dict is shared between calls.
    if params is None:
        params = {'json': 1}
    url = 'https://store.steampowered.com/appreviews/'
    response = requests.get(
        url=url + str(appid),
        params=params,
        headers={'User-Agent': 'Mozilla/5.0'},
        timeout=30,  # never hang the notebook on a stalled connection
    )
    # Fail loudly on HTTP errors instead of trying to JSON-decode an error page.
    response.raise_for_status()
    return response.json()

def get_n_reviews(appid, n=1000):
    """Collect up to ``n`` reviews for a Steam app by following the paging cursor.

    Pages of at most 100 reviews are requested through ``get_reviews`` until
    ``n`` reviews have been asked for, a short page is returned, or the cursor
    is missing or stops advancing.

    Args:
        appid: Steam application id, forwarded to ``get_reviews``.
        n: Maximum number of reviews to request.

    Returns:
        A list with the review entries of every fetched page, in fetch order.
    """
    page_size = 100
    reviews = []
    cursor = '*'
    params = {
        'json': 1,
        'filter': 'all',
        'language': 'english',
        'day_range': 9223372036854775807,
        'review_type': 'all',
        'purchase_type': 'all'
    }

    remaining = n
    while remaining > 0:
        # requests URL-encodes str values itself, so no manual .encode() is needed.
        params['cursor'] = cursor
        params['num_per_page'] = min(page_size, remaining)
        remaining -= page_size

        response = get_reviews(appid, params)
        # A response without a 'reviews' key is treated as an empty page
        # rather than raising KeyError and discarding what was collected.
        page = response.get('reviews', [])
        reviews += page

        next_cursor = response.get('cursor')
        # Stop on a short page (as before), and also when the cursor is absent
        # or unchanged: re-requesting the same cursor would only add duplicates.
        if len(page) < page_size or not next_cursor or next_cursor == cursor:
            break
        cursor = next_cursor

    return reviews

def get_app_id(game_name):
    """Look up a game's Steam app id from the first store search result.

    Args:
        game_name: Search term sent to the Steam store search page.

    Returns:
        The ``data-ds-appid`` attribute of the first element with class
        ``search_result_row``.

    Raises:
        requests.HTTPError: If the search request returns an error status.
        ValueError: If the page has no result row carrying an app id.
    """
    response = requests.get(
        url='https://store.steampowered.com/search/',
        # Passing the term through ``params`` lets requests URL-encode names
        # containing spaces, '&', '#', etc. instead of corrupting the query.
        params={'term': game_name, 'category1': 998},
        headers={'User-Agent': 'Mozilla/5.0'},
        timeout=30,
    )
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    first_result = soup.find(class_='search_result_row')
    # ``find`` returns None when nothing matches; report that explicitly
    # instead of failing with "'NoneType' object is not subscriptable".
    if first_result is None or not first_result.has_attr('data-ds-appid'):
        raise ValueError(f"No Steam search result with an app id found for {game_name!r}")
    return first_result['data-ds-appid']

def scrape_reviews_for_single_game(game_name, n):
    """Scrape up to ``n`` reviews for one game into a DataFrame.

    Args:
        game_name: Name used to look up the app id via ``get_app_id``.
        n: Maximum number of reviews to request via ``get_n_reviews``.

    Returns:
        A DataFrame with the columns ``review`` and ``voted_up``, one row per
        fetched review. The columns are present even when no review was fetched.
    """
    appid = get_app_id(game_name)
    print(f"App ID for {game_name}: {appid}")
    reviews = get_n_reviews(appid, n)

    review_data = [{'review': review['review'], 'voted_up': review['voted_up']} for review in reviews]
    # Naming the columns keeps the schema stable for an empty result, so later
    # steps that index df['review'] do not fail with KeyError.
    df = pd.DataFrame(review_data, columns=['review', 'voted_up'])
    return df

# Preprocessing

In [None]:
# --- Preprocessing ---

# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
# Cache for the English stop-word set; filled on first use.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def preprocess_text(text):
    """Lowercase ``text``, strip punctuation, and drop English stop words.

    Args:
        text: The string to clean.

    Returns:
        The remaining whitespace-separated words joined by single spaces.
    """
    # str.translate removes all of string.punctuation. The previous regex
    # character class was built without escaping, so its "\]" sequence was read
    # as an escaped bracket and backslashes were never removed.
    text = text.lower().translate(_PUNCTUATION_TABLE)
    # Look the stop words up in a cached set instead of calling
    # stopwords.words('english') once for every word.
    stop_words = _english_stopwords()
    return " ".join(word for word in text.split() if word not in stop_words)

def preprocess_reviews(df):
    """Add a cleaned-text column and integer labels to ``df`` in place.

    Args:
        df: DataFrame with a ``review`` column and a ``voted_up`` column.

    Returns:
        The same DataFrame object, now with ``clean_review`` added and
        ``voted_up`` cast to int.
    """
    review_text = df['review'].astype(str)
    df['clean_review'] = review_text.apply(preprocess_text)
    labels = df['voted_up'].astype(int)
    df['voted_up'] = labels
    return df

# Evaluation Function

In [None]:
# 4. MODEL TRAINING AND EVALUATION
# --------------------------------------------
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit a classifier, then print and plot its performance on the test split.

    NOTE: a later cell in this notebook defines another ``evaluate_model`` with
    the signature ``(model, X_test, y_test)``. When running cells out of order,
    re-run this cell before the classical-model loop.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels (0 = negative, 1 = positive).
        y_test: Test labels (0 = negative, 1 = positive).

    Returns:
        None. Accuracy, the classification report and the confusion matrix are
        printed, and the confusion matrix is shown as a heatmap.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Accuracy and Report
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print("Classification Report:\n", classification_report(y_test, y_pred))

    # Confusion Matrix
    # Fixing labels to [0, 1] guarantees a 2x2 matrix even when one class is
    # absent from both y_test and y_pred; otherwise the labelled DataFrame
    # below would fail on a shape mismatch.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    cm_df = pd.DataFrame(cm, index=['Actual Negative (0)', 'Actual Positive (1)'],
                         columns=['Predicted Negative (0)', 'Predicted Positive (1)'])

    print("Confusion Matrix:\n", cm_df)

    plt.figure(figsize=(6, 5))
    sns.heatmap(cm_df, annot=True, fmt='d', cmap='Blues', linewidths=0.5, cbar=False)
    plt.title("Confusion Matrix Heatmap")
    plt.ylabel("Actual Label")
    plt.xlabel("Predicted Label")
    plt.show()

# Main Flow

In [None]:
# 1. Scrape data
# Resolves the app id for 'Halo Infinite' and requests up to 5000 reviews.
# This performs live network calls to the Steam store.
df_reviews = scrape_reviews_for_single_game('Halo Infinite', 5000)

In [None]:
# 2. Preprocess
# preprocess_reviews adds/overwrites columns on the frame it receives and
# returns that same object, so `df` and `df_reviews` refer to one DataFrame
# after this cell.
df = preprocess_reviews(df_reviews)

In [None]:
# Preview the first rows (review, voted_up, clean_review) as a sanity check.
df.head()

# Word Cloud

In [None]:
from wordcloud import WordCloud
# --- Word Cloud ---
def plot_wordcloud(text_data, title="Word Cloud"):
    """Render a word cloud built from an iterable of strings.

    Args:
        text_data: Iterable of strings; they are joined with spaces to form
            the text handed to ``WordCloud``.
        title: Title drawn above the image.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    corpus = " ".join(text_data)
    cloud = WordCloud(width=800, height=400, background_color='white')
    cloud_image = cloud.generate(corpus)

    plt.figure(figsize=(15, 7))
    plt.imshow(cloud_image, interpolation='bilinear')
    plt.title(title, fontsize=20)
    plt.axis('off')
    plt.show()

# Generate one word cloud from all cleaned reviews (positive and negative
# reviews together).
plot_wordcloud(df['clean_review'], title="Most Common Words in Reviews")

In [None]:
# Data Splitting
# Read from the preprocessed frame `df` returned by preprocess_reviews rather
# than from `df_reviews`, so this cell does not depend on preprocess_reviews
# having mutated its input in place.
X_train, X_test, y_train, y_test = train_test_split(df['clean_review'], df['voted_up'], test_size=0.2, random_state=42)

# TF-IDF Vectorization
# The vocabulary and IDF weights are fitted on the training split only and
# then applied unchanged to the test split.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [None]:
# Train and evaluate the classical models on the TF-IDF features.
# This calls the five-argument evaluate_model from the "Evaluation Function"
# section; a later cell redefines evaluate_model with three arguments, so this
# cell must run before that one.
if __name__ == "__main__":

    # Models to Try
    models = {
        "Logistic Regression": LogisticRegression(max_iter=1000),
        "Naive Bayes": MultinomialNB(),
        "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
        "Support Vector Machine (SVM)": SVC(kernel='linear')
    }

    # Train and Evaluate Each Model
    for model_name, model in models.items():
        print("="*50)
        print(f"Training {model_name}...")
        # Fits the model on the training split, then prints the metrics and
        # shows the confusion-matrix heatmap for the test split.
        evaluate_model(model, X_train_tfidf, X_test_tfidf, y_train, y_test)

# Deep learning

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Conv1D, GlobalMaxPooling1D, GRU, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Tokenize and pad sequences
def prepare_dl_data(df, vocab_size=10000, max_len=200):
    """Tokenize ``df['clean_review']`` and pad the sequences to a fixed length.

    NOTE: this function is defined again further down in the same cell; the
    later definition is the one in effect once the cell has run.

    Args:
        df: DataFrame with a ``clean_review`` text column.
        vocab_size: Passed to the tokenizer as ``num_words``.
        max_len: Length every sequence is padded or truncated to.

    Returns:
        A tuple ``(padded_sequences, tokenizer)``.
    """
    tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
    tokenizer.fit_on_texts(df['clean_review'])

    sequences = tokenizer.texts_to_sequences(df['clean_review'])
    # Padding and truncation are both applied at the end of each sequence.
    padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')

    return padded_sequences, tokenizer

# Plot training history
def plot_training_history(history, model_name='Model'):
    """Plot training vs. validation accuracy and loss per epoch, side by side.

    NOTE: this function is defined again further down in the same cell; the
    later definition is the one in effect once the cell has run.

    Args:
        history: Object whose ``history`` dict holds the keys ``accuracy``,
            ``val_accuracy``, ``loss`` and ``val_loss``.
        model_name: Prefix used in both subplot titles.
    """
    acc = history.history['accuracy']
    val_acc = history.history['val_accuracy']
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    # Epochs are numbered from 1 on the x-axis.
    epochs_range = range(1, len(acc) + 1)

    plt.figure(figsize=(14, 5))

    # Left panel: accuracy curves.
    plt.subplot(1, 2, 1)
    plt.plot(epochs_range, acc, label='Train Accuracy')
    plt.plot(epochs_range, val_acc, label='Val Accuracy')
    plt.legend()
    plt.title(f'{model_name} Accuracy')

    # Right panel: loss curves.
    plt.subplot(1, 2, 2)
    plt.plot(epochs_range, loss, label='Train Loss')
    plt.plot(epochs_range, val_loss, label='Val Loss')
    plt.legend()
    plt.title(f'{model_name} Loss')

    plt.show()

# Evaluation
def evaluate_model(model, X_test, y_test):
    """Print accuracy and a classification report and plot the confusion matrix.

    NOTE: this replaces the earlier five-argument ``evaluate_model`` used for
    the classical models, and it is itself defined again further down in the
    same cell; the later definition is the one in effect once the cell has run.

    Args:
        model: Model whose ``predict`` returns one probability per sample.
        X_test: Test inputs passed to ``model.predict``.
        y_test: True binary labels.
    """
    y_pred_probs = model.predict(X_test)
    # Threshold the probabilities at 0.5 and flatten to a 1-D label array.
    y_pred = (y_pred_probs > 0.5).astype(int).flatten()

    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))

    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title("Confusion Matrix")
    plt.show()

# Dynamic model builder
def build_model(model_type, vocab_size, embedding_dim=64, input_length=200):
    """Build and compile a binary text classifier of the requested type.

    NOTE: this function is defined again further down in the same cell; the
    later definition is the one in effect once the cell has run.

    Args:
        model_type: One of ``'LSTM'``, ``'BiLSTM'``, ``'GRU'`` or ``'CNN'``.
            Any other value adds no layer between the embedding and dropout.
        vocab_size: Input dimension of the embedding layer.
        embedding_dim: Output dimension of the embedding layer.
        input_length: Sequence length passed to the embedding layer.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim, input_length=input_length))

    # Sequence encoder selected by model_type.
    if model_type == 'LSTM':
        model.add(LSTM(64))
    elif model_type == 'BiLSTM':
        model.add(Bidirectional(LSTM(64)))
    elif model_type == 'GRU':
        model.add(GRU(64))
    elif model_type == 'CNN':
        model.add(Conv1D(128, 5, activation='relu'))
        model.add(GlobalMaxPooling1D())

    # Shared classification head: dropout followed by a single sigmoid unit.
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Main function to train deep learning model
def train_deep_learning_model(df, model_type='LSTM', vocab_size=10000, max_len=200, embedding_dim=64, epochs=5, batch_size=32):
    """Tokenize, split, train and evaluate one deep-learning model.

    NOTE: this function is defined again further down in the same cell with
    ``epochs=20`` and an EarlyStopping callback; that later definition is the
    one in effect once the cell has run.

    Args:
        df: DataFrame with ``clean_review`` and ``voted_up`` columns.
        model_type: Architecture name forwarded to ``build_model``.
        vocab_size: Tokenizer vocabulary size and embedding input dimension.
        max_len: Padded sequence length.
        embedding_dim: Embedding output dimension.
        epochs: Number of training epochs.
        batch_size: Training batch size.

    Returns:
        The trained model. The tokenizer is not returned.
    """
    padded_sequences, tokenizer = prepare_dl_data(df, vocab_size=vocab_size, max_len=max_len)
    X_train, X_test, y_train, y_test = train_test_split(padded_sequences, df['voted_up'], test_size=0.2, random_state=42)

    model = build_model(model_type, vocab_size, embedding_dim, max_len)
    print(f"Training {model_type}...")
    # The held-out test split doubles as validation data during training.
    history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test), verbose=2)

    plot_training_history(history, model_name=model_type)
    evaluate_model(model, X_test, y_test)
    return model

# Tokenize and pad sequences
def prepare_dl_data(df, vocab_size=10000, max_len=200):
    """Turn the ``clean_review`` column into fixed-length integer sequences.

    Args:
        df: DataFrame with a ``clean_review`` text column.
        vocab_size: Passed to the tokenizer as ``num_words``.
        max_len: Length every sequence is padded or truncated to (at the end).

    Returns:
        A tuple ``(padded_sequences, tokenizer)``.
    """
    texts = df['clean_review']

    tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
    tokenizer.fit_on_texts(texts)

    padded_sequences = pad_sequences(
        tokenizer.texts_to_sequences(texts),
        maxlen=max_len,
        padding='post',
        truncating='post',
    )
    return padded_sequences, tokenizer

# Plot training history
def plot_training_history(history, model_name='Model'):
    """Show training and validation curves for accuracy and loss.

    Args:
        history: Object whose ``history`` dict holds the keys ``accuracy``,
            ``val_accuracy``, ``loss`` and ``val_loss``.
        model_name: Prefix used in both subplot titles.

    Returns:
        None. A two-panel figure is displayed with ``plt.show()``.
    """
    # Read all four curves up front, in the same order as before.
    curves = {
        key: history.history[key]
        for key in ('accuracy', 'val_accuracy', 'loss', 'val_loss')
    }
    epochs_range = range(1, len(curves['accuracy']) + 1)

    # (training key, validation key, training label, validation label, title suffix)
    panels = (
        ('accuracy', 'val_accuracy', 'Train Accuracy', 'Val Accuracy', 'Accuracy'),
        ('loss', 'val_loss', 'Train Loss', 'Val Loss', 'Loss'),
    )

    plt.figure(figsize=(14, 5))
    for position, (train_key, val_key, train_label, val_label, suffix) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(epochs_range, curves[train_key], label=train_label)
        plt.plot(epochs_range, curves[val_key], label=val_label)
        plt.legend()
        plt.title(f'{model_name} {suffix}')

    plt.show()

# Evaluation
def evaluate_model(model, X_test, y_test):
    """Report test metrics for a model that outputs one probability per sample.

    Args:
        model: Model whose ``predict`` returns probabilities.
        X_test: Test inputs passed to ``model.predict``.
        y_test: True binary labels.

    Returns:
        None. Accuracy and the classification report are printed and the
        confusion matrix is shown as a heatmap.
    """
    probabilities = model.predict(X_test)
    # Probabilities above 0.5 count as the positive class.
    predicted_labels = (probabilities > 0.5).astype(int).flatten()

    accuracy = accuracy_score(y_test, predicted_labels)
    print("Accuracy:", accuracy)
    report = classification_report(y_test, predicted_labels)
    print("Classification Report:\n", report)

    sns.heatmap(
        confusion_matrix(y_test, predicted_labels),
        annot=True,
        fmt='d',
        cmap='Blues',
    )
    plt.title("Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()

# Dynamic model builder
def build_model(model_type, vocab_size, embedding_dim=64, input_length=200):
    """Build and compile a binary text classifier of the requested type.

    Args:
        model_type: One of ``'LSTM'``, ``'BiLSTM'``, ``'GRU'`` or ``'CNN'``.
        vocab_size: Input dimension of the embedding layer.
        embedding_dim: Output dimension of the embedding layer.
        input_length: Length of the padded input sequences.

    Returns:
        The compiled ``Sequential`` model (binary cross-entropy, Adam,
        accuracy metric).

    Raises:
        ValueError: If ``model_type`` is not a supported architecture.
    """
    supported_types = ('LSTM', 'BiLSTM', 'GRU', 'CNN')
    # Reject unknown types up front. Previously they fell through every branch
    # and produced a model with no sequence encoder.
    if model_type not in supported_types:
        raise ValueError(
            f"Unsupported model_type {model_type!r}; expected one of {supported_types}"
        )

    model = Sequential()
    # Declare the input shape with an explicit Input instead of the deprecated
    # `input_length` argument of Embedding.
    model.add(tf.keras.Input(shape=(input_length,)))
    model.add(Embedding(vocab_size, embedding_dim))

    # Sequence encoder selected by model_type.
    if model_type == 'LSTM':
        model.add(LSTM(64))
    elif model_type == 'BiLSTM':
        model.add(Bidirectional(LSTM(64)))
    elif model_type == 'GRU':
        model.add(GRU(64))
    else:  # 'CNN'
        model.add(Conv1D(128, 5, activation='relu'))
        model.add(GlobalMaxPooling1D())

    # Shared classification head: dropout followed by a single sigmoid unit.
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Main function to train deep learning model
def train_deep_learning_model(df, model_type='LSTM', vocab_size=10000, max_len=200, embedding_dim=64, epochs=20, batch_size=32, validation_split=0.1):
    """Tokenize, split, train (with early stopping) and evaluate one model.

    Args:
        df: DataFrame with ``clean_review`` and ``voted_up`` columns.
        model_type: Architecture name forwarded to ``build_model``.
        vocab_size: Tokenizer vocabulary size and embedding input dimension.
        max_len: Padded sequence length.
        embedding_dim: Embedding output dimension.
        epochs: Maximum number of training epochs.
        batch_size: Training batch size.
        validation_split: Fraction of the training split held out for
            validation and early stopping.

    Returns:
        The trained model. The tokenizer is not returned.
    """
    padded_sequences, tokenizer = prepare_dl_data(df, vocab_size=vocab_size, max_len=max_len)
    # Plain NumPy labels so Keras can slice them for validation_split.
    labels = df['voted_up'].to_numpy()
    X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

    model = build_model(model_type, vocab_size, embedding_dim, max_len)
    print(f"Training {model_type}...")

    # Early stopping
    early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

    # Validate on a slice of the training data. Using the test split here would
    # let early stopping and restore_best_weights pick the epoch that looks
    # best on the test set, inflating the metrics reported below.
    history = model.fit(
        X_train, y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_split=validation_split,
        callbacks=[early_stop],
        verbose=2
    )

    plot_training_history(history, model_name=model_type)
    # The test split is touched only here, after training has finished.
    evaluate_model(model, X_test, y_test)
    return model

In [None]:
# Train LSTM (all other hyperparameters left at train_deep_learning_model's defaults)
lstm_model = train_deep_learning_model(df, model_type='LSTM')

In [None]:
# Train CNN (Conv1D followed by global max pooling, see build_model)
cnn_model = train_deep_learning_model(df, model_type='CNN')

In [None]:
# Train GRU (all other hyperparameters left at train_deep_learning_model's defaults)
gru_model = train_deep_learning_model(df, model_type='GRU')

In [None]:
# Train BiLSTM; `bilstm_model` is the model later wrapped by lime_predict.
bilstm_model = train_deep_learning_model(df, model_type='BiLSTM')

# Sentiment Analysis

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from textblob import TextBlob
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, Concatenate, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# --- Assume your df is already loaded and cleaned ---
# If not, load or clean your reviews dataframe here

# --- Sentiment Analysis ---
def get_sentiment_score(text):
    """Reduce TextBlob's polarity for ``text`` to its sign.

    Args:
        text: The string to analyse.

    Returns:
        ``1`` for positive polarity, ``-1`` for negative polarity, ``0`` otherwise.
    """
    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0:
        return 1
    return -1 if polarity < 0 else 0

# Adds a `sentiment_score` column to `df` in place. The score is computed from
# the raw `review` text, not from `clean_review`.
df['sentiment_score'] = df['review'].astype(str).apply(get_sentiment_score)

# --- Tokenization ---
# These assignments rebind the notebook-level names `max_len` and `tokenizer`.
max_words = 10000
max_len = 200

# This tokenizer is configured without an oov_token, unlike the one built in
# prepare_dl_data, so do not assume its word indices match that tokenizer's.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df['clean_review'])

sequences = tokenizer.texts_to_sequences(df['clean_review'])
# No padding/truncating arguments are given here, so pad_sequences' defaults apply.
padded_sequences = pad_sequences(sequences, maxlen=max_len)

X_text = padded_sequences
X_sentiment = df['sentiment_score'].values
y = df['voted_up'].values

# --- Train/Test Split ---
# Splitting all three arrays in one call keeps their rows aligned.
# Note: this overwrites the `y_train` / `y_test` created for the TF-IDF models.
X_text_train, X_text_test, X_sent_train, X_sent_test, y_train, y_test = train_test_split(
    X_text, X_sentiment, y, test_size=0.2, random_state=42)

# --- Model Definition ---
# Two-input functional model: a BiLSTM encoding of the token sequence is
# concatenated with the scalar sentiment score before the sigmoid output.
embedding_dim = 64

# Input 1: Text
text_input = Input(shape=(max_len,), name='text_input')
x = Embedding(input_dim=max_words, output_dim=embedding_dim)(text_input)
x = Bidirectional(LSTM(64))(x)

# Input 2: Sentiment
# One value per sample (the -1 / 0 / 1 score from get_sentiment_score).
sent_input = Input(shape=(1,), name='sentiment_input')

# Combine
combined = Concatenate()([x, sent_input])
combined = Dropout(0.3)(combined)
output = Dense(1, activation='sigmoid')(combined)

# Note: this rebinds the notebook-level name `model`.
model = Model(inputs=[text_input, sent_input], outputs=output)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

# --- Training ---
# Stop once val_loss has not improved for 3 epochs and restore the best weights.
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Inputs are passed as a dict keyed by the Input layer names defined above.
# Validation data is carved out of the training split (validation_split), so
# the test split is not used during training.
history = model.fit(
    {'text_input': X_text_train, 'sentiment_input': X_sent_train},
    y_train,
    validation_split=0.2,
    epochs=10,
    batch_size=64,
    callbacks=[early_stop]
)

# --- Evaluation ---
# model.evaluate returns the loss followed by the compiled metric (accuracy).
loss, accuracy = model.evaluate(
    {'text_input': X_text_test, 'sentiment_input': X_sent_test}, y_test)
print(f"\nTest Accuracy with Sentiment Feature: {accuracy:.4f}")

# LIME

In [None]:
!pip install lime

In [None]:
from lime.lime_text import LimeTextExplainer
import numpy as np

In [None]:
class_names = ['Negative', 'Positive']  # Assuming binary sentiment
max_len = 200  # Must match the sequence length bilstm_model was trained with

# bilstm_model was trained on sequences from the tokenizer that
# prepare_dl_data builds (with an '<OOV>' token). The notebook-level
# `tokenizer` comes from the sentiment cell and is configured differently, so
# using it here would feed the model mismatched word indices. Rebuild the
# matching tokenizer from the same data and default vocabulary size instead.
_bilstm_sequences, bilstm_tokenizer = prepare_dl_data(df, max_len=max_len)
del _bilstm_sequences  # only the tokenizer is needed here


def lime_predict(texts):
    """Return class probabilities from ``bilstm_model`` for a batch of texts.

    Args:
        texts: Iterable of strings (LIME passes perturbed versions of the
            explained sample).

    Returns:
        An array with one row per text and two columns:
        ``[prob_negative, prob_positive]``.
    """
    sequences = bilstm_tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')
    probs = bilstm_model.predict(padded)
    # The model outputs P(positive) as a column; prepend its complement.
    return np.hstack((1 - probs, probs))


In [None]:
explainer = LimeTextExplainer(class_names=class_names)

# Pick a sample review
idx = 0  # index of the test sample
# X_test comes from the TF-IDF split of the `clean_review` column, so this is
# preprocessed text (lowercased, punctuation and stop words removed), not the
# raw review.
sample_text = X_test.iloc[idx]

# Explain prediction
# Reports the 10 words with the largest influence on lime_predict's output.
exp = explainer.explain_instance(sample_text, lime_predict, num_features=10)

# Show explanation in notebook
# NOTE(review): `text` appears to be used by LIME as a truthy flag rather than
# as the text itself; confirm whether `text=True` was intended.
exp.show_in_notebook(text=sample_text)


In [None]:
# Visualize explanation
# Bar chart of the word weights for label index 1 ('Positive' in class_names).
fig = exp.as_pyplot_figure(label=1)
# lime_predict wraps bilstm_model (text only), not the BiLSTM+sentiment model,
# so the title names the model that was actually explained.
plt.title("LIME Explanation for BiLSTM Model")
plt.tight_layout()
plt.show()