# Train Model Notebook
Bu dosya veri setini yükler, önişleme yapar ve modelleri eğitir.

In [2]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from scipy.sparse import hstack
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [3]:
# Veri setini yükle
file_path = 'data/fake_reviews_dataset2.csv'  # dosya adını kontrol et!
dfall = pd.read_csv(file_path)
df = dfall.sample(frac=0.6, random_state=42)  
df['label'] = df['label'].map({'CG': 0, 'OR': 1})
df = df.rename(columns={'text_': 'review'})

In [4]:
# Temizleme ve öznitelik çıkarımı
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-z\s]', '', text)
    return text

df['clean_review'] = df['review'].apply(clean_text)
df['review_length'] = df['review'].apply(lambda x: len(str(x)))
df['exclamation_count'] = df['review'].apply(lambda x: str(x).count('!'))
df['capital_word_ratio'] = df['review'].apply(lambda x: len([w for w in str(x).split() if w.isupper()]) / (len(str(x).split()) + 1e-5))

In [5]:
# TF-IDF + Sayısal birleştirme
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X_tfidf = tfidf.fit_transform(df['clean_review'])
X_numeric = df[['review_length', 'exclamation_count', 'capital_word_ratio']].values
X_combined = hstack([X_tfidf, X_numeric])
y = df['label'].values
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42, stratify=y)

In [6]:

# X_test ve y_test'i kaydet
joblib.dump(X_test, 'models/X_test_combined_dataset2.pkl')
joblib.dump(y_test, 'models/y_test_labels_dataset2.pkl')


['models/y_test_labels_dataset2.pkl']

In [7]:

# TF-IDF vektörleştiricisini kaydet
joblib.dump(tfidf, 'models/tfidf_vectorizer_dataset2.pkl')


['models/tfidf_vectorizer_dataset2.pkl']

In [8]:
# [Yorumlandı - optimize edilmemiş model]
# # Model eğitimi
# os.makedirs('modeller', exist_ok=True)
# nb_model = MultinomialNB().fit(X_train, y_train)
# joblib.dump(nb_model, 'modeller/naive_bayes_model_dataset2.pkl')
# svm_model = LinearSVC().fit(X_train, y_train)
# joblib.dump(svm_model, 'modeller/svm_model_dataset2.pkl')
# mlp_model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=42).fit(X_train, y_train)
# joblib.dump(mlp_model, 'modeller/mlp_model_dataset2.pkl')

In [9]:
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb

# LightGBM için parametre aralığı
param_grid = {
    'learning_rate': [0.01, 0.1],
    'n_estimators': [100, 300],
    'max_depth': [5, 10],
    'num_leaves': [15, 31]
}

# GridSearch ile model optimizasyonu
lgb_model = lgb.LGBMClassifier(random_state=42)
grid_lgb = GridSearchCV(lgb_model, param_grid, cv=3, scoring='f1_macro', verbose=1, n_jobs=-1)
grid_lgb.fit(X_train, y_train)

print("En iyi parametreler:", grid_lgb.best_params_)

# En iyi model
best_lgb_model = grid_lgb.best_estimator_
joblib.dump(best_lgb_model, "models/lightgbm_model_dataset2.pkl")


Fitting 3 folds for each of 16 candidates, totalling 48 fits


[WinError 2] Sistem belirtilen dosyayı bulamıyor
  File "C:\Users\kayra\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "C:\Users\kayra\anaconda3\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\kayra\anaconda3\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\Users\kayra\anaconda3\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


[LightGBM] [Info] Number of positive: 3256, number of negative: 3212
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.082124 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 106886
[LightGBM] [Info] Number of data points in the train set: 6468, number of used features: 3514
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503401 -> initscore=0.013606
[LightGBM] [Info] Start training from score 0.013606
En iyi parametreler: {'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 300, 'num_leaves': 15}


['models/lightgbm_model_dataset2.pkl']

In [10]:
#from tensorflow.keras.preprocessing.text import Tokenizer
#from tensorflow.keras.preprocessing.sequence import pad_sequences
#from tensorflow.keras.models import Sequential
#from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
#from tensorflow.keras.callbacks import EarlyStopping
#from tensorflow.keras.models import load_model

# Tokenizer ayarları
#tokenizer = Tokenizer(num_words=10000)
#tokenizer.fit_on_texts(df['clean_review'])

#X_seq = tokenizer.texts_to_sequences(df['clean_review'])
#X_pad = pad_sequences(X_seq, maxlen=200)
#y_lstm = df['label'].values

# Eğitim/test ayır
#X_train_lstm, X_test_lstm, y_train_lstm, y_test_lstm = train_test_split(X_pad, y_lstm, test_size=0.2, stratify=y_lstm, random_state=42)

# Model mimarisi
#model = Sequential()
#model.add(Embedding(input_dim=10000, output_dim=64, input_length=200))
#model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
#model.add(Dense(1, activation='sigmoid'))

#model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Eğitim
#early = EarlyStopping(patience=2, restore_best_weights=True)
#model.fit(X_train_lstm, y_train_lstm, validation_split=0.2, epochs=10, batch_size=128, callbacks=[early])

# Kaydet
#model.save("models/lstm_model_dataset2.h5")
#joblib.dump(tokenizer, "models/lstm_tokenizer_dataset2.pkl")


In [11]:

# LSTM test verilerini kaydet
#joblib.dump(X_test_lstm, "models/X_test_lstm_dataset2.pkl")
#joblib.dump(y_test_lstm, "models/y_test_lstm_dataset2.pkl")


In [12]:

from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB

nb_params = {'alpha': [0.1, 0.5, 1.0]}
nb_grid = GridSearchCV(MultinomialNB(), nb_params, cv=3, scoring='f1_macro')
nb_grid.fit(X_train, y_train)
best_nb = nb_grid.best_estimator_
joblib.dump(best_nb, "models/naive_bayes_model_dataset2.pkl")


['models/naive_bayes_model_dataset2.pkl']

In [13]:

from sklearn.svm import LinearSVC

svm_params = {'C': [0.1, 1.0, 10.0]}
svm_grid = GridSearchCV(LinearSVC(), svm_params, cv=3, scoring='f1_macro')
svm_grid.fit(X_train, y_train)
best_svm = svm_grid.best_estimator_
joblib.dump(best_svm, "models/svm_model_dataset2.pkl")




['models/svm_model_dataset2.pkl']

In [14]:

from sklearn.neural_network import MLPClassifier

mlp_params = {
    'hidden_layer_sizes': [(64,), (100,)],
    'alpha': [0.0001, 0.001],
    'learning_rate': ['constant', 'adaptive']
}
mlp_grid = GridSearchCV(MLPClassifier(max_iter=300), mlp_params, cv=3, scoring='f1_macro')
mlp_grid.fit(X_train, y_train)
best_mlp = mlp_grid.best_estimator_
joblib.dump(best_mlp, "models/mlp_model_dataset2.pkl")


['models/mlp_model_dataset2.pkl']