In [None]:
import os
import torch
import numpy as np
import pandas as pd
from google.colab import drive
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset, DataLoader
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import make_scorer, precision_score, recall_score, accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV
from transformers import AutoModelForSeq2SeqLM
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
import time
import seaborn as sns
import matplotlib.pyplot as plt

class CodeDataset(Dataset):
    """Pair a feature matrix with its label vector as a torch ``Dataset``.

    Both inputs are converted to tensors once, at construction time:
    ``features`` through ``torch.FloatTensor`` and ``labels`` through
    ``torch.LongTensor``.
    """

    def __init__(self, features, labels):
        """Store ``features`` and ``labels`` as tensors.

        Args:
            features: Numeric data accepted by ``torch.FloatTensor``.
            labels: Integer data accepted by ``torch.LongTensor``.
        """
        self.features = torch.FloatTensor(features)
        self.labels = torch.LongTensor(labels)

    def __len__(self):
        """Return the number of samples, taken from the label tensor."""
        sample_count = self.labels.size(0)
        return sample_count

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        sample = self.features[idx]
        target = self.labels[idx]
        return sample, target

class EnhancedCodeAnalyzer:
    """Build CodeBERT and CodeT5 embedding features for labelled code samples.

    Constructing an instance mounts Google Drive, loads both pretrained
    models onto the GPU when one is available (CPU otherwise) and records the
    two Drive folders that hold the harmless and the malicious samples.
    """

    def __init__(self, codebert_model_name="microsoft/codebert-base", codet5_model_name="Salesforce/codet5-base"):
        """Mount Drive and load the tokenizers and models.

        Args:
            codebert_model_name: Model identifier handed to ``AutoTokenizer``
                and ``AutoModel`` for the CodeBERT encoder.
            codet5_model_name: Model identifier handed to ``AutoTokenizer``
                and ``AutoModelForSeq2SeqLM`` for CodeT5.
        """
        drive.mount('/content/drive', force_remount=True)
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        # Half precision is only requested on GPU; CPU keeps full precision.
        model_dtype = torch.float16 if self.device == "cuda" else torch.float32

        # NOTE(security): trust_remote_code=True allows the model repository to
        # execute its own Python code while loading. Only pass model names of
        # repositories you trust; the flag is kept for backward compatibility.

        # CodeBERT
        self.codebert_tokenizer = AutoTokenizer.from_pretrained(codebert_model_name)
        self.codebert_model = AutoModel.from_pretrained(
            codebert_model_name,
            torch_dtype=model_dtype,
            low_cpu_mem_usage=True,
            trust_remote_code=True
        ).to(self.device)
        # CodeT5
        self.codet5_tokenizer = AutoTokenizer.from_pretrained(codet5_model_name)
        self.codet5_model = AutoModelForSeq2SeqLM.from_pretrained(
            codet5_model_name,
            torch_dtype=model_dtype,
            low_cpu_mem_usage=True,
            trust_remote_code=True
        ).to(self.device)
        self.harmless_path = '/content/drive/MyDrive/yukseklisans/metinmadenciligi/Dataset/GPT3.5/Harmless'
        self.malicious_path = '/content/drive/MyDrive/yukseklisans/metinmadenciligi/Dataset/GPT3.5/Malicious'

    def load_code_files(self, directory):
        """Return the text of every ``.txt`` file directly inside ``directory``.

        Files are read in sorted filename order. ``os.listdir`` gives no
        ordering guarantee, so without sorting the dataset row order (and
        therefore every seeded split derived from it) could change between
        runs or file systems.

        Args:
            directory: Path of the folder to scan (not recursive).

        Returns:
            A list with one string per ``.txt`` file, decoded as UTF-8.
        """
        code_files = []
        for filename in sorted(os.listdir(directory)):
            if filename.endswith('.txt'):
                with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
                    code_files.append(file.read())
        return code_files

    def prepare_dataset(self):
        """Load both sample folders into one labelled DataFrame.

        Returns:
            A DataFrame with a ``code`` column (file contents) and a ``label``
            column holding ``'harmless'`` or ``'malicious'``; harmless rows
            come first.
        """
        harmless_codes = self.load_code_files(self.harmless_path)
        malicious_codes = self.load_code_files(self.malicious_path)

        dataset = pd.DataFrame({
            'code': harmless_codes + malicious_codes,
            'label': ['harmless'] * len(harmless_codes) + ['malicious'] * len(malicious_codes)
        })
        return dataset

    def _mean_pooled_embedding(self, tokenizer, encoder, code_snippet):
        """Tokenize ``code_snippet``, run ``encoder`` and mean-pool over tokens.

        Shared by the CodeBERT and CodeT5 embedding methods, which differ only
        in the tokenizer/encoder pair they use.

        The mean runs over every token position of ``last_hidden_state``. For
        a single snippet ``padding=True`` adds no padding; if a batch of
        snippets were passed, padded positions would be averaged in as well.
        """
        inputs = tokenizer(code_snippet, return_tensors='pt', max_length=512, truncation=True,
                           padding=True).to(self.device)
        with torch.no_grad():
            outputs = encoder(**inputs)
        # Pool in float32 so the result has the same dtype on GPU (where the
        # model may run in float16) and on CPU; this is a no-op for float32.
        embeddings = outputs.last_hidden_state.float().mean(dim=1)
        return embeddings.cpu().numpy()

    def get_codebert_embedding(self, code_snippet):
        """Return the mean-pooled CodeBERT embedding of ``code_snippet``.

        The input is truncated to 512 tokens. The result is a float32 NumPy
        array whose first axis is the batch axis of the tokenizer output.
        """
        return self._mean_pooled_embedding(self.codebert_tokenizer, self.codebert_model, code_snippet)

    def get_codet5_embedding(self, code_snippet):
        """Return the mean-pooled CodeT5 encoder embedding of ``code_snippet``.

        Only the encoder half of the seq2seq model is run. The input is
        truncated to 512 tokens and the result is a float32 NumPy array.
        """
        return self._mean_pooled_embedding(self.codet5_tokenizer, self.codet5_model.encoder, code_snippet)

    def prepare_all_feature_sets(self, dataset):
        """Embed ``dataset['code']`` with both models.

        Returns:
            A dict with the keys ``"codebert"`` and ``"codet5"``, each mapping
            to the stacked embedding matrix for that model.
        """
        codebert_embeddings = self.prepare_codebert_embeddings(dataset)
        codet5_embeddings = self.prepare_codet5_embeddings(dataset)

        feature_sets = {
            "codebert": codebert_embeddings,
            "codet5": codet5_embeddings
        }
        return feature_sets

    def prepare_codebert_embeddings(self, dataset):
        """Stack the CodeBERT embedding of every entry in ``dataset['code']``.

        Raises:
            ValueError: From ``np.vstack`` when ``dataset`` has no rows.
        """
        embeddings = [self.get_codebert_embedding(code) for code in dataset['code']]
        return np.vstack(embeddings)

    def prepare_codet5_embeddings(self, dataset):
        """Stack the CodeT5 embedding of every entry in ``dataset['code']``.

        Raises:
            ValueError: From ``np.vstack`` when ``dataset`` has no rows.
        """
        embeddings = [self.get_codet5_embedding(code) for code in dataset['code']]
        return np.vstack(embeddings)

    def prepare_labels(self, dataset):
        """Encode ``dataset['label']`` as integers.

        Returns:
            A NumPy int64 array with 1 where the label equals ``'malicious'``
            and 0 for every other value.
        """
        # Vectorized comparison instead of a per-row Python lambda.
        return (dataset['label'] == 'malicious').astype(np.int64).values

    def visualize_features(self, feature_sets, labels, output_dir="/content/drive/MyDrive/yukseklisans/metinmadenciligi/Dataset/GPT3.5/feature_outputs"):
        """Scatter-plot the first two columns of every feature set.

        For each entry of ``feature_sets`` a plot coloured by ``labels`` is
        saved to ``<output_dir>/<key>_feature_visualization.png`` and shown.

        Args:
            feature_sets: Mapping of feature-set name to a 2-D array.
            labels: One label per row of each feature array.
            output_dir: Folder for the PNG files; created when missing.
        """
        os.makedirs(output_dir, exist_ok=True)
        for key, features in feature_sets.items():
            df = pd.DataFrame(features[:, :2], columns=['Feature_1', 'Feature_2'])  # keep only the first 2 features
            df['label'] = labels

            fig = plt.figure(figsize=(8, 6))
            sns.scatterplot(x="Feature_1", y="Feature_2", hue="label", data=df, palette="coolwarm", alpha=0.7)
            plt.title(f"Feature Distribution - {key}")
            plt.xlabel("Feature 1")
            plt.ylabel("Feature 2")
            plt.legend(title="Label")

            image_path = os.path.join(output_dir, f"{key}_feature_visualization.png")
            plt.savefig(image_path)
            print(f"📊 Feature visualization saved: {image_path}")
            plt.show()
            # pyplot keeps figures alive until they are closed explicitly.
            plt.close(fig)


# Data preparation: load the samples, embed them and encode the labels.
analyzer = EnhancedCodeAnalyzer()
dataset = analyzer.prepare_dataset()
feature_sets = analyzer.prepare_all_feature_sets(dataset)
y = analyzer.prepare_labels(dataset)

# Standardized copy of each complete feature matrix. It is read by the
# visualization and CSV export at the end of the script; model selection below
# does NOT use it, because fitting the scaler on all rows would leak test-set
# statistics into training.
scaled_feature_sets = {}
for key, features in feature_sets.items():
    scaler = StandardScaler()
    scaled_feature_sets[key] = scaler.fit_transform(features)

# Train-test split: split the raw features first, then fit the scaler on the
# training rows only and apply that same transform to the held-out rows.
train_test_data = {}
for key, X in feature_sets.items():
    # stratify=y keeps the harmless/malicious ratio identical in both parts,
    # matching the StratifiedKFold used during the grid search.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    split_scaler = StandardScaler().fit(X_train)
    train_test_data[key] = {
        'X_train': split_scaler.transform(X_train),
        'X_test': split_scaler.transform(X_test),
        'y_train': y_train,
        'y_test': y_test
    }

# Classifiers to tune; every estimator that accepts a seed is fixed to 42.
classifiers = {
    'GaussianNB': GaussianNB(),
    'Logistic Regression': LogisticRegression(random_state=42),
    'KNN': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'SVM': SVC(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42)
}

# Evaluation metrics; label 1 (malicious) is treated as the positive class.
scoring = {
    'precision': make_scorer(precision_score, pos_label=1),
    'recall': make_scorer(recall_score, pos_label=1),
    'accuracy': make_scorer(accuracy_score),
    'f1': make_scorer(f1_score, pos_label=1)
}

# Hyperparameter search space per classifier, keyed exactly like `classifiers`.
param_grids = {
    # Empty grid: a single candidate using the estimator's defaults.
    'GaussianNB': {},
    'Logistic Regression': {
        'C': [0.1, 1, 10],
        'penalty': ['l2'],
        'solver': ['liblinear', 'lbfgs']
    },
    'KNN': {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan']
    },
    'Decision Tree': {
        'max_depth': [None, 5, 10],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2]
    },
    # gamma only affects the rbf kernel here; the linear kernel ignores it.
    # Separate sub-grids avoid fitting each linear candidate twice.
    'SVM': [
        {
            'kernel': ['rbf'],
            'C': [0.1, 1, 10],
            'gamma': ['scale', 'auto']
        },
        {
            'kernel': ['linear'],
            'C': [0.1, 1, 10]
        }
    ],
    'Random Forest': {
        'n_estimators': [50, 100],
        'max_depth': [None, 5, 10],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2]
    },
    'Gradient Boosting': {
        'n_estimators': [50, 100],
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 5]
    },
    'AdaBoost': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2]
    }
}
# Results keyed by (feature set name, classifier name).
grid_search_results = {}

# The fold layout is identical for every search (fixed seed), so build the
# stratified splitter once instead of once per classifier.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Run a grid search for every feature set / classifier combination.
for feature_set_name, data in train_test_data.items():
    X_train = data['X_train']
    X_test = data['X_test']
    y_train = data['y_train']
    y_test = data['y_test']
    print("\n==================================================")
    print(f"Veri Kümesi: {feature_set_name}")
    print("==================================================")
    for name, clf in classifiers.items():
        print(f"\n{name} için GridSearch çalıştırılıyor...")
        param_grid = param_grids[name]

        grid_search = GridSearchCV(
            estimator=clf,
            param_grid=param_grid,
            scoring=scoring,
            refit='f1',  # pick and refit the best candidate by F1 score
            cv=cv,
            verbose=2,
            n_jobs=-1
        )
        start_time = time.time()
        grid_search.fit(X_train, y_train)
        end_time = time.time()
        training_time = end_time - start_time

        # Cross-validated means of the winning candidate.
        best_params = grid_search.best_params_
        best_f1_score = grid_search.cv_results_['mean_test_f1'][grid_search.best_index_]
        best_precision = grid_search.cv_results_['mean_test_precision'][grid_search.best_index_]
        best_recall = grid_search.cv_results_['mean_test_recall'][grid_search.best_index_]
        best_accuracy = grid_search.cv_results_['mean_test_accuracy'][grid_search.best_index_]

        # Score the refitted best model on the held-out split, which the
        # search never saw; previously this split was created but never used.
        y_pred = grid_search.best_estimator_.predict(X_test)
        test_metrics = {
            'f1': f1_score(y_test, y_pred, pos_label=1),
            'precision': precision_score(y_test, y_pred, pos_label=1),
            'recall': recall_score(y_test, y_pred, pos_label=1),
            'accuracy': accuracy_score(y_test, y_pred)
        }

        grid_search_results[(feature_set_name, name)] = {
            'best_params': best_params,
            'metrics': {
                'f1': best_f1_score,
                'precision': best_precision,
                'recall': best_recall,
                'accuracy': best_accuracy
            },
            # Additional key; existing consumers of 'metrics' are unaffected.
            'test_metrics': test_metrics,
            'training_time': training_time
        }

        print(f"\n{name} için En İyi Performans:")
        print(f"F1 Skor: {best_f1_score}")
        print(f"Precision: {best_precision}")
        print(f"Recall: {best_recall}")
        print(f"Accuracy: {best_accuracy}")
        print(
            f"Test Kümesi - F1: {test_metrics['f1']:.4f}, "
            f"Precision: {test_metrics['precision']:.4f}, "
            f"Recall: {test_metrics['recall']:.4f}, "
            f"Accuracy: {test_metrics['accuracy']:.4f}"
        )
        print(f"Eğitim Süresi: {training_time:.2f} saniye")
        print(f"En İyi Hiperparametreler: {best_params}")


# Print a summary of every grid search: one report per
# (feature set, model) pair, assembled line by line and emitted in one call.
print("\nTüm Modeller İçin GridSearch Sonuçları:")
for (feature_set_name, model_name), result in grid_search_results.items():
    report = [f"\nVeri Kümesi: {feature_set_name}, Model: {model_name}"]
    if 'best_params' in result:
        report.append(f"  En İyi Hiperparametreler = {result['best_params']}")
    report.append("  Performans:")
    # One indented line per metric, four decimal places each.
    report.extend(
        f"    {metric.capitalize()}: {score:.4f}"
        for metric, score in result['metrics'].items()
    )
    report.append(f"   Eğitim Süresi: {result['training_time']:.2f} saniye")
    print("\n".join(report))

# Target directory on Google Drive, shared by the plots and the CSV exports.
output_dir = "/content/drive/MyDrive/yukseklisans/metinmadenciligi/Dataset/GPT3.5/feature_outputs"
os.makedirs(output_dir, exist_ok=True)

# Visualization: scatter plots of the scaled feature sets.
analyzer.visualize_features(scaled_feature_sets, y, output_dir=output_dir)

# Save every scaled feature set as a CSV file with the labels appended as the
# last column.
for key, features in scaled_feature_sets.items():
    csv_path = os.path.join(output_dir, f"{key}_features.csv")
    df = pd.DataFrame(features).assign(label=y)
    df.to_csv(csv_path, index=False)
    print(f"✔ Feature set '{key}' kaydedildi: {csv_path}")