# In [None]:
import os
import torch
import numpy as np
import pandas as pd
import ast
import seaborn as sns
import matplotlib.pyplot as plt
from google.colab import drive
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer
from sklearn.utils import resample
import time


# Enhanced code analyzer
class EnhancedCodeAnalyzer:
    """Load labelled code snippets from disk and derive numeric features.

    Snippets are read from two directories of ``.txt`` files (one directory
    per class). Each snippet is described by seven text-level counts plus
    three counts of Python AST node types.
    """

    # Default dataset locations on the mounted Google Drive.
    DEFAULT_HARMLESS_PATH = '/content/drive/MyDrive/yukseklisans/metinmadenciligi/Dataset/gemini_ai/Harmless'
    DEFAULT_MALICIOUS_PATH = '/content/drive/MyDrive/yukseklisans/metinmadenciligi/Dataset/gemini_ai/Malicious'

    def __init__(self, harmless_path=None, malicious_path=None, mount_drive=True):
        """Configure the dataset locations and optionally mount Google Drive.

        Args:
            harmless_path: Directory holding the harmless ``.txt`` snippets.
                ``None`` selects ``DEFAULT_HARMLESS_PATH``.
            malicious_path: Directory holding the malicious ``.txt`` snippets.
                ``None`` selects ``DEFAULT_MALICIOUS_PATH``.
            mount_drive: When true (the default), mount Google Drive at
                ``/content/drive`` first. Pass ``False`` when the data is
                already reachable on the local filesystem.
        """
        if mount_drive:
            drive.mount('/content/drive', force_remount=True)
        self.harmless_path = (
            self.DEFAULT_HARMLESS_PATH if harmless_path is None else harmless_path
        )
        self.malicious_path = (
            self.DEFAULT_MALICIOUS_PATH if malicious_path is None else malicious_path
        )

    def load_code_files(self, directory):
        """Return the contents of every ``.txt`` file in *directory*.

        Files are read in sorted filename order. ``os.listdir`` returns
        entries in arbitrary order, which would otherwise make the row order
        of the dataset (and therefore any seeded split) differ between runs.

        Args:
            directory: Path of the directory to scan (not recursive).

        Returns:
            A list of file contents as strings, decoded as UTF-8.
        """
        code_files = []
        for filename in sorted(os.listdir(directory)):
            if filename.endswith('.txt'):
                with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
                    code_files.append(file.read())
        return code_files

    def prepare_dataset(self):
        """Build a DataFrame with a ``code`` column and a ``label`` column.

        Returns:
            A DataFrame listing all harmless snippets first (label
            ``'harmless'``) followed by all malicious ones (``'malicious'``).
        """
        harmless_codes = self.load_code_files(self.harmless_path)
        malicious_codes = self.load_code_files(self.malicious_path)

        dataset = pd.DataFrame({
            'code': harmless_codes + malicious_codes,
            'label': ['harmless'] * len(harmless_codes) + ['malicious'] * len(malicious_codes)
        })
        return dataset

    def extract_advanced_features(self, code_snippet):
        """Compute text-level counts for one snippet.

        All counts are plain substring/token counts on the raw text, so they
        also match occurrences inside string literals and comments.

        Args:
            code_snippet: Source text of the snippet.

        Returns:
            A dict with the keys ``code_length``, ``unique_tokens``,
            ``line_count``, ``complexity_score``, ``function_count``,
            ``import_count`` and ``arithmetic_ops`` (in that order).
        """
        features = {
            'code_length': len(code_snippet),
            'unique_tokens': len(set(code_snippet.split())),
            'line_count': len(code_snippet.split('\n')),
            # Bracket/parenthesis count used as a rough nesting proxy.
            'complexity_score': code_snippet.count('{') + code_snippet.count('}')
                                + code_snippet.count('(') + code_snippet.count(')'),
            'function_count': code_snippet.count('def ') + code_snippet.count('function '),
            'import_count': code_snippet.count('import '),
            'arithmetic_ops': code_snippet.count('+') + code_snippet.count('-') +
                              code_snippet.count('*') + code_snippet.count('/')
        }
        return features

    def extract_ast_features(self, code_snippet):
        """Count function definitions, loops and ``if`` statements via the AST.

        Args:
            code_snippet: Source text, expected (but not required) to be Python.

        Returns:
            A dict with ``num_functions`` (``def`` statements), ``num_loops``
            (``for`` and ``while``) and ``num_conditionals`` (``if``/``elif``).
            All three are 0 when the snippet cannot be parsed as Python.
        """
        try:
            tree = ast.parse(code_snippet)
        except (SyntaxError, ValueError, RecursionError):
            # SyntaxError: not valid Python. ValueError: source contains null
            # bytes (Python < 3.12). RecursionError: pathologically nested
            # source. None of these should abort the whole feature extraction.
            return {'num_functions': 0, 'num_loops': 0, 'num_conditionals': 0}

        num_functions = 0
        num_loops = 0
        num_conditionals = 0
        # ast.walk is iterative, so deep trees cannot overflow the call stack
        # the way a recursive NodeVisitor traversal can.
        for node in ast.walk(tree):
            if isinstance(node, ast.FunctionDef):
                num_functions += 1
            elif isinstance(node, (ast.For, ast.While)):
                num_loops += 1
            elif isinstance(node, ast.If):
                num_conditionals += 1

        return {
            'num_functions': num_functions,
            'num_loops': num_loops,
            'num_conditionals': num_conditionals
        }

    def prepare_manual_ast_features(self, dataset):
        """Build the combined feature matrix for every row of *dataset*.

        Args:
            dataset: DataFrame with a ``code`` column of source strings.

        Returns:
            A 2-D NumPy array with one row per snippet: the seven text-level
            features followed by the three AST features.
        """
        codes = dataset['code']
        # Building the frames from a list of dicts avoids the per-row Series
        # construction that ``.apply(pd.Series)`` performs.
        manual_features = pd.DataFrame([self.extract_advanced_features(code) for code in codes])
        ast_features = pd.DataFrame([self.extract_ast_features(code) for code in codes])
        combined_features = np.hstack([manual_features.to_numpy(), ast_features.to_numpy()])
        return combined_features

    def prepare_labels(self, dataset):
        """Encode the ``label`` column as integers.

        Args:
            dataset: DataFrame with a ``label`` column.

        Returns:
            A NumPy integer array with 1 for ``'malicious'`` and 0 for any
            other label value.
        """
        return (dataset['label'] == 'malicious').astype('int64').values

# Main pipeline: load the dataset and derive the feature matrix and labels.
analyzer = EnhancedCodeAnalyzer()
dataset = analyzer.prepare_dataset()
X = analyzer.prepare_manual_ast_features(dataset)
y = analyzer.prepare_labels(dataset)

# Split into training and test sets. ``stratify=y`` keeps the
# harmless/malicious ratio the same in both parts, consistent with the
# StratifiedKFold used for cross-validation further down; an unstratified
# split can leave the held-out set with a skewed class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scaling (StandardScaler): statistics are fitted on the training part only
# and then applied unchanged to the test part.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Classifiers and their GridSearch hyperparameter ranges.
# Each entry is (display name, estimator, parameter grid); keeping the
# estimator next to its grid guarantees both dicts below share the same keys.
# NOTE: adjust the ranges to suit the dataset.
_MODEL_SPECS = [
    ('GaussianNB', GaussianNB(), {}),
    (
        'Logistic Regression',
        LogisticRegression(random_state=42),
        {
            'C': [0.1, 1, 10],
            'penalty': ['l2'],
            'solver': ['liblinear', 'lbfgs'],
        },
    ),
    (
        'KNN',
        KNeighborsClassifier(),
        {
            'n_neighbors': [3, 5, 7, 9],
            'weights': ['uniform', 'distance'],
            'metric': ['euclidean', 'manhattan'],
        },
    ),
    (
        'Decision Tree',
        DecisionTreeClassifier(random_state=42),
        {
            'max_depth': [None, 5, 10],
            'min_samples_split': [2, 5],
            'min_samples_leaf': [1, 2],
        },
    ),
    (
        'SVM',
        SVC(random_state=42),
        {
            'C': [0.1, 1, 10],
            'gamma': ['scale', 'auto'],
            'kernel': ['rbf', 'linear'],
        },
    ),
    (
        'Random Forest',
        RandomForestClassifier(random_state=42),
        {
            'n_estimators': [50, 100],
            'max_depth': [None, 5, 10],
            'min_samples_split': [2, 5],
            'min_samples_leaf': [1, 2],
        },
    ),
    (
        'Gradient Boosting',
        GradientBoostingClassifier(random_state=42),
        {
            'n_estimators': [50, 100],
            'learning_rate': [0.01, 0.1],
            'max_depth': [3, 5],
        },
    ),
    (
        'AdaBoost',
        AdaBoostClassifier(random_state=42),
        {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.01, 0.1, 0.2],
        },
    ),
]

# Name -> estimator, in the order the models are trained.
classifiers = {model_name: estimator for model_name, estimator, _ in _MODEL_SPECS}

# Custom evaluation metrics; class 1 (malicious) is the positive class.
scoring = dict(
    precision=make_scorer(precision_score, pos_label=1),
    recall=make_scorer(recall_score, pos_label=1),
    accuracy=make_scorer(accuracy_score),
    f1=make_scorer(f1_score, pos_label=1),
)

# Name -> hyperparameter grid searched for that estimator.
param_grids = {model_name: grid for model_name, _, grid in _MODEL_SPECS}

# Train and evaluate every model (with Grid Search).
# ``results`` holds the held-out test metrics of each tuned model;
# ``grid_search_results`` holds the best parameters, the mean
# cross-validation metrics of that parameter set, and the search duration.
results = {}
grid_search_results = {}

# The splitter does not depend on the model, so it is created once. With an
# integer random_state it yields the same folds on every use.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for name, clf in classifiers.items():
    print(f"\n{name} için GridSearch çalıştırılıyor...")
    param_grid = param_grids[name]

    grid_search = GridSearchCV(
        estimator=clf,
        param_grid=param_grid,
        scoring=scoring,
        refit='f1',  # select and refit the best model by F1 score
        cv=cv,  # stratified K-fold
        verbose=2,
        n_jobs=-1
    )

    # perf_counter is monotonic, so the measured duration cannot be distorted
    # by system clock adjustments the way time.time() differences can.
    start_time = time.perf_counter()
    grid_search.fit(X_train_scaled, y_train)
    end_time = time.perf_counter()
    training_time = end_time - start_time

    # Best model, refitted on the whole training set
    best_model = grid_search.best_estimator_

    # Evaluation on the test set
    y_pred = best_model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    results[name] = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

    # Record the best parameters and the mean cross-validation scores of the
    # winning parameter combination
    best_params = grid_search.best_params_
    best_f1_score = grid_search.cv_results_['mean_test_f1'][grid_search.best_index_]
    best_precision = grid_search.cv_results_['mean_test_precision'][grid_search.best_index_]
    best_recall = grid_search.cv_results_['mean_test_recall'][grid_search.best_index_]
    best_accuracy = grid_search.cv_results_['mean_test_accuracy'][grid_search.best_index_]

    grid_search_results[name] = {
        'best_params': best_params,
        'metrics': {
            'f1': best_f1_score,
            'precision': best_precision,
            'recall': best_recall,
            'accuracy': best_accuracy
        },
        'training_time': training_time
    }


# Print the test-set metrics of every model.
print("\nModel Sonuçları (Test Seti Üzerinde):")
for name, metrics in results.items():
    summary_lines = [f"\n{name}:"]
    summary_lines += [f"  {metric}: {value:.4f}" for metric, value in metrics.items()]
    print("\n".join(summary_lines))

# Print the grid-search outcome (best parameters, CV metrics, duration).
print("\nTüm Modeller İçin GridSearch Sonuçları:")
for model_name, result in grid_search_results.items():
    report_lines = [f"\nModel: {model_name}"]
    if 'best_params' in result:
        report_lines.append(f"  En İyi Hiperparametreler = {result['best_params']}")
    report_lines.append("  Performans:")
    report_lines.extend(
        f"    {metric.capitalize()}: {score:.4f}"
        for metric, score in result['metrics'].items()
    )
    report_lines.append(f"   Eğitim Süresi: {result['training_time']:.2f} saniye")
    print("\n".join(report_lines))

# Save the scaled feature sets (with their labels) as CSV files.
output_dir = "/content/drive/MyDrive/yukseklisans/metinmadenciligi/Dataset/gemini_ai/feature_outputs/manuel"
os.makedirs(output_dir, exist_ok=True)

# Training split: scaled features plus a trailing 'label' column.
df = pd.DataFrame(X_train_scaled).assign(label=y_train)
csv_path = os.path.join(output_dir, "manual_ast_features_train.csv")
df.to_csv(path_or_buf=csv_path, index=False)
print("✔ 'manual+ast' training feature set kaydedildi: " + csv_path)

# Test split, same layout.
df_test = pd.DataFrame(X_test_scaled).assign(label=y_test)
csv_path = os.path.join(output_dir, "manual_ast_features_test.csv")
df_test.to_csv(path_or_buf=csv_path, index=False)
print("✔ 'manual+ast' test feature set kaydedildi: " + csv_path)

# Visualization (using the first 2 features)
def visualize_features(features, labels, output_dir="/content/drive/MyDrive/yukseklisans/metinmadenciligi/Dataset/gemini_ai/feature_outputs/manuel", filename="manual_ast_feature_visualization.png"):
    """Scatter-plot the first two feature columns, coloured by label.

    The plot is written to ``output_dir/filename`` and then shown.

    Args:
        features: 2-D array-like with at least two columns; only the first
            two columns are plotted.
        labels: Sequence with one label per row of *features*.
        output_dir: Directory for the image; created if it does not exist.
        filename: File name of the saved image.

    Raises:
        ValueError: If *features* is not 2-D or has fewer than two columns.
    """
    # Accept any array-like (list of rows, DataFrame, ...) and fail early with
    # a clear message, before any directory or figure is created.
    features = np.asarray(features)
    if features.ndim != 2 or features.shape[1] < 2:
        raise ValueError(
            "features must be a 2-D array with at least two columns, "
            f"got shape {features.shape}"
        )

    os.makedirs(output_dir, exist_ok=True)
    df = pd.DataFrame(features[:, :2], columns=['Feature_1', 'Feature_2'])
    # Assign by position: a pandas Series would otherwise be aligned on its
    # index, which no longer matches the fresh 0..n-1 index after a shuffle.
    df['label'] = np.asarray(labels)

    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        sns.scatterplot(x="Feature_1", y="Feature_2", hue="label", data=df,
                        palette="coolwarm", alpha=0.7, ax=ax)
        ax.set_title("Feature Distribution - manual+ast")
        ax.set_xlabel("Feature 1")
        ax.set_ylabel("Feature 2")
        ax.legend(title="Label")

        image_path = os.path.join(output_dir, filename)
        fig.savefig(image_path)
        print(f"📊 Feature visualization saved: {image_path}")
        plt.show()
    finally:
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)

# Plot the first two scaled features of the training split, then the test split.
visualize_features(
    features=X_train_scaled,
    labels=y_train,
    output_dir=output_dir,
    filename="manual_ast_feature_visualization_train.png",
)
visualize_features(
    features=X_test_scaled,
    labels=y_test,
    output_dir=output_dir,
    filename="manual_ast_feature_visualization_test.png",
)