# In [None]:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
# Must be imported before IterativeImputer: it enables the experimental estimator.
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Machine-specific absolute location of the raw CSV. The name is kept defined for
# any later code that refers to it, but it is no longer read here: the original
# loaded this file into `df` and then immediately overwrote `df` with the frame
# built from `YT`, so the extra read only added a failure point on other machines.
file_path = "E:\\Download-E\\Progetto-Uni\\Progetto-SA-YT\\data\\raw\\YoutubeCommentsDataSet.csv"

# Load the raw dataset once, from the project-configured raw-data directory.
# NOTE(review): `config` is not imported in the visible part of this file —
# presumably the project's settings module; confirm it is in scope before this runs.
YT = pd.read_csv(os.path.join(config.RAW_DATA_PATH, 'YoutubeCommentsDataSet.csv'))

# Keep only the comment text and its label, normalise the column names, drop rows
# with a missing value in either column, and renumber the rows from 0.
df = (
    YT[['Comment', 'Sentiment']]
    .rename(columns={"Comment": "text", "Sentiment": "sentiment"})
    .dropna()
    .reset_index(drop=True)
)

def _select_model(model_name):
    """Return ``(estimator, param_grid, use_power_transform)`` for *model_name*.

    ``use_power_transform`` is True when the features must be densified and
    power-transformed before fitting (logistic regression only).

    Raises:
        ValueError: if *model_name* is not a supported model.
    """
    if model_name == "random_forest":
        estimator = RandomForestClassifier(random_state=42)
        param_grid = {
            'n_estimators': [50, 100, 200],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5, 10],
        }
        return estimator, param_grid, False

    if model_name == "logistic_regression":
        return LogisticRegression(max_iter=500), {'C': [0.01, 0.1, 1, 10]}, True

    if model_name == "naive_bayes":
        # Naive Bayes consumes the TF-IDF features directly (no extra transform).
        return MultinomialNB(), {'alpha': [0.1, 0.5, 1]}, False

    raise ValueError(f"Modello '{model_name}' non supportato.")


def train_model(grid_search: bool = False, model_name: str = "random_forest") -> dict:
    """Train a sentiment classifier on the module-level ``df`` and return its metrics.

    The function splits ``df['cleaned_text']`` / ``df['sentiment']`` 80/20
    (``random_state=42``), fits a TF-IDF vectorizer on the training texts, trains
    the requested model, pickles both the vectorizer and the model under
    ``config.MODELS_PATH``, prints train/test accuracy and returns the test metrics.

    Args:
        grid_search: When True, tune the model with a 3-fold ``GridSearchCV``
            (accuracy scoring) and keep the best estimator; otherwise fit the
            model with its default hyper-parameters.
        model_name: One of ``"random_forest"`` (default, the only model the
            original code could reach), ``"logistic_regression"`` or
            ``"naive_bayes"``.

    Returns:
        dict with keys ``model``, ``accuracy``, ``precision``, ``recall`` and
        ``f1_score``; precision/recall/F1 are weighted averages over the classes.

    Raises:
        KeyError: if ``df`` has no ``cleaned_text`` column.
        ValueError: if *model_name* is not supported.
    """
    if 'cleaned_text' not in df.columns:
        raise KeyError(
            "cleaned_text: `df` must contain a 'cleaned_text' column "
            "before train_model() is called"
        )

    # Validate the model choice before doing any work or writing any file.
    model, param_grid, use_power_transform = _select_model(model_name)

    # Split the raw texts *before* vectorizing so the TF-IDF vocabulary and IDF
    # weights are learned from the training rows only (no test-set leakage).
    # The partition is the same one the pre-vectorized split produced: it depends
    # only on the number of rows and random_state. Original row labels remain
    # available through y_train.index / y_test.index.
    text_train, text_test, y_train, y_test = train_test_split(
        df['cleaned_text'], df['sentiment'], test_size=0.2, random_state=42
    )

    # Feature extraction
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(text_train)
    X_test = vectorizer.transform(text_test)

    with open(f"{config.MODELS_PATH}vectorizer.pkl", 'wb') as f:
        pickle.dump(vectorizer, f)

    if use_power_transform:
        # TF-IDF features contain zeros, and Box-Cox only accepts strictly
        # positive data, so the Yeo-Johnson variant is used instead.
        # The transformer needs dense input, hence toarray().
        # NOTE(review): the fitted transformer is not persisted, so the saved
        # logistic-regression model cannot be reused without refitting it.
        transformer = PowerTransformer(method='yeo-johnson')
        X_train = transformer.fit_transform(X_train.toarray())
        X_test = transformer.transform(X_test.toarray())

    # Training, optionally with a hyper-parameter grid search.
    if grid_search:
        grid_search_cv = GridSearchCV(model, param_grid, cv=3, scoring='accuracy', n_jobs=-1, verbose=1)
        grid_search_cv.fit(X_train, y_train)
        best_model = grid_search_cv.best_estimator_
    else:
        best_model = model
        best_model.fit(X_train, y_train)

    # Predictions on the held-out split.
    y_pred = best_model.predict(X_test)

    # Persist the trained model.
    model_path = f"{config.MODELS_PATH}{model_name}.pkl"
    with open(model_path, 'wb') as file:
        pickle.dump(best_model, file)

    # Score the estimator that was actually fitted: with grid_search=True the
    # search fits clones, so the original `model` object is still unfitted.
    print("Training Accuracy:", best_model.score(X_train, y_train))
    print("Test Accuracy:", best_model.score(X_test, y_test))

    # Evaluation metrics on the test split.
    metrics = {
        'model': model_name,
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred, average='weighted', zero_division=0),
        'recall': recall_score(y_test, y_pred, average='weighted', zero_division=0),
        'f1_score': f1_score(y_test, y_pred, average='weighted', zero_division=0)
    }
    return metrics



# Cell output: ModuleNotFoundError: No module named 'src'