In [None]:

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from scipy.sparse import hstack
from imblearn.over_sampling import RandomOverSampler
import joblib


# Input dataset; it must provide every column listed in `needed_cols`.
CSV_FILE = "case_cu_pret_estim.csv"

df = pd.read_csv(CSV_FILE)

# Fail fast with a readable message instead of a KeyError further down.
needed_cols = ["descriere", "data_cont", "nr_postari", "pret", "pret_estim", "scam"]
for c in needed_cols:
    if c not in df.columns:
        raise ValueError(f"Lipsește coloana '{c}' din {CSV_FILE}")

# Coerce the numeric columns; cells that cannot be parsed become NaN.
df["pret"] = pd.to_numeric(df["pret"], errors="coerce")
df["pret_estim"] = pd.to_numeric(df["pret_estim"], errors="coerce")
# A missing or unparseable post count is treated as zero posts.
df["nr_postari"] = pd.to_numeric(df["nr_postari"], errors="coerce").fillna(0)

# Difference between the listed price and the estimated price.
df["delta_pret"] = df["pret"] - df["pret_estim"]

# Discard rows without a usable price pair BEFORE computing any statistic, so
# that rows which are thrown away cannot influence the median imputed below.
# `.copy()` makes the result an independent frame that is safe to assign to.
df = df.dropna(subset=["pret", "pret_estim", "delta_pret"]).copy()

# Account creation date, parsed day-first; unparseable dates become NaT.
df["data_cont_parsed"] = pd.to_datetime(
    df["data_cont"], errors="coerce", dayfirst=True
)
# Account age in whole days relative to the moment the script runs.
# NOTE: this makes the feature depend on the execution date.
today = pd.Timestamp.today()
df["vechime_zile"] = (today - df["data_cont_parsed"]).dt.days
# Unknown ages are imputed with the median age of the retained rows.
df["vechime_zile"] = df["vechime_zile"].fillna(df["vechime_zile"].median())

# Binary target and raw text input (a missing description becomes "").
y = df["scam"].astype(int)
descrieri = df["descriere"].fillna("")

# Dense numeric features, in the column order the model is trained on:
# account age (days), number of posts, price delta.
X_num = np.column_stack([
    df["vechime_zile"].values,
    df["nr_postari"].values,
    df["delta_pret"].values
])

# Split BEFORE fitting the vectorizer. Fitting TF-IDF on the full corpus lets
# the test documents shape the vocabulary and IDF weights, which leaks test
# information into training and inflates the reported scores. The selected
# rows depend only on the sample count, `stratify` and `random_state`, so
# this is the same partition as splitting the assembled feature matrix.
desc_train, desc_test, num_train, num_test, y_train, y_test = train_test_split(
    descrieri, X_num, y, test_size=0.2, random_state=42, stratify=y
)

vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    min_df=3
)

# Vocabulary and IDF weights are learned from the training texts only; the
# test texts are merely projected onto that vocabulary.
X_train = hstack([vectorizer.fit_transform(desc_train), num_train]).tocsr()
X_test = hstack([vectorizer.transform(desc_test), num_test]).tocsr()

# Full-dataset matrices, kept under their original names for any later code
# that reads them. They are produced with the train-fitted vectorizer and
# must not be used to evaluate the model.
X_text = vectorizer.transform(descrieri)
X = hstack([X_text, X_num])


# Balance the classes by randomly re-drawing existing training rows
# (seeded for reproducibility). Only the training split is resampled; the
# test split keeps its original class distribution.
ros = RandomOverSampler(random_state=42)
X_train_bal, y_train_bal = ros.fit_resample(X_train, y_train)

# Report the per-class counts after resampling.
print("🎯 Exemplare după oversampling:")
print(pd.Series(y_train_bal).value_counts())


# Logistic regression on the balanced training set. `fit` returns the
# estimator itself, so construction and training are chained.
logreg_params = {"max_iter": 1000, "class_weight": "balanced"}
clf = LogisticRegression(**logreg_params).fit(X_train_bal, y_train_bal)

# Predictions on the untouched (not oversampled) test split.
y_pred = clf.predict(X_test)

# Print each metric under its heading, in a fixed order.
for metric_label, metric_value in (
    ("\nAccuracy:", accuracy_score(y_test, y_pred)),
    ("\nConfusion matrix:\n", confusion_matrix(y_test, y_pred)),
    ("\nClassification report:\n", classification_report(y_test, y_pred)),
):
    print(metric_label, metric_value)


# Output artifacts. The classifier and the vectorizer must be loaded together
# at inference time, because the model's input columns are the vectorizer's
# vocabulary followed by the numeric features.
MODEL_PATH = "scam_model_logreg.pkl"
VECTORIZER_PATH = "tfidf_descriere.pkl"

# NOTE: joblib files are pickle-based; only load them from trusted sources.
joblib.dump(clf, MODEL_PATH)
joblib.dump(vectorizer, VECTORIZER_PATH)

# The messages are built from the same constants so they cannot drift from
# the paths actually written.
print(f"\n✅ Modelul Logistic Regression a fost salvat în '{MODEL_PATH}'")
print(f"✅ Vectorizatorul TF-IDF a fost salvat în '{VECTORIZER_PATH}'")


🎯 Exemplare după oversampling:
scam
0    466
1    466
Name: count, dtype: int64

Accuracy: 1.0

Confusion matrix:
 [[117   0]
 [  0   3]]

Classification report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       117
           1       1.00      1.00      1.00         3

    accuracy                           1.00       120
   macro avg       1.00      1.00      1.00       120
weighted avg       1.00      1.00      1.00       120


✅ Modelul Logistic Regression a fost salvat în 'scam_model_logreg.pkl'
✅ Vectorizatorul TF-IDF a fost salvat în 'tfidf_descriere.pkl'
