# Zadanie: Random Forest

Podziel zbiór na treningowy i testowy, pamiętaj o stratyfikacji. Następnie naucz model Random Forest, z którego wyciągnij feature importance. Na podstawie tego wykonaj selekcję cech i weź jedynie te, których ważność jest większa niż 0.001. Nauczy nowy model Random Forest z wyborem hiperperparametrów, korzystając z GridSearch. Wykorzystaj poznane techniki do wektoryzacji.

In [10]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# 1. Ładowanie danych
spam_dataset = pd.read_csv('spam.csv', encoding="ISO-8859-1", usecols=[0, 1], names=['Spam', 'Text'], skiprows=1)
spam_dataset['Spam'] = spam_dataset['Spam'].map({'ham': 0, 'spam': 1}).astype(int)

# 2. Dodanie cech dodatkowych
spam_dataset['Text_Length'] = spam_dataset['Text'].apply(len)
spam_dataset['Num_Words'] = spam_dataset['Text'].apply(lambda x: len(x.split()))
spam_dataset['Has_Link'] = spam_dataset['Text'].str.contains('http').astype(int)

# 3. Przygotowanie tekstu - czyszczenie i lematyzacja
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    # Usuwanie znaków specjalnych i cyfr
    text = ''.join([char for char in text if char.isalpha() or char == ' '])
    # Zamiana na małe litery
    text = text.lower()
    # Usuwanie stop words
    words = text.split()
    words = [word for word in words if word not in stop_words]
    # Lematyzacja
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

spam_dataset['Cleaned_Text'] = spam_dataset['Text'].apply(clean_text)

# 4. Ulepszona wektoryzacja TF-IDF
tfidf = TfidfVectorizer(
    ngram_range=(1, 3),
    max_features=10000,
    min_df=2,
    max_df=0.8,
    stop_words='english'
)

X_text = tfidf.fit_transform(spam_dataset['Cleaned_Text'])
X_features = spam_dataset[['Text_Length', 'Num_Words', 'Has_Link']].values
X = np.hstack([X_text.toarray(), X_features])
y = spam_dataset['Spam']

# 5. Podział danych
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 6. Model z balansowaniem klas
rf_balanced = RandomForestClassifier(
    class_weight='balanced',
    random_state=42
)
rf_balanced.fit(X_train, y_train)

# 7. Ewaluacja podstawowego modelu
y_pred_balanced = rf_balanced.predict(X_test)
print("\nPodstawowy model z balansowaniem klas:")
print(classification_report(y_test, y_pred_balanced))
print(f"Dokładność: {accuracy_score(y_test, y_pred_balanced):.4f}")

# 8. Selekcja cech
feature_importances = rf_balanced.feature_importances_
important_features = feature_importances > 0.001
X_train_selected = X_train[:, important_features]
X_test_selected = X_test[:, important_features]

# 9. Strojenie hiperparametrów
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_tuned = GridSearchCV(
    RandomForestClassifier(class_weight='balanced', random_state=42),
    param_grid,
    cv=5,
    n_jobs=-1,
    verbose=1
)
rf_tuned.fit(X_train_selected, y_train)

# 10. Ocena dostrojonego modelu
print("\nNajlepsze parametry:", rf_tuned.best_params_)
y_pred_tuned = rf_tuned.predict(X_test_selected)
print("\nDostrojony model TF-IDF:")
print(classification_report(y_test, y_pred_tuned))
print(f"Dokładność: {accuracy_score(y_test, y_pred_tuned):.4f}")

# 11. Alternatywna wektoryzacja - CountVectorizer
count_vectorizer = CountVectorizer(
    max_features=5000,
    ngram_range=(1, 3),  # Zwiększenie do trigramów
    min_df=2,
    max_df=0.7,
    stop_words='english'
)

X_count_text = count_vectorizer.fit_transform(spam_dataset['Cleaned_Text'])
X_count = np.hstack([X_count_text.toarray(), X_features])  # Dodajemy te same cechy dodatkowe

X_train_count, X_test_count, y_train_count, y_test_count = train_test_split(
    X_count, y, test_size=0.2, random_state=42, stratify=y
)

rf_count = RandomForestClassifier(
    n_estimators=100,
    max_depth=20,
    min_samples_split=5,
    min_samples_leaf=2,
    class_weight='balanced',
    random_state=42
)
rf_count.fit(X_train_count, y_train_count)

y_pred_count = rf_count.predict(X_test_count)
print("\nModel z CountVectorizer:")
print(classification_report(y_test_count, y_pred_count))
print(f"Dokładność: {accuracy_score(y_test_count, y_pred_count):.4f}")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!



Podstawowy model z balansowaniem klas:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       1.00      0.84      0.91       149

    accuracy                           0.98      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115

Dokładność: 0.9785
Fitting 5 folds for each of 108 candidates, totalling 540 fits

Najlepsze parametry: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}

Dostrojony model TF-IDF:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       0.98      0.87      0.92       149

    accuracy                           0.98      1115
   macro avg       0.98      0.93      0.96      1115
weighted avg       0.98      0.98      0.98      1115

Dokładność: 0.9803

Model z CountVectorizer:
              precision    recall  f1-scor