In [4]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-3.0.0-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.0-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   - -------------------------------------- 4.5/150.0 MB 24.5 MB/s eta 0:00:06
   --- ------------------------------------ 12.6/150.0 MB 31.6 MB/s eta 0:00:05
   ------ --------------------------------- 25.2/150.0 MB 40.9 MB/s eta 0:00:04
   ------- -------------------------------- 28.6/150.0 MB 34.9 MB/s eta 0:00:04
   ---------- ----------------------------- 40.6/150.0 MB 40.4 MB/s eta 0:00:03
   -------------- ------------------------- 53.2/150.0 MB 44.0 MB/s eta 0:00:03
   ---------------- ----------------------- 60.6/150.0 MB 42.9 MB/s eta 0:00:03
   ------------------ --------------------- 67.9/150.0 MB 42.9 MB/s eta 0:00:02
   ------------------- -------------------- 74.4/150.0 MB 40.6 MB/s eta 0:00:02
   --------------------- ------------------ 80.5/150.0 MB 39.5

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import re
from xgboost import XGBClassifier

# Load and preprocess the data
def preprocess_text(text):
    if pd.isna(text):
        return ""
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text

# Load the dataset
df = pd.read_csv('train.csv')
df = df.dropna(subset=['text', 'target', 'disaster_type'])
df['processed_text'] = df['text'].apply(preprocess_text)

# Prepare features (X) and targets (y)
X = df['processed_text']

# Encode target and disaster_type
le_disaster = LabelEncoder()
disaster_types_encoded = le_disaster.fit_transform(df['disaster_type'])
df['target'] = df['target'].astype(int)
y_disaster = df['target']
y_type = disaster_types_encoded

# Split the dataset
X_train, X_test, y_train_disaster, y_test_disaster = train_test_split(X, y_disaster, test_size=0.2, random_state=42)
X_train, X_test, y_train_type, y_test_type = train_test_split(X, y_type, test_size=0.2, random_state=42)

# Create TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Define parameter grid for GridSearch
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1],
    'subsample': [0.8, 1.0]
}

# Grid search for disaster classification
xgb_disaster = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
grid_search_disaster = GridSearchCV(xgb_disaster, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search_disaster.fit(X_train_vectorized, y_train_disaster)
print("Best Parameters for Disaster Classification:", grid_search_disaster.best_params_)

# Train best model
best_xgb_disaster = grid_search_disaster.best_estimator_
best_xgb_disaster.fit(X_train_vectorized, y_train_disaster)
predictions_disaster = best_xgb_disaster.predict(X_test_vectorized)
print("\nTarget (Disaster/Non-disaster) Classification Report:")
print(classification_report(y_test_disaster, predictions_disaster))

# Grid search for disaster type classification
xgb_type = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
grid_search_type = GridSearchCV(xgb_type, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search_type.fit(X_train_vectorized, y_train_type)
print("Best Parameters for Disaster Type Classification:", grid_search_type.best_params_)

# Train best model
best_xgb_type = grid_search_type.best_estimator_
best_xgb_type.fit(X_train_vectorized, y_train_type)
predictions_type = best_xgb_type.predict(X_test_vectorized)
print("\nDisaster Type Classification Report:")
print(classification_report(y_test_type, predictions_type))

# Function to make predictions
def predict_disaster(text):
    try:
        processed = preprocess_text(text)
        vectorized = vectorizer.transform([processed])
        is_disaster = "Disaster" if best_xgb_disaster.predict(vectorized)[0] == 1 else "Not a disaster"
        disaster_type = le_disaster.inverse_transform([best_xgb_type.predict(vectorized)[0]])[0]
        return is_disaster, disaster_type
    except Exception as e:
        print(f"Error in prediction: {e}")
        return "Unknown", "Unknown"

# Example usage
sample_texts = [
    "A huge forest fire has broken out in California",
    "I love the way the sun sets in the evening",
    "Earthquake magnitude 7.2 hits Japan coast"
]

print("\nPredictions for sample texts:")
for text in sample_texts:
    is_disaster, disaster_type = predict_disaster(text)
    print(f"\nText: {text}")
    print(f"Prediction: {is_disaster}")
    print(f"Disaster Type: {disaster_type}")

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best Parameters for Disaster Classification: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200, 'subsample': 0.8}


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Target (Disaster/Non-disaster) Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.92      0.88       780
           1       0.90      0.79      0.84       642

    accuracy                           0.86      1422
   macro avg       0.87      0.86      0.86      1422
weighted avg       0.87      0.86      0.86      1422



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best Parameters for Disaster Type Classification: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200, 'subsample': 0.8}


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Disaster Type Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.82      0.88        55
           1       0.98      0.89      0.93       182
           2       0.82      0.96      0.88        24
           3       0.88      0.95      0.91       517
           4       0.99      1.00      1.00       201
           5       0.00      0.00      0.00         9
           6       0.95      0.88      0.91       198
           7       0.94      0.92      0.93       236

    accuracy                           0.92      1422
   macro avg       0.81      0.80      0.81      1422
weighted avg       0.92      0.92      0.92      1422


Predictions for sample texts:

Text: A huge forest fire has broken out in California
Prediction: Disaster
Disaster Type: fire

Text: I love the way the sun sets in the evening
Prediction: Not a disaster
Disaster Type: medical

Text: Earthquake magnitude 7.2 hits Japan coast
Prediction: Disaster
Disaster Type: