In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import re

# Load and preprocess the data
def preprocess_text(text):
    if pd.isna(text):
        return ""
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text

# Load the dataset
df = pd.read_csv('train.csv')
df = df.dropna(subset=['text', 'target', 'disaster_type'])
df['processed_text'] = df['text'].apply(preprocess_text)

# Prepare features (X) and targets (y)
X = df['processed_text']

# Encode target and disaster_type
le_disaster = LabelEncoder()
disaster_types_encoded = le_disaster.fit_transform(df['disaster_type'])
df['target'] = df['target'].astype(int)
y_disaster = df['target']
y_type = disaster_types_encoded

# Split the dataset
X_train, X_test, y_train_disaster, y_test_disaster = train_test_split(X, y_disaster, test_size=0.2, random_state=42)
X_train, X_test, y_train_type, y_test_type = train_test_split(X, y_type, test_size=0.2, random_state=42)

# Create TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Define parameter grid for GridSearch
param_grid = {
    'C': [0.1, 1, 10, 100],
    'max_iter': [100, 200, 500]
}

# Grid search for disaster classification
log_reg_disaster = LogisticRegression(random_state=42)
grid_search_disaster = GridSearchCV(log_reg_disaster, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search_disaster.fit(X_train_vectorized, y_train_disaster)
print("Best Parameters for Disaster Classification:", grid_search_disaster.best_params_)

# Train best model
best_log_reg_disaster = grid_search_disaster.best_estimator_
best_log_reg_disaster.fit(X_train_vectorized, y_train_disaster)
predictions_disaster = best_log_reg_disaster.predict(X_test_vectorized)
print("\nTarget (Disaster/Non-disaster) Classification Report:")
print(classification_report(y_test_disaster, predictions_disaster))

# Grid search for disaster type classification
log_reg_type = LogisticRegression(random_state=42)
grid_search_type = GridSearchCV(log_reg_type, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search_type.fit(X_train_vectorized, y_train_type)
print("Best Parameters for Disaster Type Classification:", grid_search_type.best_params_)

# Train best model
best_log_reg_type = grid_search_type.best_estimator_
best_log_reg_type.fit(X_train_vectorized, y_train_type)
predictions_type = best_log_reg_type.predict(X_test_vectorized)
print("\nDisaster Type Classification Report:")
print(classification_report(y_test_type, predictions_type))

# Function to make predictions
def predict_disaster(text):
    try:
        processed = preprocess_text(text)
        vectorized = vectorizer.transform([processed])
        is_disaster = "Disaster" if best_log_reg_disaster.predict(vectorized)[0] == 1 else "Not a disaster"
        disaster_type = le_disaster.inverse_transform([best_log_reg_type.predict(vectorized)[0]])[0]
        return is_disaster, disaster_type
    except Exception as e:
        print(f"Error in prediction: {e}")
        return "Unknown", "Unknown"

# Example usage
sample_texts = [
    "A huge forest fire has broken out in California",
    "I love the way the sun sets in the evening",
    "Earthquake magnitude 7.2 hits Japan coast"
]

print("\nPredictions for sample texts:")
for text in sample_texts:
    is_disaster, disaster_type = predict_disaster(text)
    print(f"\nText: {text}")
    print(f"Prediction: {is_disaster}")
    print(f"Disaster Type: {disaster_type}")

Best Parameters for Disaster Classification: {'C': 10, 'max_iter': 100}

Target (Disaster/Non-disaster) Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.90      0.90       780
           1       0.88      0.87      0.87       642

    accuracy                           0.89      1422
   macro avg       0.88      0.88      0.88      1422
weighted avg       0.89      0.89      0.89      1422

Best Parameters for Disaster Type Classification: {'C': 100, 'max_iter': 100}

Disaster Type Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.80      0.88        55
           1       0.95      0.87      0.91       182
           2       0.90      0.79      0.84        24
           3       0.88      0.95      0.92       517
           4       1.00      1.00      1.00       201
           5       0.00      0.00      0.00         9
           6       0.91      0.87      0.89       1

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
