# EM-DAT Data Cleanup 

CSCI 4622 | Introduction to Machine Learning
Team Disaster | 

This notebook is responsible for cleaning up the EM-DAT database. 

## Step 1: Data Acquisition

In [None]:
# Import necessary libraries
import pandas as pd

# Read the CSV at `file_path` into the working DataFrame and preview it.
file_path = 'your_emdat_file.csv'  # Replace with the actual file path
data = pd.read_csv(filepath_or_buffer=file_path)
# Show the first five rows as the cell output.
data.head(n=5)

## Step 2: Data Understanding

In [None]:
# Print the frame's structural summary (info() writes to stdout).
data.info()

# Build summary statistics over every column, not only the numeric ones,
# and leave the table as the cell's displayed value.
summary_stats = data.describe(include='all')
summary_stats

## Step 3: Data Cleaning

### Handling Missing Values

In [None]:
# Count null entries per column, then display only the columns that
# actually contain at least one missing value.
missing_values = data.isna().sum()
missing_values.loc[missing_values.gt(0)]

### Removing Duplicates

In [None]:
# Count rows that are exact repeats of an earlier row.
duplicate_mask = data.duplicated()
duplicates = duplicate_mask.sum()
print('Duplicates: {}'.format(duplicates))

# Keep the first occurrence of each row and report the resulting shape.
data = data.drop_duplicates()
print('Data shape after removing duplicates: {}'.format(data.shape))

### Correcting Inconsistencies

In [None]:
# Example: convert all date columns to pandas datetime values.
date_columns = ['Start_Date', 'End_Date']  # Replace with actual date columns
for col in date_columns:
    parsed = pd.to_datetime(data[col], errors='coerce')
    # errors='coerce' replaces unparseable entries with NaT without raising.
    # Count the entries that were present before parsing but are NaT after it,
    # so that silently dropped dates are visible in the cell output.
    n_coerced = int((parsed.isna() & data[col].notna()).sum())
    if n_coerced:
        print(f'{col}: {n_coerced} value(s) could not be parsed and were set to NaT')
    data[col] = parsed

# Check data types
data.dtypes

### Outlier Detection

In [None]:
# Visual outlier check: one box per numeric column of `data`.
import matplotlib.pyplot as plt
import seaborn as sns

# Restrict the plot to numeric columns.
numeric_data = data.select_dtypes(include='number')

fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=numeric_data, ax=ax)
# Rotate the x tick labels to vertical.
plt.setp(ax.get_xticklabels(), rotation=90)
plt.show()

## Step 4: Data Transformation

### Feature Engineering

In [None]:
# Example feature: whole days elapsed between the start and end dates.
# Relies on both columns already holding datetime values.
elapsed = data['End_Date'].sub(data['Start_Date'])
data['Disaster_Duration'] = elapsed.dt.days

# Preview the two source columns next to the derived one.
data.loc[:, ['Start_Date', 'End_Date', 'Disaster_Duration']].head()

### Encoding Categorical Variables

In [None]:
# Replace each listed categorical column with indicator (dummy) columns.
# drop_first=True omits the first level of every encoded column.
categorical_columns = ['Disaster_Type', 'Region']
data = pd.get_dummies(
    data,
    columns=categorical_columns,
    drop_first=True,
)
data.head()

### Scaling and Normalization

In [None]:
# Standardize every numeric column of `data` in place with StandardScaler.
from sklearn.preprocessing import StandardScaler

# NOTE(review): the scaler is fit on the whole frame, and the train/test split
# happens in a later cell, so test rows contribute to the fitted statistics.
# Every numeric column is selected, which would include 'Target_Variable' if
# it is numeric. Confirm whether either is intended.
num_columns = data.select_dtypes(include='number').columns
scaler = StandardScaler()
scaled_values = scaler.fit_transform(data[num_columns])
data[num_columns] = scaled_values
data.head()

## Step 5: Data Integration

In [None]:
# Integrate with additional datasets (example placeholder).
# Nothing in this cell executes: the two lines below are a commented-out
# template. To use it, uncomment them, point read_csv at a real file, and
# replace 'Common_Column' with a key column present in both frames.
# additional_data = pd.read_csv('additional_data.csv')
# data = data.merge(additional_data, on='Common_Column')

## Step 6: Data Splitting

In [None]:
# Hold out 20% of the rows as a test set, using a fixed seed so the
# partition is the same on every run.
from sklearn.model_selection import train_test_split

y = data['Target_Variable']
X = data.drop(columns='Target_Variable')  # Replace with your target variable

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print('Training set size: {}'.format(X_train.shape))
print('Test set size: {}'.format(X_test.shape))

## Step 7: Data Validation

In [None]:
# Exploratory data analysis (EDA) to ensure data readiness.
# Plot pairwise relationships on a random subset of at most 100 rows.
# DataFrame.sample raises ValueError when asked for more rows than exist
# (with the default replace=False), so cap the request at the frame's length.
pairplot_sample_size = min(100, len(data))
# A fixed seed makes the sampled rows, and so the figure, repeatable across
# runs; 42 matches the seed used for the train/test split.
pairplot_sample = data.sample(n=pairplot_sample_size, random_state=42)
sns.pairplot(pairplot_sample)
plt.show()