In [None]:
import pandas as pd
import numpy as np

# Load the dataset
df = pd.read_csv('glo.csv', encoding='ISO-8859-1', low_memory=False)

# =====================================================================
# 1. Handle Missing Values
# =====================================================================

# List of columns to keep based on your specification
columns_to_keep = [
    'eventid', 'iyear', 'imonth', 'iday', 'approxdate', 'extended',
    'resolution', 'country', 'country_txt', 'region', 'region_txt',
    'provstate', 'city', 'latitude', 'longitude', 'specificity',
    'vicinity', 'location', 'crit1', 'crit2', 'crit3', 'doubtterr',
    'alternative', 'alternative_txt', 'multiple', 'success', 'suicide',
    'attacktype1', 'attacktype1_txt', 'attacktype2', 'attacktype2_txt',
    'attacktype3', 'attacktype3_txt', 'targtype1', 'targtype1_txt',
    'targsubtype1', 'targsubtype1_txt', 'corp1', 'target1', 'natlty1',
    'natlty1_txt'
]

# Filter only the specified columns
df = df[columns_to_keep]

# Drop columns with >70% missing values
threshold = len(df) * 0.30  # Keep columns with at least 30% data
df = df.dropna(thresh=threshold, axis=1)

# =====================================================================
# 2. Handle Date-related Columns
# =====================================================================

# Fix invalid dates
df['iday'] = df['iday'].replace(0, 1)
df['imonth'] = df['imonth'].replace(0, 1)

# Create date column
df['date'] = pd.to_datetime(
    df['iyear'].astype(str) + '-' + 
    df['imonth'].astype(str) + '-' + 
    df['iday'].astype(str),
    errors='coerce'
)

# =====================================================================
# 3. Clean Categorical Data
# =====================================================================

categorical_cols = [
    'country_txt', 'region_txt', 'provstate', 'city',
    'attacktype1_txt',
    'targtype1_txt', 'targsubtype1_txt', 'natlty1_txt'
]

for col in categorical_cols:
    # Fill NA and convert to category
    df[col] = df[col].fillna('Unknown').astype('category')
    
# =====================================================================
# 4. Clean Numerical Data
# =====================================================================

numerical_cols = [
    'latitude', 'longitude', 'specificity',
    'crit1', 'crit2', 'crit3', 'multiple', 'success', 'suicide'
]

for col in numerical_cols:
    # Fill NA with 0 for binary/coordinate columns
    df[col] = df[col].fillna(0).astype(float)

# Handle geographical outliers
df['latitude'] = df['latitude'].clip(-90, 90)
df['longitude'] = df['longitude'].clip(-180, 180)

# =====================================================================
# 5. Clean Text Data
# =====================================================================

text_cols = ['target1', 'corp1', 'location']
for col in text_cols:
    df[col] = df[col].str.strip().str.title().replace(
        ['Unknown', 'Unknown Group'], np.nan
    )

# =====================================================================
# 6. Handle Special Values
# =====================================================================

# Replace placeholder values
df.replace({
    -9: np.nan,
    -99: np.nan,
    -999: np.nan
}, inplace=True)

# =====================================================================
# 7. Final Cleaning
# =====================================================================

# Drop low-value columns
cols_to_drop = [
    'approxdate', 'resolution', 'doubtterr',
    'alternative', 'alternative_txt', 'natlty1'
]
df.drop(columns=cols_to_drop, errors='ignore', inplace=True)

# Remove duplicates
df.drop_duplicates(subset=['eventid'], keep='first', inplace=True)

# Reset index
df.reset_index(drop=True, inplace=True)

# Save cleaned data
df.to_csv('cleaned_terrorism_data.csv', index=False)

print("Cleaning complete!")
print(f"Original columns: {len(columns_to_keep)}")
print(f"Final columns: {len(df.columns)}")
print(f"Remaining columns:\n{df.columns.tolist()}")

Cleaning complete!
Original columns: 41


KeyError: 0

In [None]:
# import pandas as pd
# import numpy as np

# # Load the dataset
# df = pd.read_csv('glo.csv', encoding='ISO-8859-1', low_memory=False)

# # =====================================================================
# # 1. Handle Missing Values
# # =====================================================================

# # Drop columns with >60% missing values
# threshold = len(df) * 0.60
# df = df.dropna(thresh=threshold, axis=1)

# # Fill categorical missing values
# categorical_cols = ['region_txt', 'country_txt', 'attacktype1_txt', 'targtype1_txt']
# for col in categorical_cols:
#     df.fillna({col:'Unknown'}, inplace=True)

# # Fill numerical missing values
# numerical_cols = ['nkill', 'nwound', 'latitude', 'longitude']
# for col in numerical_cols:
#     df.fillna({col:0}, inplace=True)

# # Handle date-related columns
# df['iday'] = df['iday'].replace(0, 1)  # Replace 0 days with 1
# df['imonth'] = df['imonth'].replace(0, 1)  # Replace 0 months with 1

# # =====================================================================
# # 2. Data Type Conversion
# # =====================================================================

# # Convert dates to datetime format
# date_cols = ['iyear', 'imonth', 'iday']
# df['date'] = pd.to_datetime(df[date_cols].astype(str).agg('-'.join, axis=1), errors='coerce')

# # Convert categoricals to proper type
# df[categorical_cols] = df[categorical_cols].astype('category')

# # =====================================================================
# # 3. Handle Duplicates
# # =====================================================================

# # Remove exact duplicates
# df.drop_duplicates(subset=['eventid'], keep='first', inplace=True)

# # =====================================================================
# # 4. Standardize Text Data
# # =====================================================================

# # Clean text columns
# text_cols = ['city', 'target1', 'targsubtype1_txt','corp1']
# for col in text_cols:
#     df[col] = df[col].str.strip().str.title().replace('Unknown', np.nan)

# # =====================================================================
# # 5. Handle Special Values
# # =====================================================================

# # Replace placeholder values
# df.replace({-9: np.nan, -99: np.nan}, inplace=True)

# # Replace text placeholders only in text columns (not categorical columns)
# text_cols = ['city', 'target1', 'targsubtype1_txt', 'corp1']
# text_replacements = {'Unknown': np.nan, 'Unknown Group': np.nan}
# df[text_cols] = df[text_cols].replace(text_replacements)
# # =====================================================================
# # 6. Feature Engineering
# # =====================================================================

# # # Create casualty feature
# # df['casualties'] = df['nkill'] + df['nwound']

# # # Create decade feature
# # df['decade'] = (df['iyear'] // 10) * 10

# # =====================================================================
# # 7. Outlier Handling
# # =====================================================================

# # Handle geographical outliers
# df['latitude'] = df['latitude'].clip(-90, 90)
# df['longitude'] = df['longitude'].clip(-180, 180)

# # Cap casualty numbers at 99th percentile
# casualty_cap = df['casualties'].quantile(0.99)
# df['casualties'] = np.where(df['casualties'] > casualty_cap, casualty_cap, df['casualties'])

# # =====================================================================
# # 8. Final Cleaning
# # =====================================================================

# # Drop unnecessary columns
# cols_to_drop = ['approxdate', 'resolution', 'doubtterr', 'alternative', 'related']
# df.drop(columns=cols_to_drop, errors='ignore', inplace=True)

# # Reset index
# df.reset_index(drop=True, inplace=True)

# # Save cleaned data
# df.to_csv('cleaned_global_terrorism.csv', index=False)

# print("Data cleaning complete!")
# print(f"Original shape: {df.shape}")
# print(f"Cleaned shape: {df.shape}")

KeyError: 'casualties'

In [6]:
import pandas as pd
import numpy as np

# Load the dataset
df = pd.read_csv('glo.csv', encoding='ISO-8859-1', low_memory=False)

# =====================================================================
# 1. Handle Missing Values
# =====================================================================
threshold = len(df) * 0.60
df = df.dropna(thresh=threshold, axis=1)

categorical_cols = ['region_txt', 'country_txt', 'attacktype1_txt', 'targtype1_txt']
for col in categorical_cols:
    df[col].fillna('Unknown', inplace=True)

numerical_cols = ['nkill', 'nwound', 'latitude', 'longitude']
for col in numerical_cols:
    df[col].fillna(0, inplace=True)

df['iday'] = df['iday'].replace(0, 1)
df['imonth'] = df['imonth'].replace(0, 1)

# =====================================================================
# 2. Data Type Conversion
# =====================================================================
date_cols = ['iyear', 'imonth', 'iday']
df['date'] = pd.to_datetime(df[date_cols].astype(str).agg('-'.join, axis=1), errors='coerce')
df[categorical_cols] = df[categorical_cols].astype('category')

# =====================================================================
# 3. Handle Duplicates
# =====================================================================
df.drop_duplicates(subset=['eventid'], keep='first', inplace=True)

# =====================================================================
# 4. Standardize Text Data
# =====================================================================
text_cols = ['city', 'target1', 'targsubtype1_txt', 'corp1']
for col in text_cols:
    df[col] = df[col].str.strip().str.title().replace('Unknown', np.nan)

# =====================================================================
# 5. Handle Special Values (Corrected Section)
# =====================================================================
df.replace({-9: np.nan, -99: np.nan}, inplace=True)  # Numerical replacements
text_replacements = {'Unknown': np.nan, 'Unknown Group': np.nan}
df[text_cols] = df[text_cols].replace(text_replacements)  # Text column replacements

# =====================================================================
# 6-8. Remaining Steps (Unchanged)
# =====================================================================
df['casualties'] = df['nkill'] + df['nwound']
df['decade'] = (df['iyear'] // 10) * 10

df['latitude'] = df['latitude'].clip(-90, 90)
df['longitude'] = df['longitude'].clip(-180, 180)

casualty_cap = df['casualties'].quantile(0.99)
df['casualties'] = np.where(df['casualties'] > casualty_cap, casualty_cap, df['casualties'])

cols_to_drop = ['approxdate', 'resolution', 'doubtterr', 'alternative', 'related']
df.drop(columns=cols_to_drop, errors='ignore', inplace=True)

df.reset_index(drop=True, inplace=True)
df.to_csv('cleaned_global_terrorism.csv', index=False)

print("Data cleaning complete!")
print(f"Original shape: {df.shape}")
print(f"Cleaned shape: {df.shape}")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna('Unknown', inplace=True)


KeyError: 'nkill'