## 1. Download and install necessary packages

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
pip install tika

Note: you may need to restart the kernel to use updated packages.


In [5]:
pip install number-parser

Note: you may need to restart the kernel to use updated packages.


In [8]:
pip install datefinder

Note: you may need to restart the kernel to use updated packages.


## 2. Load the cleaned Haunted Places dataset

In [10]:
import pandas as pd
df = pd.read_csv("../Data/cleaned_haunted_places.tsv", sep='\t')

## 3. Extract features from the "description" field
### 3. a) Audio Evidence

In [12]:
import re

def has_audio_evidence(description):
    audio_keywords = ["noises", "sound of snapping neck", "nursery rhymes"]
    return any(re.search(rf'\b{keyword}\b', description, re.IGNORECASE) for keyword in audio_keywords)

df['Audio Evidence'] = df['description'].apply(has_audio_evidence)

### 3. a) Image/Video/Visual Evidence

In [14]:
def has_visual_evidence(description):
    visual_keywords = ["cameras", "take pictures", "names of children written on walls"]
    return any(re.search(rf'\b{keyword}\b', description, re.IGNORECASE) for keyword in visual_keywords)

df['Image/Video/Visual Evidence'] = df['description'].apply(has_visual_evidence)

### 3. b) Haunted Places Date

In [16]:
import datefinder
import datetime
from datetime import date

def extract_date(description):
    try:
        # Attempt to find dates in the description
        matches = datefinder.find_dates(description)
        
        # Extract the first valid date
        for match in matches:
            return match.date()  # Return only the date part
        
    except Exception:
        # Silently handle errors without printing messages
        pass
    
    # Fallback to '2025-01-01' if no valid date is found or an error occurs
    return datetime.date(2025, 1, 1)

# Apply the function to the 'description' column
df['Haunted Places Date'] = df['description'].apply(extract_date)

In [17]:
df['Haunted Places Date']

0        2025-03-03
1        2025-03-01
2        2025-01-01
3        0211-03-11
4        2025-01-01
            ...    
10969    2025-03-12
10970    2025-01-01
10971    2025-03-18
10972    2025-01-01
10973    2025-01-01
Name: Haunted Places Date, Length: 10974, dtype: object

### 3. c) Haunted Places Witness Count

In [19]:
from number_parser import parse_number

def preprocess_description(description):
    """Replace vague phrases with estimated numbers to aid extraction."""
    description = description.lower()
    # Replace ambiguous phrases with approximate numbers
    replacements = {
        "some": "3",
        "a few": "3",
        "several": "5",
        "many": "10",
        "a lot": "10",
        "a handful": "5",
        "numerous": "10",
        "countless": "15",
        "dozens": "12",
        "scores": "20",
        "hundreds": "100",
        "a couple": "2"
    }
    
    for word, num in replacements.items():
        description = re.sub(rf"\b{word}\b", num, description)  # Whole-word replacement
    return description


def extract_numbers_from_text(text):
    """Extract numerical values from written-out numbers in the text."""
    # Define a regex pattern to match written-out numbers
    number_words = r'\b(?:zero|one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|twenty|thirty|forty|fifty|sixty|seventy|eighty|ninety|hundred|thousand|million|billion)\b'
    
    # Find all matches of written-out numbers
    matches = re.findall(number_words, text.lower())
    
    # Parse each match into a numerical value
    numbers = [parse_number(match) for match in matches if parse_number(match) is not None]
    
    # Filter out irrelevant numbers (e.g., years, small numbers)
    filtered_numbers = [num for num in numbers if not (1900 <= num <= 2100)]  # Remove years
    filtered_numbers = [num for num in filtered_numbers if num > 1]  # Ignore small numbers
    
    return filtered_numbers


def extract_witness_count(description):
    """
    Extract witness count from a haunted place description.
    Returns a tuple (witness_count, method) where method indicates how the count was derived.
    """
    try:
        # Step 1: Preprocess the description
        preprocessed_text = preprocess_description(description)
        
        # Step 2: Extract numbers from the text
        numbers = extract_numbers_from_text(preprocessed_text)
        
        # Step 3: Return the first valid number found, or 0 if no numbers are found
        if numbers:
            return numbers[0], "explicit_number"
        
        # Step 4: Default to 0 if no numbers are found
        return 0, "default"
    
    except Exception as e:
        print(f"Error parsing witness count from description: {description[:100]}... Error: {e}")
        return 0, "error"


# Apply the function to the 'description' column
df['Haunted Places Witness Count'] = df['description'].apply(
    lambda desc: extract_witness_count(desc)[0]  # Extract only the count (not the method)
)

# Display the updated columns
print(df[['description', 'Haunted Places Witness Count']])

                                             description  \
0      Ada witch - Sometimes you can see a misty blue...   
1      A little girl was killed suddenly while waitin...   
2      If you take Gorman Rd. west towards Sand Creek...   
3      In the 1970's, one room, room 211, in the old ...   
4      Kappa Delta Sorority - The Kappa Delta Sororit...   
...                                                  ...   
10969  at 12 midnight you can see a lady with two lit...   
10970  Is haunted by the victims of a murder that hap...   
10971  The institution was for kids 18 years old and ...   
10972  Gymnasium -  their have been reports of a litt...   
10973  Cadets from the Air Force Academy participatin...   

       Haunted Places Witness Count  
0                                 0  
1                                 0  
2                                 0  
3                                 2  
4                                 0  
...                             ...  
10969        

### 3. e) Time of Day

In [21]:
def extract_time_of_day(description):
    time_keywords = {"evening": "Evening", "morning": "Morning", "dusk": "Dusk"}
    for keyword, time_of_day in time_keywords.items():
        if re.search(rf'\b{keyword}\b', description, re.IGNORECASE):
            return time_of_day
    return "Unknown"

df['Time of Day'] = df['description'].apply(extract_time_of_day)

### 3. d) Apparition Type

In [23]:
def extract_apparition_type(description):
    apparition_keywords = {
        "ghost": "Ghost",
        "orb": "Orb",
        "ufo": "UFO",
        "uap": "UAP",
        "male": "Male",
        "female": "Female",
        "child": "Child",
        "several ghosts": "Several Ghosts"
    }
    for keyword, apparition_type in apparition_keywords.items():
        if re.search(rf'\b{keyword}\b', description, re.IGNORECASE):
            return apparition_type
    return "Unknown"

df['Apparition Type'] = df['description'].apply(extract_apparition_type)

### 3. g) Event type

In [25]:
def extract_event_type(description):
    event_keywords = {
        "murder": "Murder",
        "die": "Death",
        "supernatural": "Supernatural Phenomenon"
    }
    for keyword, event_type in event_keywords.items():
        if re.search(rf'\b{keyword}\b', description, re.IGNORECASE):
            return event_type
    return "Unknown"

df['Event Type'] = df['description'].apply(extract_event_type)

In [26]:
#to check
visual_evidence_records = df[df['Haunted Places Witness Count'] == 11]

# Display the filtered records
print(visual_evidence_records)

          city        country  \
1518     Lenox  United States   
3390  Woodward  United States   
8509     Paris  United States   

                                            description  \
1518  An eleven year old girl, whose last name is Sl...   
3390  This coffee house was originally a doctor's of...   
8509  Most people get a bad feeling just looking at ...   

                                 location          state state_abbrev  \
1518                     Cranewell Resort  Massachusetts           MA   
3390                     Leos Coffeehouse       Oklahoma           OK   
8509  Old Plantation home in Slate Shoals          Texas           TX   

      longitude   latitude  city_longitude  city_latitude  Audio Evidence  \
1518 -73.267236  42.341822      -73.284876      42.356461           False   
3390 -99.393019  36.434108      -99.390386      36.433648           False   
8509 -95.555513  33.660939      -95.555513      33.660939           False   

      Image/Video/Visual Evi

# 4. Merge the Alcohol Abuse Dataset

In [35]:
#  Haunted Places dataset (TSV format)
df.columns = [col.strip().lower() for col in df.columns]

# Load Alcohol Abuse dataset (appears to be comma-separated)
alcohol_df = pd.read_csv("../Data/alcohol_abuse.tsv", sep=",")
alcohol_df.columns = [col.strip().lower() for col in alcohol_df.columns]

# Rename the Alcohol Abuse column to ensure it has a common key ("state")
if "state" not in alcohol_df.columns and "state_name" in alcohol_df.columns:
    alcohol_df.rename(columns={"state_name": "state"}, inplace=True)

# Check that both DataFrames have the 'state' column
if "state" not in df.columns or "state" not in alcohol_df.columns:
    raise KeyError("The 'state' column is missing from one of the datasets.")

# Merge the datasets on the 'state' column using a left join
merged_df = pd.merge(df, alcohol_df, on="state", how="left")

# Save the merged dataset as a TSV file
merged_df.to_csv("../Data/haunted_places_with_alcohol.tsv", sep="\t", index=False)
print("Merge completed: {} rows merged.".format(merged_df.shape[0]))

merged_df

Merge completed: 10974 rows merged.


Unnamed: 0,city,country,description,location,state,state_abbrev,longitude,latitude,city_longitude,city_latitude,...,death ratio (1 per x adults),deaths per 10k adults,% male deaths,% chronic causes deaths,% deaths in adults 35+,% under 21 deaths,cdc years of potential life lost,taxpayer spending 2010 (billion usd),adjusted spending 2022 (billion usd),cost per drink (usd)
0,Ada,United States,Ada witch - Sometimes you can see a misty blue...,Ada Cemetery,Michigan,MI,-85.504893,42.962106,-85.495480,42.960727,...,2216.0,5.75,67.9,59.6,84.44,2.57,115890.0,8.162,11.018,2.84
1,Addison,United States,A little girl was killed suddenly while waitin...,North Adams Rd.,Michigan,MI,-84.381843,41.971425,-84.347168,41.986434,...,2216.0,5.75,67.9,59.6,84.44,2.57,115890.0,8.162,11.018,2.84
2,Adrian,United States,If you take Gorman Rd. west towards Sand Creek...,Ghost Trestle,Michigan,MI,-84.035656,41.904538,-84.037166,41.897547,...,2216.0,5.75,67.9,59.6,84.44,2.57,115890.0,8.162,11.018,2.84
3,Adrian,United States,"In the 1970's, one room, room 211, in the old ...",Siena Heights University,Michigan,MI,-84.017565,41.905712,-84.037166,41.897547,...,2216.0,5.75,67.9,59.6,84.44,2.57,115890.0,8.162,11.018,2.84
4,Albion,United States,Kappa Delta Sorority - The Kappa Delta Sororit...,Albion College,Michigan,MI,-84.745177,42.244006,-84.753030,42.243097,...,2216.0,5.75,67.9,59.6,84.44,2.57,115890.0,8.162,11.018,2.84
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10969,Westminster,United States,at 12 midnight you can see a lady with two lit...,city hall,Colorado,CO,-105.048936,39.862610,-105.037205,39.836653,...,2201.0,5.82,67.7,61.3,83.10,2.78,71300.0,5.057,6.826,2.89
10970,Westminster,United States,Is haunted by the victims of a murder that hap...,Pillar of Fire,Colorado,CO,-105.032091,39.847237,-105.037205,39.836653,...,2201.0,5.82,67.7,61.3,83.10,2.78,71300.0,5.057,6.826,2.89
10971,Wheat Ridge,United States,The institution was for kids 18 years old and ...,Ridge Mental Institution,Colorado,CO,-105.063974,39.769726,-105.077206,39.766098,...,2201.0,5.82,67.7,61.3,83.10,2.78,71300.0,5.057,6.826,2.89
10972,Wheat Ridge,United States,Gymnasium - their have been reports of a litt...,Wheat Ridge Middle School,Colorado,CO,-105.103613,39.764055,-105.077206,39.766098,...,2201.0,5.82,67.7,61.3,83.10,2.78,71300.0,5.057,6.826,2.89


# 5 Merge the Daylight Dataset
### 5. a) Clean the Daylight Dataset

In [38]:
pip install selenium

Note: you may need to restart the kernel to use updated packages.


In [39]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import pandas as pd

# Set up Selenium to run headlessly (no browser window)
options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(options=options)  # Ensure chromedriver is in your PATH

# URL for the daylight data page (modify if needed)
url = "https://www.timeanddate.com/astronomy/usa"
driver.get(url)
driver.implicitly_wait(10)  # Wait for dynamic content to load

# Get the full page source after dynamic content has loaded
html = driver.page_source
driver.quit()

# Parse the HTML with BeautifulSoup
soup = BeautifulSoup(html, "html.parser")

# Find all table elements on the page
tables = soup.find_all("table")
print("Number of tables found:", len(tables))

# Extract each table into a DataFrame and combine them
dfs = [pd.read_html(str(table))[0] for table in tables]
if len(dfs) > 1:
    full_df = pd.concat(dfs, ignore_index=True)
else:
    full_df = dfs[0]

# Save the full dataset as a TSV file
full_df.to_csv("../Data/daylight_hours_full.tsv", sep="\t", index=False)
print("Full daylight data saved to daylight_hours_full.tsv")

Number of tables found: 3
Full daylight data saved to daylight_hours_full.tsv


**Extract only the state codes and create a separate State column.**

In [44]:
# Load the dataset
file_path = "../Data/daylight.csv"
daylight_df = pd.read_csv(file_path)

# Extract state codes from the location column
daylight_df["State"] = daylight_df.iloc[:, 0].str.extract(r'\((\w+)\)')

# Remove the original location column
daylight_df.drop(columns=[daylight_df.columns[0]], inplace=True)

# Display the updated dataset
print(daylight_df.head())


  Sunrise in United States Sunset in United States State
0                  8:26 AM                 7:31 PM    AK
1                  6:25 AM                 5:48 PM    NY
2                  6:32 AM                 6:04 PM    NM
3                  6:44 AM                 6:08 PM    IA
4                  7:52 AM                 6:31 PM    AK


In [46]:
import datetime

# Function to convert time to 24-hour float format
def convert_to_hours(time_str):
    if pd.isna(time_str) or time_str is None:  # Avoid NaN issues
        return None
    try:
        time_str = time_str.strip()  # Remove extra spaces
        return datetime.datetime.strptime(time_str, "%I:%M %p").hour + \
               datetime.datetime.strptime(time_str, "%I:%M %p").minute / 60
    except ValueError:
        return None  # Return None if format is incorrect

# Apply conversion to all "Sunrise" and "Sunset" columns
for col in daylight_df.columns:
    if "Sunrise" in col or "Sunset" in col:
        daylight_df[col] = daylight_df[col].astype(str).apply(convert_to_hours)

print(daylight_df.head())


   Sunrise in United States  Sunset in United States State
0                  8.433333                19.516667    AK
1                  6.416667                17.800000    NY
2                  6.533333                18.066667    NM
3                  6.733333                18.133333    IA
4                  7.866667                18.516667    AK


In [48]:
# Calculate daylight duration for each city
daylight_df["Daylight_Hours"] = daylight_df["Sunset in United States"] - daylight_df["Sunrise in United States"]

# Compute the average daylight hours per state
state_daylight_df = daylight_df.groupby("State")["Daylight_Hours"].mean().reset_index()

# Rename the column
state_daylight_df.rename(columns={"Daylight_Hours": "Avg_Daylight_Hours"}, inplace=True)

# Display results
print(state_daylight_df.head())


  State  Avg_Daylight_Hours
0    AK           10.790000
1    AL           11.616667
2    AR           11.550000
3    AZ           11.566667
4    CA           11.500000


In [50]:
# Compute sunrise variability (std deviation of sunrise times per state)
state_daylight_df["Sunrise_Variability"] = daylight_df.groupby("State")["Sunrise in United States"].std().reset_index(drop=True)

# If a state has only one city, set Sunrise_Variability to 0
state_daylight_df["Sunrise_Variability"] = state_daylight_df["Sunrise_Variability"].fillna(0)

print(state_daylight_df.head())

  State  Avg_Daylight_Hours  Sunrise_Variability
0    AK           10.790000             0.776620
1    AL           11.616667             0.000000
2    AR           11.550000             0.000000
3    AZ           11.566667             0.000000
4    CA           11.500000             0.184316


In [52]:
# Compute the range of daylight hours per state
state_daylight_df["Daylight_Hours_Range"] = daylight_df.groupby("State")["Daylight_Hours"].apply(lambda x: x.max() - x.min()).reset_index(drop=True)

# Display results
state_daylight_df


Unnamed: 0,State,Avg_Daylight_Hours,Sunrise_Variability,Daylight_Hours_Range
0,AK,10.79,0.77662,0.6833333
1,AL,11.616667,0.0,0.0
2,AR,11.55,0.0,0.0
3,AZ,11.566667,0.0,0.0
4,CA,11.5,0.184316,0.1166667
5,CO,11.416667,0.0,0.0
6,CT,11.416667,0.0,0.0
7,DC,11.483333,0.0,0.0
8,FL,11.691667,0.223917,0.08333333
9,GA,11.583333,0.0,0.0


In [59]:
state_daylight_df.to_csv("../Data/daylight_cleaned.csv", index=False)

### 5. b) Merge the Daylight Dataset

In [62]:
import pandas as pd

# Load state_daylight_df and haunted_places_with_alcohol.tsv
state_daylight_file = "../Data/daylight_cleaned.csv"
haunted_alcohol_file = "../Data/haunted_places_with_alcohol.tsv"

state_daylight_df = pd.read_csv(state_daylight_file)
merged_df = pd.read_csv(haunted_alcohol_file, sep="\t")

# Ensure daylight_df uses "state_abbrev" to match merged_df
state_daylight_df.rename(columns={"State": "state_abbrev"}, inplace=True)

# Ensure `state_abbrev` formatting is consistent (uppercase & stripped spaces)
state_daylight_df["state_abbrev"] = state_daylight_df["state_abbrev"].str.upper().str.strip()
merged_df["state_abbrev"] = merged_df["state_abbrev"].str.upper().str.strip()

# Select Only the Needed Columns from `state_daylight_df`
# Ensure only the correct daylight columns are merged
daylight_columns = ["state_abbrev", "Avg_Daylight_Hours", "Sunrise_Variability", "Daylight_Hours_Range"]
state_daylight_df = state_daylight_df[daylight_columns]

# Merge datasets using `state_abbrev`, placing daylight data at the rightmost columns
merged_df = merged_df.merge(state_daylight_df, on="state_abbrev", how="left")

# Save and Preview the Final Dataset
merged_df.to_csv("../Data/haunted_places_with_alcohol_daylight.tsv", sep="\t", index=False)
print("✅ Final dataset saved as 'haunted_places_with_alcohol_daylight.tsv'.")

merged_df


✅ Final dataset saved as 'haunted_places_with_alcohol_daylight.tsv'.


Unnamed: 0,city,country,description,location,state,state_abbrev,longitude,latitude,city_longitude,city_latitude,...,% chronic causes deaths,% deaths in adults 35+,% under 21 deaths,cdc years of potential life lost,taxpayer spending 2010 (billion usd),adjusted spending 2022 (billion usd),cost per drink (usd),Avg_Daylight_Hours,Sunrise_Variability,Daylight_Hours_Range
0,Ada,United States,Ada witch - Sometimes you can see a misty blue...,Ada Cemetery,Michigan,MI,-85.504893,42.962106,-85.495480,42.960727,...,59.6,84.44,2.57,115890.0,8.162,11.018,2.84,11.391667,0.070711,0.016667
1,Addison,United States,A little girl was killed suddenly while waitin...,North Adams Rd.,Michigan,MI,-84.381843,41.971425,-84.347168,41.986434,...,59.6,84.44,2.57,115890.0,8.162,11.018,2.84,11.391667,0.070711,0.016667
2,Adrian,United States,If you take Gorman Rd. west towards Sand Creek...,Ghost Trestle,Michigan,MI,-84.035656,41.904538,-84.037166,41.897547,...,59.6,84.44,2.57,115890.0,8.162,11.018,2.84,11.391667,0.070711,0.016667
3,Adrian,United States,"In the 1970's, one room, room 211, in the old ...",Siena Heights University,Michigan,MI,-84.017565,41.905712,-84.037166,41.897547,...,59.6,84.44,2.57,115890.0,8.162,11.018,2.84,11.391667,0.070711,0.016667
4,Albion,United States,Kappa Delta Sorority - The Kappa Delta Sororit...,Albion College,Michigan,MI,-84.745177,42.244006,-84.753030,42.243097,...,59.6,84.44,2.57,115890.0,8.162,11.018,2.84,11.391667,0.070711,0.016667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10969,Westminster,United States,at 12 midnight you can see a lady with two lit...,city hall,Colorado,CO,-105.048936,39.862610,-105.037205,39.836653,...,61.3,83.10,2.78,71300.0,5.057,6.826,2.89,11.416667,0.000000,0.000000
10970,Westminster,United States,Is haunted by the victims of a murder that hap...,Pillar of Fire,Colorado,CO,-105.032091,39.847237,-105.037205,39.836653,...,61.3,83.10,2.78,71300.0,5.057,6.826,2.89,11.416667,0.000000,0.000000
10971,Wheat Ridge,United States,The institution was for kids 18 years old and ...,Ridge Mental Institution,Colorado,CO,-105.063974,39.769726,-105.077206,39.766098,...,61.3,83.10,2.78,71300.0,5.057,6.826,2.89,11.416667,0.000000,0.000000
10972,Wheat Ridge,United States,Gymnasium - their have been reports of a litt...,Wheat Ridge Middle School,Colorado,CO,-105.103613,39.764055,-105.077206,39.766098,...,61.3,83.10,2.78,71300.0,5.057,6.826,2.89,11.416667,0.000000,0.000000
