In [8]:
import pandas as pd
import numpy as np
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
from meteostat import Point, Daily
from geopy.geocoders import Nominatim
from datetime import timedelta

In [2]:
# --- Load Dataset ---
# Read the raw CSV directly from the GitHub repository it was uploaded to
url = (
    "https://raw.githubusercontent.com/MaharLeika18/Data-Mining---Python/"
    "refs/heads/main/Final_Exam/Raw%20Data.csv"
)
data = pd.read_csv(url)

# Confirm the download worked and preview the first rows as plain text
print("Data loaded successfully!")
print(data.head())

Data loaded successfully!
   Order Date  Customer Name         State         Category Sub-Category  \
0  03-01-2014  Darren Powers         Texas  Office Supplies        Paper   
1  04-01-2014  Phillina Ober      Illinois  Office Supplies       Labels   
2  04-01-2014  Phillina Ober      Illinois  Office Supplies      Storage   
3  04-01-2014  Phillina Ober      Illinois  Office Supplies      Binders   
4  05-01-2014     Mick Brown  Pennsylvania  Office Supplies          Art   

                                        Product Name   Sales  Quantity  Profit  
0  Message Book, Wirebound, Four 5 1/2" X 4" Form...   16.45         2    5.55  
1                                          Avery 508   11.78         3    4.27  
2                      SAFCO Boltless Steel Shelving  272.74         3  -64.77  
3         GBC Standard Plastic Binding Systems Combs    3.54         2   -5.49  
4  Avery Hi-Liter EverBold Pen Style Fluorescent ...   19.54         3    4.88  


In [4]:
# --- Data Preprocessing ---
# Give every row an identifier that is one more than its index label
data['Transaction ID'] = data.index + 1

# Parse the day-month-year strings into datetimes, then record the first and
# last order dates (reused later as the weather query window)
order_dates = pd.to_datetime(data['Order Date'], format='%d-%m-%Y')
data['Order Date'] = order_dates
start, end = order_dates.min(), order_dates.max()
print(f"Date range: {start.date()} to {end.date()}")

# Convert states to coordinates
# LOUE NOTE: these coordinates were generated with ChatGPT rather than looked up
# manually, so individual values may be inaccurate.
# Each entry maps a state name to a (latitude, longitude) tuple.
# NOTE(review): presumably approximate state centre points - TODO verify against
# a trusted source. Only the 50 states are keyed here; any other value in
# data['State'] (e.g. "District of Columbia") will not be found by a lookup.

state_coords = {
    "Alabama": (32.806671, -86.791130),
    "Alaska": (61.370716, -152.404419),
    "Arizona": (33.729759, -111.431221),
    "Arkansas": (34.969704, -92.373123),
    "California": (36.116203, -119.681564),
    "Colorado": (39.059811, -105.311104),
    "Connecticut": (41.597782, -72.755371),
    "Delaware": (39.318523, -75.507141),
    "Florida": (27.766279, -81.686783),
    "Georgia": (33.040619, -83.643074),
    "Hawaii": (21.094318, -157.498337),
    "Idaho": (44.240459, -114.478828),
    "Illinois": (40.349457, -88.986137),
    "Indiana": (39.849426, -86.258278),
    "Iowa": (42.011539, -93.210526),
    "Kansas": (38.526600, -96.726486),
    "Kentucky": (37.668140, -84.670067),
    "Louisiana": (31.169546, -91.867805),
    "Maine": (44.693947, -69.381927),
    "Maryland": (39.063946, -76.802101),
    "Massachusetts": (42.230171, -71.530106),
    "Michigan": (43.326618, -84.536095),
    "Minnesota": (45.694454, -93.900192),
    "Mississippi": (32.741646, -89.678696),
    "Missouri": (38.456085, -92.288368),
    "Montana": (46.921925, -110.454353),
    "Nebraska": (41.125370, -98.268082),
    "Nevada": (38.313515, -117.055374),
    "New Hampshire": (43.452492, -71.563896),
    "New Jersey": (40.298904, -74.521011),
    "New Mexico": (34.840515, -106.248482),
    "New York": (42.165726, -74.948051),
    "North Carolina": (35.630066, -79.806419),
    "North Dakota": (47.528912, -99.784012),
    "Ohio": (40.388783, -82.764915),
    "Oklahoma": (35.565342, -96.928917),
    "Oregon": (44.572021, -122.070938),
    "Pennsylvania": (40.590752, -77.209755),
    "Rhode Island": (41.680893, -71.511780),
    "South Carolina": (33.856892, -80.945007),
    "South Dakota": (44.299782, -99.438828),
    "Tennessee": (35.747845, -86.692345),
    "Texas": (31.054487, -97.563461),
    "Utah": (40.150032, -111.862434),
    "Vermont": (44.045876, -72.710686),
    "Virginia": (37.769337, -78.169968),
    "Washington": (47.400902, -121.490494),
    "West Virginia": (38.491226, -80.954456),
    "Wisconsin": (44.268543, -89.616508),
    "Wyoming": (42.755966, -107.302490)
}

# Attach the (latitude, longitude) tuple of each order's state.
# States that are not keys of state_coords come back as NaN.
data['Coords'] = data['State'].map(state_coords)

# Surface unmapped states instead of letting them pass silently: rows without
# coordinates cannot be matched to a weather location later on.
unmapped_states = sorted(data.loc[data['Coords'].isna(), 'State'].dropna().unique())
if unmapped_states:
    print(f"Warning: no coordinates for states: {unmapped_states}")

# Map each component straight from the state name. This yields numeric columns
# with NaN for unmapped states and does not depend on every 'Coords' entry
# being a 2-tuple, unlike rebuilding a frame from the list of tuples.
data['Latitude'] = data['State'].map({state: lat for state, (lat, _) in state_coords.items()})
data['Longitude'] = data['State'].map({state: lon for state, (_, lon) in state_coords.items()})

data.head()

Date range: 2014-01-03 to 2017-12-30


Unnamed: 0,Order Date,Customer Name,State,Category,Sub-Category,Product Name,Sales,Quantity,Profit,Transaction ID,Coords,Latitude,Longitude
0,2014-01-03,Darren Powers,Texas,Office Supplies,Paper,"Message Book, Wirebound, Four 5 1/2"" X 4"" Form...",16.45,2,5.55,1,"(31.054487, -97.563461)",31.054487,-97.563461
1,2014-01-04,Phillina Ober,Illinois,Office Supplies,Labels,Avery 508,11.78,3,4.27,2,"(40.349457, -88.986137)",40.349457,-88.986137
2,2014-01-04,Phillina Ober,Illinois,Office Supplies,Storage,SAFCO Boltless Steel Shelving,272.74,3,-64.77,3,"(40.349457, -88.986137)",40.349457,-88.986137
3,2014-01-04,Phillina Ober,Illinois,Office Supplies,Binders,GBC Standard Plastic Binding Systems Combs,3.54,2,-5.49,4,"(40.349457, -88.986137)",40.349457,-88.986137
4,2014-01-05,Mick Brown,Pennsylvania,Office Supplies,Art,Avery Hi-Liter EverBold Pen Style Fluorescent ...,19.54,3,4.88,5,"(40.590752, -77.209755)",40.590752,-77.209755


In [11]:
# Get historical weather data for each state
# Every order from the same state shares one coordinate pair, and each request
# already returns the whole start..end daily series. Fetch once per unique
# location rather than once per order row, which repeated identical requests.
locations = (
    data[['Latitude', 'Longitude']]
    .dropna()            # rows whose state had no coordinates cannot be queried
    .drop_duplicates()
)

weather_records = []

for lat, lon in locations.itertuples(index=False, name=None):
    location = Point(lat, lon)

    try:
        w = Daily(location, start, end).fetch()
    except Exception as e:
        # Best effort: report the failing location and carry on with the rest
        print(f"Failed to fetch for {lat}, {lon} on {start}, {end}: {e}")
        continue

    if not w.empty:
        # The fetched frame is indexed by 'time'; expose it as a regular column
        # named like the order table's date column, and tag the rows with the
        # location they belong to.
        w = w.reset_index().rename(columns={'time': 'Order Date'})
        w['Latitude'] = lat
        w['Longitude'] = lon
        weather_records.append(w)

# pd.concat raises an opaque "No objects to concatenate" on an empty list
if not weather_records:
    raise ValueError("No weather data could be fetched for any location")

weather_data = pd.concat(weather_records, ignore_index=True)

print(f"Shape: {weather_data.shape}")
display(weather_data[['Order Date', 'tavg', 'prcp', 'wspd']].head())



KeyboardInterrupt: 

In [None]:
# Get historical weather data for each state (DELETE WHEN ABOVE CELL WORKS)
# Temporary stand-in: a single fixed point (lat 14.5995, lon 120.9842; the third
# argument is presumably the altitude - TODO confirm) is queried for the whole
# order period instead of one point per state.
location = Point(14.5995, 120.9842, 70)

# Build the daily query for the order date window and download it in one step
weather_data = Daily(location, start, end).fetch()

weather_data.head()


Unnamed: 0,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun


In [None]:
# Add description to the data
def get_weather_description(row):
    """Turn one daily weather record into a short text label.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'tavg' (average temperature) and 'prcp' (precipitation).
        Units are presumably degrees Celsius and millimetres - TODO confirm
        against the weather source.

    Returns
    -------
    str
        "Unknown" when either value is missing. Otherwise precipitation is
        checked first ("Heavy Rain" > 10, "Rainy" > 5, "Light Rain" > 1), then
        temperature ("Sunny" > 29, "Partly Cloudy" > 27), and anything left
        over is labelled "Warm and Humid".

    NOTE(review): the temperature cut-offs and the "Warm and Humid" default
    look tuned for a hot climate; confirm they suit the locations being
    labelled.
    """
    avg_temp = row['tavg']
    rainfall = row['prcp']

    # Without both measurements no label can be derived
    if pd.isna(avg_temp) or pd.isna(rainfall):
        return "Unknown"

    # Rain takes priority over temperature; thresholds run from wettest down
    rain_levels = ((10, "Heavy Rain"), (5, "Rainy"), (1, "Light Rain"))
    for threshold, label in rain_levels:
        if rainfall > threshold:
            return label

    # Dry (or nearly dry) day: classify by average temperature, hottest first
    temperature_levels = ((29, "Sunny"), (27, "Partly Cloudy"))
    for threshold, label in temperature_levels:
        if avg_temp > threshold:
            return label

    return "Warm and Humid"

# Apply weather descriptions
# Label every daily record row by row and store the labels in a new column
weather_labels = weather_data.apply(get_weather_description, axis=1)
weather_data['weather'] = weather_labels

# Show how often each label occurs, then preview the labelled rows
print("Weather text descriptions:")
label_counts = weather_data['weather'].value_counts()
print(label_counts)
weather_data[['tavg', 'prcp', 'weather']].head(10)

In [None]:
# Create weather_data_reset for merging
# Expose the index as a regular column and name the date column 'Order Date' so
# it lines up with the order table. The rename is a no-op if there is no 'time'
# column.
weather_data_reset = weather_data.reset_index().rename(columns={'time': 'Order Date'})

print(f"Shape: {weather_data_reset.shape}")
display(weather_data_reset[['Order Date', 'weather']].head())

# Merge
# If the weather frame holds one series per location, the location has to be
# part of the join key; joining on the date alone would pair every order with
# the weather of every location for that day and multiply the order rows.
merge_keys = ['Order Date'] + [
    col for col in ('Latitude', 'Longitude')
    if col in weather_data_reset.columns and col in data.columns
]
weather_columns = merge_keys + ['weather', 'tavg', 'prcp', 'wspd']

data_with_weather = pd.merge(
    data,
    weather_data_reset[weather_columns],
    on=merge_keys,
    how='left',
    # Fail loudly if an order could match more than one weather row
    validate='many_to_one',
)

In [None]:
# Export
# Write the merged table to CSV without the row index, then show it inline
output_csv = 'retail_data_with_weather.csv'
data_with_weather.to_csv(output_csv, index=False)
display(data_with_weather)

In [10]:
# --- One-hot encode categorical data ---
# LOUE NOTE: Do this immediately before the association analysis
# Or merge df with data because df contains only the one-hot encoded data
# NOTE(review): 'Transaction ID' is assigned per row during preprocessing, so
# each basket built here holds a single sub-category - confirm that is intended
# before mining association rules.

# Build one basket (list of sub-categories) per transaction. An earlier
# 'Category' grouping was dropped: its result was overwritten before being used.
transactions = data.groupby('Transaction ID')['Sub-Category'].apply(list).tolist()

# Encode the baskets as a boolean matrix with one column per distinct item
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
df = pd.DataFrame(te_ary, columns=te.columns_)

df.head()

Unnamed: 0,Accessories,Appliances,Art,Binders,Bookcases,Chairs,Copiers,Envelopes,Fasteners,Furnishings,Labels,Machines,Paper,Phones,Storage,Supplies,Tables
0,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False
3,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False
