In [37]:
import pandas as pd
import numpy as np
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
from meteostat import Point, Daily
from geopy.geocoders import Nominatim
from datetime import timedelta
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [38]:
# --- Load Dataset ---
# Import the dataset uploaded to the repo
url = "https://raw.githubusercontent.com/MaharLeika18/Data-Mining---Python/refs/heads/main/Final_Exam/Raw%20Data.csv"
data = pd.read_csv(url)

print("Data loaded successfully!")
print(data.head())

Data loaded successfully!
   Order Date  Customer Name         State         Category Sub-Category  \
0  03-01-2014  Darren Powers         Texas  Office Supplies        Paper   
1  04-01-2014  Phillina Ober      Illinois  Office Supplies       Labels   
2  04-01-2014  Phillina Ober      Illinois  Office Supplies      Storage   
3  04-01-2014  Phillina Ober      Illinois  Office Supplies      Binders   
4  05-01-2014     Mick Brown  Pennsylvania  Office Supplies          Art   

                                        Product Name   Sales  Quantity  Profit  
0  Message Book, Wirebound, Four 5 1/2" X 4" Form...   16.45         2    5.55  
1                                          Avery 508   11.78         3    4.27  
2                      SAFCO Boltless Steel Shelving  272.74         3  -64.77  
3         GBC Standard Plastic Binding Systems Combs    3.54         2   -5.49  
4  Avery Hi-Liter EverBold Pen Style Fluorescent ...   19.54         3    4.88  


In [39]:
# --- Data Preprocessing ---
# Create id column
data['Transaction ID'] = data.index + 1

# Convert dates to datetime
data['Order Date'] = pd.to_datetime(data['Order Date'], format='%d-%m-%Y')
start = data['Order Date'].min()
end = data['Order Date'].max()
print(f"Date range: {start.date()} to {end.date()}")

# Convert states to coordinates
# LOUE NOTE: FULL DISCLAIMER I had chatgpt gen these coords cuz no way am i doing this manually.
# so if the cords are innacurate im sorry
# or well chatgpt should be sorry but unlike caine its not sentient

state_coords = {
    "Alabama": (32.806671, -86.791130),
    "Alaska": (61.370716, -152.404419),
    "Arizona": (33.729759, -111.431221),
    "Arkansas": (34.969704, -92.373123),
    "California": (36.116203, -119.681564),
    "Colorado": (39.059811, -105.311104),
    "Connecticut": (41.597782, -72.755371),
    "Delaware": (39.318523, -75.507141),
    "Florida": (27.766279, -81.686783),
    "Georgia": (33.040619, -83.643074),
    "Hawaii": (21.094318, -157.498337),
    "Idaho": (44.240459, -114.478828),
    "Illinois": (40.349457, -88.986137),
    "Indiana": (39.849426, -86.258278),
    "Iowa": (42.011539, -93.210526),
    "Kansas": (38.526600, -96.726486),
    "Kentucky": (37.668140, -84.670067),
    "Louisiana": (31.169546, -91.867805),
    "Maine": (44.693947, -69.381927),
    "Maryland": (39.063946, -76.802101),
    "Massachusetts": (42.230171, -71.530106),
    "Michigan": (43.326618, -84.536095),
    "Minnesota": (45.694454, -93.900192),
    "Mississippi": (32.741646, -89.678696),
    "Missouri": (38.456085, -92.288368),
    "Montana": (46.921925, -110.454353),
    "Nebraska": (41.125370, -98.268082),
    "Nevada": (38.313515, -117.055374),
    "New Hampshire": (43.452492, -71.563896),
    "New Jersey": (40.298904, -74.521011),
    "New Mexico": (34.840515, -106.248482),
    "New York": (42.165726, -74.948051),
    "North Carolina": (35.630066, -79.806419),
    "North Dakota": (47.528912, -99.784012),
    "Ohio": (40.388783, -82.764915),
    "Oklahoma": (35.565342, -96.928917),
    "Oregon": (44.572021, -122.070938),
    "Pennsylvania": (40.590752, -77.209755),
    "Rhode Island": (41.680893, -71.511780),
    "South Carolina": (33.856892, -80.945007),
    "South Dakota": (44.299782, -99.438828),
    "Tennessee": (35.747845, -86.692345),
    "Texas": (31.054487, -97.563461),
    "Utah": (40.150032, -111.862434),
    "Vermont": (44.045876, -72.710686),
    "Virginia": (37.769337, -78.169968),
    "Washington": (47.400902, -121.490494),
    "West Virginia": (38.491226, -80.954456),
    "Wisconsin": (44.268543, -89.616508),
    "Wyoming": (42.755966, -107.302490)
}

data['Coords'] = data['State'].map(state_coords)
data[['Latitude', 'Longitude']] = pd.DataFrame(data['Coords'].tolist(), index=data.index)

data.head()

Date range: 2014-01-03 to 2017-12-30


Unnamed: 0,Order Date,Customer Name,State,Category,Sub-Category,Product Name,Sales,Quantity,Profit,Transaction ID,Coords,Latitude,Longitude
0,2014-01-03,Darren Powers,Texas,Office Supplies,Paper,"Message Book, Wirebound, Four 5 1/2"" X 4"" Form...",16.45,2,5.55,1,"(31.054487, -97.563461)",31.054487,-97.563461
1,2014-01-04,Phillina Ober,Illinois,Office Supplies,Labels,Avery 508,11.78,3,4.27,2,"(40.349457, -88.986137)",40.349457,-88.986137
2,2014-01-04,Phillina Ober,Illinois,Office Supplies,Storage,SAFCO Boltless Steel Shelving,272.74,3,-64.77,3,"(40.349457, -88.986137)",40.349457,-88.986137
3,2014-01-04,Phillina Ober,Illinois,Office Supplies,Binders,GBC Standard Plastic Binding Systems Combs,3.54,2,-5.49,4,"(40.349457, -88.986137)",40.349457,-88.986137
4,2014-01-05,Mick Brown,Pennsylvania,Office Supplies,Art,Avery Hi-Liter EverBold Pen Style Fluorescent ...,19.54,3,4.88,5,"(40.590752, -77.209755)",40.590752,-77.209755


In [40]:
# Get historical weather data for each state
weather_records = []

# Get unique states with their coordinates
unique_states = data[['State', 'Latitude', 'Longitude']].drop_duplicates()

print(f"Getting weather data for {len(unique_states)} unique states...")


states_with_data = 0
for i, row in unique_states.iterrows():
    state = row['State']
    lat, lon = row['Latitude'], row['Longitude']
    location = Point(lat, lon)

    try:
        w = Daily(location, start, end).fetch()
        if not w.empty:
            w = w.reset_index()
            w['State'] = state
            w['Latitude'] = lat
            w['Longitude'] = lon
            weather_records.append(w)
            print(f"Weather data retrieved for {state}")
            states_with_data += 1
    except Exception as e:
        print(f"Failed to fetch weather for {state}: {e}")
        continue

if weather_records:
    weather_data = pd.concat(weather_records, ignore_index=True)
    print(f"\n Using state weather data for {states_with_data}/{len(unique_states)} states")
#If no coords for state (Mary suggestion: use USA center?)
else:
    print("\nNo state weather data retrieved. Using central US fallback...")
    central_location = Point(39.8283, -98.5795)  # Geographic center of US (Kansas according to Wikipedia)
    
    try:
        weather_data = Daily(central_location, start, end).fetch()
        if not weather_data.empty:
            weather_data = weather_data.reset_index()
            print("✓ Using central US weather data for all states")
        else:
            print("No weather data available")
            weather_data = pd.DataFrame()
    except Exception as e:
        print(f"Error: {e}")
        weather_data = pd.DataFrame()

# Display
print(f"\nWeather data shape: {weather_data.shape}")
display(weather_data.head())



Getting weather data for 49 unique states...
Weather data retrieved for Texas
Weather data retrieved for Illinois
Weather data retrieved for California
Weather data retrieved for Kentucky
Weather data retrieved for Virginia
Weather data retrieved for South Carolina
Weather data retrieved for Ohio
Weather data retrieved for Arkansas
Weather data retrieved for Michigan
Weather data retrieved for Tennessee
Weather data retrieved for Florida
Weather data retrieved for Nevada
Weather data retrieved for Indiana
Weather data retrieved for New York
Weather data retrieved for Wisconsin
Weather data retrieved for New Jersey
Weather data retrieved for Missouri
Weather data retrieved for North Carolina
Weather data retrieved for Utah
Weather data retrieved for Minnesota
Weather data retrieved for Iowa
Weather data retrieved for Massachusetts
Weather data retrieved for Maryland
Weather data retrieved for Connecticut
Weather data retrieved for New Hampshire
Weather data retrieved for Oklahoma
Weathe

Unnamed: 0,time,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun,State,Latitude,Longitude
0,2014-01-03,3.7,-5.0,13.0,0.0,,,11.4,,1030.0,,Texas,31.054487,-97.563461
1,2014-01-04,12.9,5.0,22.0,0.0,,,20.3,,1015.9,,Texas,31.054487,-97.563461
2,2014-01-05,5.0,-1.0,14.0,0.0,,,25.2,,1023.1,,Texas,31.054487,-97.563461
3,2014-01-06,-4.1,-8.0,0.0,0.0,,,12.9,,1041.3,,Texas,31.054487,-97.563461
4,2014-01-07,0.7,-7.0,8.0,0.0,,,13.7,,1035.6,,Texas,31.054487,-97.563461


In [41]:
#fix prcp and other null

weather_data['prcp'] = weather_data['prcp'].replace('<NA>', np.nan)
weather_data['tavg'] = weather_data['tavg'].replace('<NA>', np.nan)

# Fill precipitation w 0 if null
weather_data['prcp'] = weather_data['prcp'].fillna(0)

# Fill temp w 15C if null
weather_data['tavg'] = weather_data['tavg'].fillna(15)



In [53]:
# Add description to the data
def get_weather_description(row):
    temp = row['tavg']  # average temperature
    precip = row['prcp']  # precipitation
    
    # Handle missing temperature or precipitation
    if pd.isna(temp) or temp == '<NA>':
        temp = 20  # Default average temperature
    if pd.isna(precip) or precip == '<NA>':
        precip = 0  # Default no precipitation

    
    try:
        temp = float(temp)
        precip = float(precip)
    except (ValueError, TypeError):
        temp = 15
        precip = 0
    
    if precip > 15: 
        return "Heavy Rain"
    elif precip > 8:
        return "Rainy"
    elif precip > 2:
        return "Light Rain" 
    elif temp > 25:
        return "Sunny"
    elif temp > 10:
        return "Partly Cloudy"
    else: 
        return "Warm and Humid"

# Apply weather descriptions
weather_data['weather'] = weather_data.apply(get_weather_description, axis=1)

print("Weather text descriptions:")
print(weather_data['weather'].value_counts())
print(weather_data[['tavg', 'prcp', 'weather']].head(10))

Weather text descriptions:
weather
Partly Cloudy     21812
Warm and Humid    14881
Sunny              3675
Light Rain         1600
Heavy Rain          972
Rainy               764
Name: count, dtype: int64
   tavg  prcp         weather
0   3.7   0.0  Warm and Humid
1  12.9   0.0   Partly Cloudy
2   5.0   0.0  Warm and Humid
3  -4.1   0.0  Warm and Humid
4   0.7   0.0  Warm and Humid
5   7.6   0.9  Warm and Humid
6  12.8   0.3   Partly Cloudy
7  18.2   0.0   Partly Cloudy
8  12.5   0.0   Partly Cloudy
9  13.6   0.0   Partly Cloudy


In [43]:
# Create weather_data_reset for merging
weather_data_reset = weather_data.reset_index()
weather_data_reset.rename(columns={'time': 'Date'}, inplace=True)

print(f"Shape: {weather_data_reset.shape}")
display(weather_data_reset[['Date', 'weather','prcp']].head())

# Rename data column to match
data_merged = data.rename(columns={'Order Date': 'Date'})
data_with_weather = pd.merge(data_merged, weather_data_reset[['Date','weather','tavg','prcp','wspd']], on='Date', how='left')

Shape: (43704, 16)


Unnamed: 0,Date,weather,prcp
0,2014-01-03,Warm and Humid,0.0
1,2014-01-04,Partly Cloudy,0.0
2,2014-01-05,Warm and Humid,0.0
3,2014-01-06,Warm and Humid,0.0
4,2014-01-07,Warm and Humid,0.0


In [44]:
# Export
data_with_weather.to_csv('retail_data_with_weather.csv', index=False)
display(data_with_weather)

Unnamed: 0,Date,Customer Name,State,Category,Sub-Category,Product Name,Sales,Quantity,Profit,Transaction ID,Coords,Latitude,Longitude,weather,tavg,prcp,wspd
0,2014-01-03,Darren Powers,Texas,Office Supplies,Paper,"Message Book, Wirebound, Four 5 1/2"" X 4"" Form...",16.45,2,5.55,1,"(31.054487, -97.563461)",31.054487,-97.563461,Warm and Humid,3.7,0.0,11.4
1,2014-01-03,Darren Powers,Texas,Office Supplies,Paper,"Message Book, Wirebound, Four 5 1/2"" X 4"" Form...",16.45,2,5.55,1,"(31.054487, -97.563461)",31.054487,-97.563461,Warm and Humid,-8.0,0.0,26.7
2,2014-01-03,Darren Powers,Texas,Office Supplies,Paper,"Message Book, Wirebound, Four 5 1/2"" X 4"" Form...",16.45,2,5.55,1,"(31.054487, -97.563461)",31.054487,-97.563461,Warm and Humid,7.6,0.0,2.6
3,2014-01-03,Darren Powers,Texas,Office Supplies,Paper,"Message Book, Wirebound, Four 5 1/2"" X 4"" Form...",16.45,2,5.55,1,"(31.054487, -97.563461)",31.054487,-97.563461,Warm and Humid,-10.2,0.0,6.2
4,2014-01-03,Darren Powers,Texas,Office Supplies,Paper,"Message Book, Wirebound, Four 5 1/2"" X 4"" Form...",16.45,2,5.55,1,"(31.054487, -97.563461)",31.054487,-97.563461,Warm and Humid,-4.5,0.0,11.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299723,2017-12-30,Jill Matthias,Colorado,Office Supplies,Fasteners,Bagged Rubber Bands,3.02,3,-0.60,9994,"(39.059811, -105.311104)",39.059811,-105.311104,Warm and Humid,-3.8,0.0,13.5
299724,2017-12-30,Jill Matthias,Colorado,Office Supplies,Fasteners,Bagged Rubber Bands,3.02,3,-0.60,9994,"(39.059811, -105.311104)",39.059811,-105.311104,Warm and Humid,-15.4,0.0,18.4
299725,2017-12-30,Jill Matthias,Colorado,Office Supplies,Fasteners,Bagged Rubber Bands,3.02,3,-0.60,9994,"(39.059811, -105.311104)",39.059811,-105.311104,Warm and Humid,-19.9,0.0,1.1
299726,2017-12-30,Jill Matthias,Colorado,Office Supplies,Fasteners,Bagged Rubber Bands,3.02,3,-0.60,9994,"(39.059811, -105.311104)",39.059811,-105.311104,Warm and Humid,-10.3,1.5,4.7


In [45]:
# --- One-hot encode categorical data ---
# LOUE NOTE: Do this immediately before tha association analysis
# Or merge df with data because df contains only the one-hot encoded data

transactions = data.groupby('Transaction ID')['Category'].apply(list).values.tolist()
transactions = data.groupby('Transaction ID')['Sub-Category'].apply(list).values.tolist()

te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
df = pd.DataFrame(te_ary, columns=te.columns_)

df.head()

Unnamed: 0,Accessories,Appliances,Art,Binders,Bookcases,Chairs,Copiers,Envelopes,Fasteners,Furnishings,Labels,Machines,Paper,Phones,Storage,Supplies,Tables
0,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False
3,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [46]:
# Group data by weather and state
cluster_data = data_with_weather.groupby(['weather', 'State']).agg({
    'Sales': 'sum',
    'Quantity': 'sum', 
    'Profit': 'sum',
    'Transaction ID': 'count',
    'tavg': 'mean',
}).reset_index()

cluster_data.rename(columns={'Transaction ID': 'Transaction_Count'}, inplace=True)

print(f"Cluster data shape: {cluster_data.shape}")
print(cluster_data.head())



Cluster data shape: (286, 7)
      weather       State      Sales  Quantity    Profit  Transaction_Count  \
0  Heavy Rain     Alabama    7077.33       101   2487.44                 26   
1  Heavy Rain     Arizona   26108.62       658  -1477.38                168   
2  Heavy Rain    Arkansas   11055.02       173   4183.54                 48   
3  Heavy Rain  California  342993.23      5201  53614.17               1335   
4  Heavy Rain    Colorado   19017.48       414  -3880.95                106   

        tavg  
0  12.961538  
1  17.400595  
2  12.839583  
3  14.720075  
4  18.373585  


In [48]:
# Select numerical features for clustering
features_for_clustering = ['Sales', 'Quantity', 'Profit', 'Transaction_Count', 'tavg']
X = cluster_data[features_for_clustering].fillna(0)

print("Features for clustering:")
print(features_for_clustering)
print(f"Data shape: {X.shape}")

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print("Features standardized successfully")

Features for clustering:
['Sales', 'Quantity', 'Profit', 'Transaction_Count', 'tavg']
Data shape: (286, 5)
Features standardized successfully


In [None]:
# Determine optimal number of clusters
wcss = []
silhouette_scores = []
k_range = range(2, 8)

for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X_scaled)
    wcss.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(X_scaled, kmeans.labels_))

In [51]:
# Apply K-means
kmeans = KMeans(random_state=42, n_init=10)
cluster_labels = kmeans.fit_predict(X_scaled)
cluster_data['Cluster'] = cluster_labels

print("Clustering completed successfully")
print(f"Cluster distribution:\n{cluster_data['Cluster'].value_counts().sort_index()}")

Clustering completed successfully
Cluster distribution:
Cluster
0    161
1      1
2     11
3     54
4     12
5      3
6     41
7      3
Name: count, dtype: int64


In [55]:
# Group data by weather conditions
cluster_data = data_with_weather.groupby('weather').agg({
    'Sales': 'sum',
    'Quantity': 'sum', 
    'Profit': 'sum',
    'Transaction ID': 'count',
    'tavg': 'mean',
    'prcp': 'mean'
}).reset_index()

cluster_data.rename(columns={'Transaction ID': 'Transaction_Count'}, inplace=True)

print(f"Cluster data shape: {cluster_data.shape}")
print(cluster_data)

Cluster data shape: (6, 7)
          weather        Sales  Quantity      Profit  Transaction_Count  \
0      Heavy Rain   1433018.89     23501   190878.67               6142   
1      Light Rain   2027221.50     34594   242713.00               9144   
2   Partly Cloudy  34916887.10    576289  4519316.45             152837   
3           Rainy   1242189.46     19886   153968.20               5280   
4           Sunny   5210145.52     92437   606174.77              24576   
5  Warm and Humid  24071116.17    389157  2876196.70             101749   

        tavg       prcp  
0  14.700668   30.68774  
1   12.61449   4.938276  
2  17.595468   0.052769  
3  13.036477  11.325852  
4  27.217167    0.05072  
5   2.018539   0.065829  


In [None]:
# Create transactions with weather
weather_transactions = data_with_weather.groupby(['Transaction ID']).agg({
    'Sub-Category': lambda x: list(x),
    'weather': 'first'
}).reset_index()

print(f"Number of transactions: {len(weather_transactions)}")
print(weather_transactions.head())

Number of transactions: 9994
   Transaction ID                                       Sub-Category  \
0               1  [Paper, Paper, Paper, Paper, Paper, Paper, Pap...   
1               2  [Labels, Labels, Labels, Labels, Labels, Label...   
2               3  [Storage, Storage, Storage, Storage, Storage, ...   
3               4  [Binders, Binders, Binders, Binders, Binders, ...   
4               5  [Art, Art, Art, Art, Art, Art, Art, Art, Art, ...   

          weather  
0  Warm and Humid  
1   Partly Cloudy  
2   Partly Cloudy  
3   Partly Cloudy  
4  Warm and Humid  


In [None]:
# Create basket data
weather_basket_data = []

for _, row in weather_transactions.iterrows():
    transaction = []
    # Add sub-categories
    transaction.extend([f"Cat_{cat}" for cat in row['Sub-Category']])
    # Add weather condition
    transaction.append(f"Weather_{row['weather']}")
    
    weather_basket_data.append(transaction)

print(f"Created {len(weather_basket_data)} weather transactions")
print("Sample transaction:", weather_basket_data[0])

Created 9994 weather transactions
Sample transaction: ['Cat_Paper', 'Cat_Paper', 'Cat_Paper', 'Cat_Paper', 'Cat_Paper', 'Cat_Paper', 'Cat_Paper', 'Cat_Paper', 'Cat_Paper', 'Cat_Paper', 'Cat_Paper', 'Cat_Paper', 'Cat_Paper', 'Cat_Paper', 'Cat_Paper', 'Cat_Paper', 'Cat_Paper', 'Cat_Paper', 'Cat_Paper', 'Cat_Paper', 'Cat_Paper', 'Cat_Paper', 'Cat_Paper', 'Cat_Paper', 'Cat_Paper', 'Cat_Paper', 'Cat_Paper', 'Cat_Paper', 'Weather_Warm and Humid']


In [None]:
# Apply transaction encoder for weather data
te_weather = TransactionEncoder()
te_ary_weather = te_weather.fit(weather_basket_data).transform(weather_basket_data)
df_weather = pd.DataFrame(te_ary_weather, columns=te_weather.columns_)

print(f"Weather transaction matrix shape: {df_weather.shape}")
print(f"Number of items: {len(te_weather.columns_)}")

Weather transaction matrix shape: (9994, 23)
Number of items: 23


In [None]:
# Find frequent itemsets with weather
frequent_itemsets_weather = apriori(df_weather, min_support=0.01, use_colnames=True, max_len=3)
frequent_itemsets_weather['length'] = frequent_itemsets_weather['itemsets'].apply(lambda x: len(x))

print(f"Found {len(frequent_itemsets_weather)} weather frequent itemsets")
print("\nTop 10 weather frequent itemsets:")
print(frequent_itemsets_weather.sort_values('support', ascending=False).head(10))

Found 51 weather frequent itemsets

Top 10 weather frequent itemsets:
     support                              itemsets  length
16  0.538223               (Weather_Partly Cloudy)       1
17  0.313588                       (Weather_Sunny)       1
3   0.152391                         (Cat_Binders)       1
11  0.137082                           (Cat_Paper)       1
18  0.134981              (Weather_Warm and Humid)       1
8   0.095757                     (Cat_Furnishings)       1
12  0.088953                          (Cat_Phones)       1
13  0.084651                         (Cat_Storage)       1
27  0.080949  (Cat_Binders, Weather_Partly Cloudy)       2
2   0.079648                             (Cat_Art)       1


In [None]:
# Generate Weather Association Rules (Adjusted)
if len(frequent_itemsets_weather) > 0:
    # Try with lower thresholds
    rules_weather = association_rules(frequent_itemsets_weather, metric="confidence", min_threshold=0.1)
    rules_weather = rules_weather.sort_values('lift', ascending=False)
    
    print(f"Generated {len(rules_weather)} weather association rules")
    
    if len(rules_weather) > 0:
        # Display top weather rules
        display_cols = ['antecedents', 'consequents', 'support', 'confidence', 'lift']
        print("\nTop weather association rules:")
        print(rules_weather[display_cols].head(15))
    else:
        print("\nAll frequent itemsets:")
        print(frequent_itemsets_weather)
    

Generated 39 weather association rules

Top weather association rules:
                 antecedents               consequents   support  confidence  \
22         (Cat_Furnishings)  (Weather_Warm and Humid)  0.014209    0.148380   
21  (Weather_Warm and Humid)         (Cat_Furnishings)  0.014209    0.105263   
36             (Cat_Storage)  (Weather_Warm and Humid)  0.012207    0.144208   
10             (Cat_Binders)           (Weather_Sunny)  0.050630    0.332239   
11           (Weather_Sunny)             (Cat_Binders)  0.050630    0.161455   
17           (Cat_Envelopes)   (Weather_Partly Cloudy)  0.014409    0.566929   
18           (Cat_Fasteners)   (Weather_Partly Cloudy)  0.012207    0.562212   
35             (Cat_Storage)           (Weather_Sunny)  0.027517    0.325059   
24              (Cat_Labels)           (Weather_Sunny)  0.011807    0.324176   
32              (Cat_Phones)           (Weather_Sunny)  0.028717    0.322835   
38              (Cat_Tables)   (Weather_Partly Cl

In [None]:
#Frequent Itemsets fallback in case it fails me again :<

print(f"Number of frequent itemsets: {len(frequent_itemsets_weather)}")
print(f"Itemsets with weather:")
weather_itemsets = frequent_itemsets_weather[
    frequent_itemsets_weather['itemsets'].apply(lambda x: any('Weather' in item for item in x))
]
print(weather_itemsets)

print(f"\nAll columns in transaction matrix:")
print(df_weather.columns.tolist())

print(f"\nSample of transaction matrix:")
print(df_weather.head())

Number of frequent itemsets: 51
Itemsets with weather:
     support                                   itemsets  length
16  0.538223                    (Weather_Partly Cloudy)       1
17  0.313588                            (Weather_Sunny)       1
18  0.134981                   (Weather_Warm and Humid)       1
19  0.042325   (Weather_Partly Cloudy, Cat_Accessories)       2
20  0.023114           (Weather_Sunny, Cat_Accessories)       2
21  0.010606  (Weather_Warm and Humid, Cat_Accessories)       2
22  0.025315    (Weather_Partly Cloudy, Cat_Appliances)       2
23  0.014509            (Weather_Sunny, Cat_Appliances)       2
24  0.042826           (Weather_Partly Cloudy, Cat_Art)       2
25  0.025115                   (Weather_Sunny, Cat_Art)       2
26  0.010206          (Weather_Warm and Humid, Cat_Art)       2
27  0.080949       (Cat_Binders, Weather_Partly Cloudy)       2
28  0.050630               (Cat_Binders, Weather_Sunny)       2
29  0.019412      (Cat_Binders, Weather_Warm and 

In [None]:
# Check Transaction Matrix


print("Weather-related columns in transaction matrix:")
weather_cols = [col for col in df_weather.columns if 'Weather' in col]
print(weather_cols)

print(f"\nSupport for each weather type:")
for col in weather_cols:
    support = df_weather[col].mean()
    print(f"{col}: {support:.3f}")

print(f"\nSupport for some product categories:")
product_cols = [col for col in df_weather.columns if 'Cat_' in col][:5]
for col in product_cols:
    support = df_weather[col].mean()
    print(f"{col}: {support:.3f}")

Weather-related columns in transaction matrix:
['Weather_Heavy Rain', 'Weather_Light Rain', 'Weather_Partly Cloudy', 'Weather_Rainy', 'Weather_Sunny', 'Weather_Warm and Humid']

Support for each weather type:
Weather_Heavy Rain: 0.006
Weather_Light Rain: 0.003
Weather_Partly Cloudy: 0.538
Weather_Rainy: 0.004
Weather_Sunny: 0.314
Weather_Warm and Humid: 0.135

Support for some product categories:
Cat_Accessories: 0.078
Cat_Appliances: 0.047
Cat_Art: 0.080
Cat_Binders: 0.152
Cat_Bookcases: 0.023


In [None]:
# Create simple rules based on conditional probabilities
weather_types = data_with_weather['weather'].unique()

for weather in weather_types:
    weather_data = data_with_weather[data_with_weather['weather'] == weather]
    total_weather_transactions = len(weather_data['Transaction ID'].unique())
    
    if total_weather_transactions > 0:
        print(f"\n{weather} (Total transactions: {total_weather_transactions}):")
        
        # Find top products in this weather
        product_counts = weather_data['Sub-Category'].value_counts().head(5)
        
        for product, count in product_counts.items():
            overall_count = len(data_with_weather[data_with_weather['Sub-Category'] == product]['Transaction ID'].unique())
            overall_pct = (overall_count / len(data_with_weather['Transaction ID'].unique())) * 100
            weather_pct = (count / total_weather_transactions) * 100
            lift = weather_pct / overall_pct if overall_pct > 0 else 0
            
            print(f"  {product}: {weather_pct:.1f}% (Overall: {overall_pct:.1f}%, Lift: {lift:.2f})")


Warm and Humid (Total transactions: 6495):
  Binders: 234.8% (Overall: 15.2%, Lift: 15.41)
  Paper: 207.0% (Overall: 13.7%, Lift: 15.10)
  Furnishings: 155.4% (Overall: 9.6%, Lift: 16.23)
  Phones: 142.6% (Overall: 8.9%, Lift: 16.04)
  Storage: 133.7% (Overall: 8.5%, Lift: 15.79)

Light Rain (Total transactions: 5418):
  Binders: 25.5% (Overall: 15.2%, Lift: 1.67)
  Paper: 23.4% (Overall: 13.7%, Lift: 1.71)
  Phones: 15.5% (Overall: 8.9%, Lift: 1.74)
  Furnishings: 15.4% (Overall: 9.6%, Lift: 1.61)
  Storage: 14.6% (Overall: 8.5%, Lift: 1.72)

Rainy (Total transactions: 3692):
  Binders: 22.2% (Overall: 15.2%, Lift: 1.46)
  Paper: 19.5% (Overall: 13.7%, Lift: 1.42)
  Furnishings: 14.5% (Overall: 9.6%, Lift: 1.51)
  Storage: 12.9% (Overall: 8.5%, Lift: 1.52)
  Phones: 11.9% (Overall: 8.9%, Lift: 1.34)

Partly Cloudy (Total transactions: 9991):
  Binders: 233.5% (Overall: 15.2%, Lift: 15.32)
  Paper: 213.9% (Overall: 13.7%, Lift: 15.61)
  Furnishings: 144.3% (Overall: 9.6%, Lift: 15.07)