In [21]:
import pandas as pd
import numpy as np
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
from datetime import datetime
from meteostat import Point, Daily

In [22]:
# --- Load Dataset ---
# Read the retail sales CSV straight from its raw GitHub URL into a DataFrame.
url = "https://raw.githubusercontent.com/MaharLeika18/Data-Mining---Python/refs/heads/main/Final_Exam/retail_sales_dataset.csv"
data = pd.read_csv(filepath_or_buffer=url)

# Confirmation message followed by a plain-text preview of the first rows.
print("Data loaded successfully!", data.head(), sep="\n")

Data loaded successfully!
   Transaction ID        Date Customer ID  Gender  Age Product Category  \
0               1  2023-11-24     CUST001    Male   34           Beauty   
1               2  2023-02-27     CUST002  Female   26         Clothing   
2               3  2023-01-13     CUST003    Male   50      Electronics   
3               4  2023-05-21     CUST004    Male   37         Clothing   
4               5  2023-05-06     CUST005    Male   30           Beauty   

   Quantity  Price per Unit  Total Amount  
0         3              50           150  
1         2             500          1000  
2         1              30            30  
3         1             500           500  
4         2              50           100  


In [23]:
# --- Data Preprocessing ---
# Build one basket per customer: the list of product categories recorded
# under each 'Customer ID', visited in groupby (sorted-key) order.
transactions = [
    list(categories)
    for _, categories in data.groupby('Customer ID')['Product Category']
]

# One-hot encode the baskets: one boolean column per distinct category.
te = TransactionEncoder()
te_ary = te.fit_transform(transactions)
df = pd.DataFrame(te_ary, columns=te.columns_)

# Leading "\n" keeps a blank line before the heading, then the preview.
print("\nTransaction data prepared for analysis:", df.head(), sep="\n")


Transaction data prepared for analysis:
   Beauty  Clothing  Electronics
0    True     False        False
1   False      True        False
2   False     False         True
3   False      True        False
4    True     False        False


In [24]:
# Parse the 'Date' column into pandas datetimes (overwrites the column),
# then keep the earliest and latest dates for use by later cells.
data['Date'] = pd.to_datetime(data['Date'])
start, end = data['Date'].min(), data['Date'].max()
print(f"Date range: {start.date()} to {end.date()}")

Date range: 2023-01-01 to 2024-01-01


In [25]:
# Get historical weather data for Manila.
# NOTE(review): the Point arguments are presumably latitude, longitude and
# altitude — confirm against the meteostat documentation.
location = Point(14.5995, 120.9842, 70)

# Build the daily query for the sales date range and fetch it in one step.
weather_data = Daily(location, start, end).fetch()

weather_data.head()

Unnamed: 0_level_0,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2023-01-01,27.2,24.7,29.5,1.3,,,5.9,,1013.2,
2023-01-02,27.8,25.7,31.7,0.1,,,6.5,,1013.0,
2023-01-03,26.1,25.5,29.2,1.4,,,7.5,,1012.3,
2023-01-04,27.0,23.7,29.7,16.9,,,8.5,,1012.0,
2023-01-05,26.1,24.8,27.7,4.3,,,7.2,,1012.2,


In [26]:
# Add description to the data
def get_weather_description(row):
    """Translate one day's weather record into a short text label.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) exposing 'tavg'
            (average temperature) and 'prcp' (precipitation).

    Returns:
        "Unknown" when either value is missing; otherwise a rain label
        chosen by precipitation ("Heavy Rain", "Rainy", "Light Rain") or,
        when precipitation is at most 1, a label chosen by temperature
        ("Sunny", "Partly Cloudy", "Warm and Humid").
    """
    avg_temp = row['tavg']
    rainfall = row['prcp']

    # Without both measurements no label can be assigned.
    if not (pd.notna(avg_temp) and pd.notna(rainfall)):
        return "Unknown"

    # Precipitation takes priority; bands are tested from wettest to driest
    # and every comparison is strict (a value equal to a threshold falls
    # through to the next band).
    rain_bands = ((10, "Heavy Rain"), (5, "Rainy"), (1, "Light Rain"))
    for threshold, label in rain_bands:
        if rainfall > threshold:
            return label

    # Little or no rain: fall back to temperature bands, hottest first.
    temperature_bands = ((29, "Sunny"), (27, "Partly Cloudy"))
    for threshold, label in temperature_bands:
        if avg_temp > threshold:
            return label

    return "Warm and Humid"

# Label every day: call get_weather_description once per row (axis=1) and
# store the result in a new 'weather' column.
weather_data['weather'] = weather_data.apply(func=get_weather_description, axis=1)

# Heading, then how many days received each label.
print("Weather text descriptions:", weather_data['weather'].value_counts(), sep="\n")

# Show the inputs next to the derived label for the first ten days.
weather_data.loc[:, ['tavg', 'prcp', 'weather']].head(10)

Weather text descriptions:
weather
Light Rain        89
Partly Cloudy     79
Heavy Rain        66
Sunny             61
Warm and Humid    45
Rainy             26
Name: count, dtype: int64


Unnamed: 0_level_0,tavg,prcp,weather
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2023-01-01,27.2,1.3,Light Rain
2023-01-02,27.8,0.1,Partly Cloudy
2023-01-03,26.1,1.4,Light Rain
2023-01-04,27.0,16.9,Heavy Rain
2023-01-05,26.1,4.3,Light Rain
2023-01-06,26.0,5.9,Rainy
2023-01-07,26.7,0.0,Warm and Humid
2023-01-08,25.7,0.0,Warm and Humid
2023-01-09,26.0,0.0,Warm and Humid
2023-01-10,26.1,0.0,Warm and Humid


In [27]:
# Create weather_data_reset for merging: move the 'time' index into a regular
# column and rename it to 'Date' so it matches the sales data's key column.
# Chained (no inplace=True), so re-running the cell always rebuilds the frame
# from weather_data.
weather_data_reset = weather_data.reset_index().rename(columns={'time': 'Date'})

print(f"Shape: {weather_data_reset.shape}")
display(weather_data_reset[['Date', 'weather']].head())

# Merge: left join keeps every sales row; dates with no weather record get a
# missing 'weather' value. validate='many_to_one' makes pandas raise a
# MergeError if any date occurs more than once in the weather table, instead
# of silently duplicating the matching sales rows.
data_with_weather = pd.merge(
    data,
    weather_data_reset[['Date', 'weather']],
    on='Date',
    how='left',
    validate='many_to_one',
)

Shape: (366, 12)


Unnamed: 0,Date,weather
0,2023-01-01,Light Rain
1,2023-01-02,Partly Cloudy
2,2023-01-03,Light Rain
3,2023-01-04,Heavy Rain
4,2023-01-05,Light Rain


In [28]:
# Export
# Write the merged sales + weather table to a CSV file at a relative path,
# without the DataFrame index column.
export_path = 'retail_data_with_weather.csv'
data_with_weather.to_csv(export_path, index=False)