In [1]:
#Import Dependencies
import pandas as pd
from pathlib import Path
import requests

#Import the Weather VisualCrossing API Key
from api_keys import visualcrossing_api_key

In [7]:
customer_shopping_data_csv = Path("./Resources/source/customer_shopping_data.csv")

customer_shopping_data_df = pd.read_csv(customer_shopping_data_csv)
customer_shopping_data_df.head()

Unnamed: 0,invoice_no,customer_id,gender,age,category,quantity,price,payment_method,invoice_date,shopping_mall
0,I138884,C241288,Female,28,Clothing,5,1500.4,Credit Card,5/8/2022,Kanyon
1,I317333,C111565,Male,21,Shoes,3,1800.51,Debit Card,12/12/2021,Forum Istanbul
2,I127801,C266599,Male,20,Clothing,1,300.08,Cash,9/11/2021,Metrocity
3,I173702,C988172,Female,66,Shoes,5,3000.85,Credit Card,16/05/2021,Metropol AVM
4,I337046,C189076,Female,53,Books,4,60.6,Cash,24/10/2021,Kanyon


In [12]:
#Translate the date from a string to the datetime type using pd.to_datetime, specify the source format as day/month/year, then convert to date - .dt.date
#this is necessary so that the date is in a format compatible with the API
dates_df = pd.to_datetime(customer_shopping_data_df["invoice_date"], format="%d/%m/%Y").dt.date
dates_df = dates_df.drop_duplicates(keep="first")
dates_df = pd.DataFrame(dates_df)
dates_df

Unnamed: 0,invoice_date
0,2022-08-05
1,2021-12-12
2,2021-11-09
3,2021-05-16
4,2021-10-24
...,...
4220,2021-01-29
4296,2021-01-01
4491,2021-08-12
4505,2022-01-08


In [13]:
#To optimize API usage and reduce the number of requests, it is decided to pass a date range directly to the API, covering an entire month.
#For this, it is necessary to create a new DataFrame with two columns - first_date_of_month and last_date_of_month, which will be used as parameters for the API.
#A function that, given an input date (in date format), returns the first and last date of the corresponding month.

def first_last_dates_of_month(date):
    first_date = date.replace(day=1)
    last_date = pd.Timestamp(date.year, date.month, pd.Period(date, freq='M').days_in_month)
    return first_date, last_date

#Apply function to each row of the DataFrame

dates_df[['first_date_of_month', 'last_date_of_month']] = dates_df['invoice_date'].apply(lambda x: pd.Series(first_last_dates_of_month(x)))

dates_df.head()

Unnamed: 0,invoice_date,first_date_of_month,last_date_of_month
0,2022-08-05,2022-08-01,2022-08-31
1,2021-12-12,2021-12-01,2021-12-31
2,2021-11-09,2021-11-01,2021-11-30
3,2021-05-16,2021-05-01,2021-05-31
4,2021-10-24,2021-10-01,2021-10-31


In [14]:
dates_df = dates_df[["first_date_of_month", "last_date_of_month"]]
dates_df = dates_df.drop_duplicates(keep="first")
dates_df.sort_values(by="first_date_of_month")

Unnamed: 0,first_date_of_month,last_date_of_month
7,2021-01-01,2021-01-31
62,2021-02-01,2021-02-28
54,2021-03-01,2021-03-31
29,2021-04-01,2021-04-30
3,2021-05-01,2021-05-31
49,2021-06-01,2021-06-30
18,2021-07-01,2021-07-31
9,2021-08-01,2021-08-31
40,2021-09-01,2021-09-30
4,2021-10-01,2021-10-31


In [17]:
#Set the API base URL
base_url = "https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline/"
city_name = "Istanbul"

#Create a set of empty lists, which will be filled with all weather parameters (names taken from the API)
date = []
tempmax = []
tempmin = []
temp = []
feelslikemax = []
feelslikemin = []
feelslike = []
dew = []
humidity = []
precip = []
precipprob = []
precipcover = []
preciptype = []
snow = []
snowdepth = []
windgust = []
windspeed = []
winddir = []
pressure = []
cloudcover = []
visibility = []
uvindex = []
sunrise = []
sunset = []
conditions = []
description = []

#Start a loop over the DataFrame containing date ranges. For testing purposes, head(3) is used.
for index, row in dates_df.head(3).iterrows():  #for index, row in dates_df.iterrows():
    start_date = row['first_date_of_month'].date()
    end_date = row['last_date_of_month'].date()

    url = f"{base_url}/{city_name}/{start_date}/{end_date}?unitGroup=us&include=days&key={visualcrossing_api_key}&contentType=json"

    response = requests.get(url)
        
    #Convert response to JSON
    data = response.json()

    #Since the API returns a set of dates (1 month), a loop is needed for each day to extract the data.
    #Perform a loop over the days list.

    for d in data["days"]:
        date.append(d["datetime"])
        tempmax.append(d["tempmax"])
        tempmin.append(d["tempmin"])
        temp.append(d["temp"])
        feelslikemax.append(d["feelslikemax"])
        feelslikemin.append(d["feelslikemin"])
        feelslike.append(d["feelslike"])
        dew.append(d["dew"])
        humidity.append(d["humidity"])
        precip.append(d["precip"])
        precipprob.append(d["precipprob"])
        precipcover.append(d["precipcover"])

       #Since 'preciptype' is returned either as a null value (None) or as a list,
        #for example, ["rain"] or ["rain", "snow"],
        #the purpose of this code is to extract all values from the list (if it is not empty) and list them separated by commas,
        #for instance, ["rain"] -> "rain", ["rain", "snow"] -> "rain, snow"
        #The goal is to avoid storing a list in the final dataset.

        if d["preciptype"] != None:
            preciptype.append(', '.join(d["preciptype"]))
        else:
            preciptype.append(d["preciptype"])

        snow.append(d["snow"])
        snowdepth.append(d["snowdepth"])
        windgust.append(d["windgust"])
        windspeed.append(d["windspeed"])
        winddir.append(d["winddir"])
        pressure.append(d["pressure"])
        cloudcover.append(d["cloudcover"])
        visibility.append(d["visibility"])
        uvindex.append(d["uvindex"])
        sunrise.append(d["sunrise"])
        sunset.append(d["sunset"])
        conditions.append(d["conditions"])
        description.append(d["description"])

https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline//Istanbul/2022-08-01/2022-08-31?unitGroup=us&include=days&key=7XS22D8HXWBYRJQEHA78ZHRTM&contentType=json
<Response [429]>
https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline//Istanbul/2021-12-01/2021-12-31?unitGroup=us&include=days&key=7XS22D8HXWBYRJQEHA78ZHRTM&contentType=json
<Response [429]>
https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline//Istanbul/2021-11-01/2021-11-30?unitGroup=us&include=days&key=7XS22D8HXWBYRJQEHA78ZHRTM&contentType=json
<Response [429]>


In [62]:
#Assemble a DataFrame from the lists, as it simplifies saving the data to a file.

weather_df = pd.DataFrame({
    "date": date,
    "tempmax": tempmax,
    "tempmin": tempmin,
    "temp": temp,
    "feelslikemax": feelslikemax,
    "feelslikemin": feelslikemin,
    "feelslike": feelslike,
    "dew": dew,
    "humidity": humidity,
    "precip": precip,
    "precipprob": precipprob,
    "precipcover": precipcover,
    "preciptype": preciptype,
    "snow": snow,
    "snowdepth": snowdepth,
    "windgust": windgust,
    "windspeed": windspeed,
    "winddir": winddir,
    "pressure": pressure,
    "cloudcover": cloudcover,
    "visibility": visibility,
    "uvindex": uvindex,
    "sunrise": sunrise,
    "sunset": sunset,
    "conditions": conditions,
    "description": description
})

weather_df.head()

Unnamed: 0,date,tempmax,tempmin,temp,feelslikemax,feelslikemin,feelslike,dew,humidity,precip,...,windspeed,winddir,pressure,cloudcover,visibility,uvindex,sunrise,sunset,conditions,description
0,2022-08-01,85.8,74.6,79.8,85.1,74.6,80.4,65.0,62.8,0.0,...,12.5,30.7,1009.4,31.0,7.7,8.0,05:59:38,20:20:37,Partially cloudy,Partly cloudy throughout the day.
1,2022-08-02,84.0,70.2,77.5,86.7,70.2,78.4,63.2,61.7,0.0,...,16.8,22.6,1011.3,22.8,7.8,,06:00:36,20:19:31,Partially cloudy,Partly cloudy throughout the day.
2,2022-08-03,83.0,72.8,77.8,86.9,72.8,79.0,67.7,71.3,0.0,...,20.0,27.1,1012.9,48.8,8.0,,06:01:34,20:18:23,Partially cloudy,Partly cloudy throughout the day.
3,2022-08-04,82.2,75.0,78.2,85.5,75.0,79.1,67.8,70.6,0.0,...,17.9,31.6,1013.2,39.6,7.9,9.0,06:02:33,20:17:13,Partially cloudy,Partly cloudy throughout the day.
4,2022-08-05,83.3,73.6,78.6,87.5,73.6,80.0,68.7,71.9,0.0,...,16.4,39.7,1012.4,36.7,7.9,,06:03:31,20:16:03,Partially cloudy,Partly cloudy throughout the day.


In [63]:
#Save the data to a file - include column names, but do not include the index.
weather_df.to_csv("./Resources/output/Istanbul_historical_weather.csv", header=True, index=False)