In [3]:
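# This installs the libraries used in this notebook. If you don't have them installed, uncomment the line below,
# run this cell once, and then comment the line out again.
# (%pip installs into the environment of the running kernel, which "!pip" does not guarantee.)
# %pip install pandas requests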



In [4]:
import os
import warnings

import pandas as pd
import requests


# Never paste your OpenWeather API key into the notebook itself: anything you share or commit would leak it.
# Link to making an account: https://home.openweathermap.org/users/sign_up
# Either create a separate "open_weather_key.py" file next to this notebook that contains
#     API_KEY = "<your key>"
# (and keep that file out of version control), or set the OPENWEATHER_API_KEY environment variable.
try:
    from open_weather_key import API_KEY
except ImportError:
    # No key file found - fall back to the environment variable (empty string if it is not set).
    API_KEY = os.environ.get("OPENWEATHER_API_KEY", "")

if not API_KEY:
    # Warn instead of raising so the rest of the notebook can still be read and run offline.
    warnings.warn(
        "No OpenWeather API key found: create open_weather_key.py or set the "
        "OPENWEATHER_API_KEY environment variable before calling the weather API."
    )

In [5]:
# Location of the world-cities CSV, relative to this notebook.
# Download and unzip it from: https://www.kaggle.com/datasets/juanmah/world-cities
DATA_PATH = "../data/worldcities.csv"

In [6]:
# Load the CSV file with pandas; the result is a pd.DataFrame object.
cities_df = pd.read_csv(filepath_or_buffer=DATA_PATH)

# Preview the first 5 rows (5 is also what pd.DataFrame.head() shows by default).
cities_df.head(5)

Unnamed: 0,city,city_ascii,lat,lng,country,iso2,iso3,admin_name,capital,population,id
0,Tokyo,Tokyo,35.6897,139.6922,Japan,JP,JPN,Tōkyō,primary,37732000.0,1392685764
1,Jakarta,Jakarta,-6.175,106.8275,Indonesia,ID,IDN,Jakarta,primary,33756000.0,1360771077
2,Delhi,Delhi,28.61,77.23,India,IN,IND,Delhi,admin,32226000.0,1356872604
3,Guangzhou,Guangzhou,23.13,113.26,China,CN,CHN,Guangdong,admin,26940000.0,1156237133
4,Mumbai,Mumbai,19.0761,72.8775,India,IN,IND,Mahārāshtra,admin,24973000.0,1356226629


In [13]:
# Order the rows by the latitude column (ascending, so the most southerly cities come first)
# and keep the first 5 rows as a new dataframe of the southernmost cities.
south_5 = cities_df.sort_values(by="lat").head(5)

In [14]:
# Show only the latitude column of the 5 southernmost cities.
south_5.loc[:, "lat"]

43793   -54.9333
7018    -54.8019
909     -54.2833
44648   -54.2806
13663   -53.7833
Name: lat, dtype: float64

In [15]:
# Take the first element of the "lat" (latitude) and "lng" (longitude) columns.
# The row labels from the original file are preserved after sorting, so we use "iloc":
# it selects by position, i.e. it takes the first element regardless of its label.
first_position = 0
latitude, longitude = (south_5[column].iloc[first_position] for column in ("lat", "lng"))

In [16]:
def get_weather_api_url(latitude, longitude, api_key=None):
    """Build the OpenWeather "current weather" request URL for one location.

    Parameters
    ----------
    latitude, longitude
        Coordinates of the location; they are formatted into the query string as-is.
    api_key : str, optional
        Key placed in the ``appid`` query parameter. When omitted, the
        notebook-level ``API_KEY`` is used, exactly as before.

    Returns
    -------
    str
        The full request URL.
    """
    if api_key is None:
        # Looked up at call time so the key can be (re)loaded after this cell was run.
        api_key = API_KEY
    return f"https://api.openweathermap.org/data/2.5/weather?lat={latitude}&lon={longitude}&appid={api_key}"

In [17]:
# Let's get and preview weather data about the southernmost city in the dataset! (Should be Puerto Williams)
# The timeout (in seconds) stops this cell from hanging forever if the API cannot be reached.
response = requests.get(get_weather_api_url(latitude, longitude), timeout=10)

print(response.status_code)  # this should print 200 for a successful request.
# If it's not working in VSCode, try running the Jupyter notebook in the browser.
# Let's preview the JSON with weather info
response.json()

200


{'coord': {'lon': -67.6167, 'lat': -54.9333},
 'weather': [{'id': 500,
   'main': 'Rain',
   'description': 'light rain',
   'icon': '10n'}],
 'base': 'stations',
 'main': {'temp': 279.29,
  'feels_like': 279.29,
  'temp_min': 279.29,
  'temp_max': 279.29,
  'pressure': 978,
  'humidity': 95,
  'sea_level': 978,
  'grnd_level': 978},
 'visibility': 5501,
 'wind': {'speed': 0.29, 'deg': 233, 'gust': 0.9},
 'rain': {'1h': 0.3},
 'clouds': {'all': 96},
 'dt': 1710371706,
 'sys': {'type': 1,
  'id': 8303,
  'country': 'CL',
  'sunrise': 1710325127,
  'sunset': 1710370858},
 'timezone': -10800,
 'id': 3874926,
 'name': 'Puerto Williams',
 'cod': 200}

In [18]:
# Now, we're going to add a column with wind speed to the south_5 dataframe.

def get_wind_speed(latitude, longitude, timeout=10):
    """Return just the wind speed reported by the weather API for one location.

    Parameters
    ----------
    latitude, longitude
        Coordinates passed on to ``get_weather_api_url``.
    timeout : float, optional
        Seconds to wait for the API before giving up, so a stalled
        connection cannot block the notebook indefinitely.

    Returns
    -------
    The value stored under ``["wind"]["speed"]`` in the JSON response.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status (for example a rejected API key).
        Without this check the failure would only show up as a confusing
        ``KeyError`` on the missing "wind" entry.
    """
    response = requests.get(get_weather_api_url(latitude, longitude), timeout=timeout)
    response.raise_for_status()

    return response.json()["wind"]["speed"]

In [19]:
# Let's check the windspeed for our selected city.
# Note: the weather data are close to real-time, so the windspeed may change quite a lot if you run the cell again.
get_wind_speed(latitude=latitude, longitude=longitude)

0.29

In [20]:
# Now, let's get the wind data by looping over the coordinates of every city.
cities_list = south_5["city"].tolist()
print(cities_list)

# zip() pairs up the latitude and longitude of each row, in the same order as cities_list.
# The loop variables get their own names so the "latitude" / "longitude" of the selected
# city defined earlier are not overwritten (re-running earlier cells would otherwise
# silently query the last city of this loop).
windspeed_list = [
    get_wind_speed(city_lat, city_lng)
    for city_lat, city_lng in zip(south_5["lat"], south_5["lng"])
]

print(windspeed_list)

['Puerto Williams', 'Ushuaia', 'King Edward Point', 'Grytviken', 'Río Grande']
[0.29, 7.72, 1.67, 1.66, 8.09]


In [13]:
# Here is a more "pandas-native" way of getting the windspeeds.
# It becomes really helpful once the operations get more complex.

# A lambda function is a handy way to run the same operation on every element of an iterable.
# More on lambda: https://www.freecodecamp.org/news/python-lambda-function-explained/
# More on pd.DataFrame.apply: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.apply.html
# The "axis" parameter decides whether we're working with rows or columns;
# axis=1 hands each row to the lambda.
windspeeds = south_5.apply(
    lambda row: get_wind_speed(row["lat"], row["lng"]),
    axis=1,
)

# What comes back is a pd.Series, which can also be a single row or column of a DataFrame.
windspeeds

43793    1.54
7018     1.54
909      2.15
44648    2.12
13663    2.57
dtype: float64

In [14]:
# Let's add the windspeed Series as a column called "windspeed" to the south_5 DataFrame.
# assign() returns a new DataFrame (values are matched up by row label) instead of writing
# into the existing one, which was produced by slicing; re-running this cell gives the same result.
south_5 = south_5.assign(windspeed=windspeeds)

In [15]:
# Save the final DataFrame (including its row labels) as a new CSV file.
south_5.to_csv(path_or_buf="city_windspeed.csv")

In [16]:
# Recap - reading the file.
# When you have a longer data pipeline, you may want to save it every few steps and then read
# the saved partially processed data instead of running the process from the start again.
# It's also good practice to always save processed data under a different name and never edit the original dataset without a copy.

# The file was saved with the DataFrame's row labels as its first, unnamed column.
# index_col=0 turns that column back into the index instead of a stray "Unnamed: 0" column.
df = pd.read_csv("city_windspeed.csv", index_col=0)

# Now you're ready to do some further data processing!

### Homework
Find a different dataset, perform some operations on it and save the result as a new file. Try to find out three things about it. You can also combine it with an API if you want - try the OpenWeather API, or maybe Spotify, or Twitter?

You can use the datasets from here:
https://moodle.arts.ac.uk/mod/page/view.php?id=1165663

#### Example: Tate collections (https://www.data.gov.uk/dataset/ae739939-2aad-427b-b3b2-66a2bf2954a8/collections-database)

What is the oldest exhibit in the Tate collection?

What is the smallest one?

Which artist has the most artworks in the collection?

<b>Try to use pandas documentation and a search engine to find the methods you need. </b>

https://pandas.pydata.org/pandas-docs/stable/index.html