In [1]:
import os
import sys
import pandas as pd
import json
from datetime import datetime, timedelta
import requests

# Your JSON data

# Specify the file path
file_path = 'data/City_ID/city.list.json'

# Read JSON data from the file
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Create DataFrame
df = pd.json_normalize(data)

# Rename columns
df = df[['city.id.$numberLong', 'city.name']].rename(columns={'city.id.$numberLong': 'city_id', 'city.name': 'city_name'})


# Mảng city_name bạn có
name_to_find = 'Thanh pho Ho Chi Minh'

# id_citys = df[df['city_name'].isin(city_names)]

# Hàm để lấy ID từ tên
def get_id_by_name(name):
    row = df.loc[df['city_name'] == name]
    if not row.empty:
        return row['city_id'].iloc[0]
    else:
        return None

# Kiểm tra

ID_HCM_city = get_id_by_name(name_to_find)

ID_HCM_city

print(ID_HCM_city)

1566083


In [2]:
def loadDataWeatherOneDay(BASE_URL, ID_HCM_city, timestamp, AIP_ID):
    url = f"{BASE_URL}?id={ID_HCM_city}&type=hour&start={timestamp}&appid={AIP_ID}"
    response = requests.get(url)

    if response.status_code == 200 : #and ("message" not in response.json().keys())
        # phản hồi là một mảng chứa hai mảng - [[{page: 1, ...}], [{year: 2018, SP.POP.TOTL: 123455}, ...]]
        # do đó chúng ta kiểm tra xem độ dài của phản hồi có > 1 hay không
        if len(response.json()) > 1:
            data = response.json()

            # Extract the 'list' key from the data
            list_data = data.get('list', [])
            

            # Create DataFrame
            df = pd.json_normalize(list_data)
            num_columns = pd.json_normalize(df['weather'][0]).shape[1]

            # Use json_normalize to flatten the 'weather' column for all rows
            weather_df = pd.concat([pd.json_normalize(weather) for weather in df['weather']], axis=1)
            
            # Reshape the DataFrame
            weather_df = pd.DataFrame(weather_df.values.reshape((-1, num_columns)), columns=weather_df.columns[:num_columns])

            # Concatenate the original DataFrame with the new weather DataFrame
            df = pd.concat([df, weather_df], axis=1)

            # Drop the original 'weather' column
            df = df.drop('weather', axis=1)
            return df


        else:
            # In thông báo lỗi nếu lệnh gọi API không thành công
            print("Error in Loading the data. Status Code: " + str(response.status_code))
            return None


def loadDataWeather(BASE_URL, ID_HCM_city, AIP_ID, start_date, end_date):
    current_date = start_date

    # Khai báo DataFrame để lưu trữ dữ liệu
    df_full = pd.DataFrame()
    
    while current_date <= end_date:
    
        timestamp = int(current_date.timestamp())

        df_dataOneDay = loadDataWeatherOneDay(BASE_URL, ID_HCM_city, timestamp, AIP_ID)
    
        # Concatenate with the full DataFrame
        df_full = pd.concat([df_full, df_dataOneDay], axis=0)
    
        current_date += timedelta(days=1)
    
        
    return df_full

In [3]:

# URL cơ sở được sử dụng trong tất cả các lệnh gọi API
BASE_URL = 'https://history.openweathermap.org/data/2.5/history/city'

AIP_ID = '626e8ec21c8de03a592d15a0f2dca7f9'
# ID_HCM_city = '1566083'

start_date_str = '2022-12-10'
end_date_str = '2023-12-07'

start_date = datetime.strptime(start_date_str, '%Y-%m-%d')
end_date = datetime.strptime(end_date_str, '%Y-%m-%d')


# In thông tin của một số dòng đầu tiên trong DataFrame
df_weather_HCM_city = loadDataWeather(BASE_URL, ID_HCM_city, AIP_ID, start_date, end_date)
df_weather_HCM_city.head()

Unnamed: 0,dt,main.temp,main.feels_like,main.pressure,main.humidity,main.temp_min,main.temp_max,wind.speed,wind.deg,clouds.all,id,main,description,icon,rain.1h,wind.gust
0,1670605000.0,299.16,299.16,1010.0,94.0,299.16,299.16,1.03,0.0,40.0,802,Clouds,scattered clouds,03n,,
1,1670609000.0,299.16,299.16,1009.0,94.0,299.16,299.16,1.03,0.0,40.0,802,Clouds,scattered clouds,03n,,
2,1670612000.0,298.16,299.33,1009.0,100.0,298.16,298.16,1.03,0.0,40.0,802,Clouds,scattered clouds,03n,,
3,1670616000.0,298.16,298.88,1008.0,83.0,298.16,298.16,1.03,20.0,40.0,802,Clouds,scattered clouds,03n,,
4,1670620000.0,298.16,298.88,1008.0,83.0,298.16,298.16,1.03,50.0,40.0,802,Clouds,scattered clouds,03n,,


In [4]:
df_weather_HCM_city.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9720 entries, 0 to 23
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   dt               8712 non-null   float64
 1   main.temp        8712 non-null   float64
 2   main.feels_like  8712 non-null   float64
 3   main.pressure    8712 non-null   float64
 4   main.humidity    8712 non-null   float64
 5   main.temp_min    8712 non-null   float64
 6   main.temp_max    8712 non-null   float64
 7   wind.speed       8712 non-null   float64
 8   wind.deg         8712 non-null   float64
 9   clouds.all       8712 non-null   float64
 10  id               8775 non-null   object 
 11  main             8775 non-null   object 
 12  description      8775 non-null   object 
 13  icon             8775 non-null   object 
 14  rain.1h          926 non-null    float64
 15  wind.gust        115 non-null    float64
dtypes: float64(12), object(4)
memory usage: 1.3+ MB


In [5]:
df_weather_HCM_city = df_weather_HCM_city.rename(columns={'dt': 'datetime', 
                                                            'main.temp': 'temp (K)',
                                                            'main.feels_like': 'feels_like',
                                                            'main.pressure': 'pressure',
                                                            'main.humidity': 'humidity',
                                                            'main.temp_min': 'temp_min',
                                                            'main.temp_max': 'temp_max',
                                                            'wind.speed': 'wind_speed',
                                                            'wind.deg': 'wind_deg',
                                                            'clouds.all': 'clouds_all',
                                                            'id': 'id_weatrher',
                                                            'main': 'main_weatrher',
                                                            'description': 'description_weatrher',
                                                            'icon': 'icon_weatrher',
                                                            'rain.1h': 'rain_1h',
                                                            'wind.gust': 'wind_gust',
                                                           })

In [6]:
df_weather_HCM_city.head()

Unnamed: 0,datetime,temp (K),feels_like,pressure,humidity,temp_min,temp_max,wind_speed,wind_deg,clouds_all,id_weatrher,main_weatrher,description_weatrher,icon_weatrher,rain_1h,wind_gust
0,1670605000.0,299.16,299.16,1010.0,94.0,299.16,299.16,1.03,0.0,40.0,802,Clouds,scattered clouds,03n,,
1,1670609000.0,299.16,299.16,1009.0,94.0,299.16,299.16,1.03,0.0,40.0,802,Clouds,scattered clouds,03n,,
2,1670612000.0,298.16,299.33,1009.0,100.0,298.16,298.16,1.03,0.0,40.0,802,Clouds,scattered clouds,03n,,
3,1670616000.0,298.16,298.88,1008.0,83.0,298.16,298.16,1.03,20.0,40.0,802,Clouds,scattered clouds,03n,,
4,1670620000.0,298.16,298.88,1008.0,83.0,298.16,298.16,1.03,50.0,40.0,802,Clouds,scattered clouds,03n,,


In [7]:

df_weather_HCM_city.to_csv("data/raw_data.csv",sep = ',', encoding= 'utf-8', index=False) 