In [1]:
import pandas as pd

In [5]:
# Two-letter region code -> country name as it is spelled in the `region`
# column (the names are what `preprocessing` filters on).
country_code_to_name = {
    "at": "Austria",
    "be": "Belgium",
    "ca": "Canada",
    "ch": "Switzerland",
    "cz": "Czech Republic",
    "de": "Germany",
    "dk": "Denmark",
    "es": "Spain",
    "fi": "Finland",
    "fr": "France",
    "gb": "United Kingdom",
    "gr": "Greece",
    "hu": "Hungary",
    "ie": "Ireland",
    "is": "Iceland",
    "it": "Italy",
    "jp": "Japan",
    "mx": "Mexico",
    "nl": "Netherlands",
    "no": "Norway",
    "pl": "Poland",
    "pt": "Portugal",
    "se": "Sweden",
    "sk": "Slovakia",
    "tr": "Turkey",
    "us": "United States",
}

# Codes and names as parallel lists, in the insertion order of the mapping.
country_codes = [*country_code_to_name]
country_names = [*country_code_to_name.values()]

# Columns retained after filtering: row identifiers, the stream count and the
# `af_*` feature columns.
columns_to_keep = [
    'date',
    'region',
    'chart',
    'streams',
    'af_danceability',
    'af_energy',
    'af_key',
    'af_loudness',
    'af_speechiness',
    'af_acousticness',
    'af_valence',
    'af_tempo',
]


In [13]:
def preprocessing(data_frame, start_date='2018-01-01', end_date='2019-12-31',
                  chart='top200', regions=None, columns=None):
    """Filter one chunk of chart data and keep only the wanted columns.

    A row is kept when its date lies in ``[start_date, end_date]`` (both ends
    inclusive), its ``region`` is one of ``regions`` and its ``chart`` equals
    ``chart``. The input frame is NOT modified; the parsed dates only appear
    in the returned frame.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Must contain 'date', 'region' and 'chart' columns plus every column
        listed in ``columns``.
    start_date, end_date : str or datetime-like
        Inclusive bounds of the date window.
    chart : str
        Chart type to keep.
    regions : iterable of str, optional
        Region names to keep. Defaults to the module-level ``country_names``.
    columns : iterable of str, optional
        Columns of the result, in order. Defaults to the module-level
        ``columns_to_keep``.

    Returns
    -------
    pandas.DataFrame
        The matching rows (original index preserved) restricted to
        ``columns``, with 'date' converted to datetime.

    Raises
    ------
    KeyError
        If a required column is missing from ``data_frame``.
    """
    # Resolve the defaults lazily so the notebook-level configuration is
    # picked up at call time.
    if regions is None:
        regions = country_names
    if columns is None:
        columns = columns_to_keep

    # `assign` returns a new frame, so the caller's chunk keeps its original
    # 'date' column instead of being overwritten in place.
    converted = data_frame.assign(date=pd.to_datetime(data_frame['date']))

    in_date_range = (converted['date'] >= start_date) & (converted['date'] <= end_date)
    in_regions = converted['region'].isin(regions)
    is_wanted_chart = converted['chart'] == chart

    # Apply the row filter and the column selection in a single .loc step.
    return converted.loc[in_date_range & in_regions & is_wanted_chart, list(columns)]

def read_big_file(file_name, chunksize=10000):
    """Lazily yield the CSV file ``file_name`` as consecutive DataFrame chunks.

    Parameters
    ----------
    file_name : str or path-like
        CSV file to read.
    chunksize : int, default 10000
        Maximum number of rows per yielded chunk.

    Yields
    ------
    pandas.DataFrame
        The next block of at most ``chunksize`` rows.
    """
    # Using the reader as a context manager closes the underlying file handle
    # even when the consumer stops iterating before the file is exhausted.
    with pd.read_csv(file_name, chunksize=chunksize) as reader:
        yield from reader


In [14]:
output_file = 'processed_data.csv'

# Tracks whether the output file has been created yet; flips to True after the
# first non-empty chunk has been written.
header_written = False

# Stream the merged file chunk by chunk, filter each chunk and persist it.
for data_frame in read_big_file('merged_data.csv'):
    processed_data = preprocessing(data_frame)

    # Chunks with no remaining rows after filtering contribute nothing.
    if not processed_data.empty:
        # First write: create/truncate the file and emit the header row.
        # Later writes: append the rows only.
        processed_data.to_csv(
            output_file,
            mode='a' if header_written else 'w',
            header=not header_written,
            index=False,
        )
        header_written = True


In [17]:
processed_df = pd.read_csv('processed_data.csv')

# Sanity check: the number of distinct regions should equal the number of
# countries we filtered on - it's OK, 26 countries.
processed_df['region'].nunique(dropna=False)

26

Final preparation: aggregate the data by month and country.

In [18]:
# Dates come back from the CSV as text; parse them and derive a 'YYYYMM' key
# that identifies the calendar month of every row.
processed_df['date'] = pd.to_datetime(processed_df['date'])
processed_df['year_month'] = processed_df['date'].dt.strftime('%Y%m')

# Feature columns that are averaged within each (region, month) group.
columns_to_average = [
    'af_danceability',
    'af_energy',
    'af_key',
    'af_loudness',
    'af_speechiness',
    'af_acousticness',
    'af_valence',
    'af_tempo',
]

# One row per (region, month): streams are summed, features are averaged.
aggregated_df = (
    processed_df
    .groupby(['region', 'year_month'])
    .agg({'streams': 'sum', **dict.fromkeys(columns_to_average, 'mean')})
    .reset_index()
)

aggregated_df.to_csv('aggregated_monthly_data.csv', index=False)