<a href="https://colab.research.google.com/github/JarekMaleszyk/pandas-project-data/blob/main/random_csv_file_sparring.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Upgrade pip itself before anything else is installed in this runtime.
!pip install --upgrade pip
# Optional version pins, left disabled. If re-enabled, the numpy specifier must stay
# quoted, otherwise the shell treats `<` and `>` as redirections.
# !pip install --upgrade pandas==2.1.4
# !pip install --upgrade "numpy<2.0.0,>=1.19.0"

import pandas as pd
import numpy as np
# Report the library versions this notebook is running against.
pandas_version, numpy_version = pd.__version__, np.__version__
print(f'Pandas version: {pandas_version}')
print(f'Numpy version: {numpy_version}')

In [None]:
# Start from a clean slate: remove any previous download directory.
# NOTE(review): the removal uses an absolute path while DIRECTORY below is relative,
# so both refer to the same folder only when the working directory is /content — confirm.
!rm -rf "/content/international_trade_march"
DIRECTORY='international_trade_march'
import os
# Recreate the directory if it is missing and report when it was created.
if not os.path.exists(DIRECTORY):
    os.makedirs(DIRECTORY)
    print(f"Directory '{DIRECTORY}' created successfully.")

In [None]:
import os

# The archive is downloaded into DOWNLOAD_DIR (see the wget call below), so a stale
# copy has to be looked up there rather than in the current working directory.
DOWNLOAD_DIR = '/content/international_trade_march'
ZIP_FILE_NAME = 'international-trade-march-2024-quarter.zip'
zip_file_path = os.path.join(DOWNLOAD_DIR, ZIP_FILE_NAME)

# Remove a previous download first; otherwise wget keeps the old file and stores
# the new one under a numbered name (".zip.1"), which the unzip step never reads.
if os.path.exists(zip_file_path):
  os.remove(zip_file_path)
else:
  print(f"The file {ZIP_FILE_NAME} does not exist")

!wget -P "/content/international_trade_march" "https://www.stats.govt.nz/assets/Uploads/International-trade/International-trade-March-2024-quarter/Download-data/international-trade-march-2024-quarter.zip"

In [None]:
# Extract the downloaded archive into the data directory (-d);
# -o overwrites already-extracted files without prompting, so the cell can be re-run.
!unzip -o "/content/international_trade_march/international-trade-march-2024-quarter.zip" -d "/content/international_trade_march"

Read and clean main data from csv:

In [None]:
# Load the main trade data set. A missing file is only reported, not raised,
# so `output_csv_full` stays undefined in that case.
try:
  output_csv_full = pd.read_csv(
      '/content/international_trade_march/output_csv_full.csv',
      header=0,  # the first row holds the column names
  )
except FileNotFoundError:
  print('File not found.')
else:
  print('File read successfully')

In [None]:
# Store the product type and country code columns as pandas categoricals.
for categorical_column in ('product_type', 'country_code'):
  output_csv_full[categorical_column] = output_csv_full[categorical_column].astype('category')

In [None]:
# Preview the first rows of the main data set as plain text.
first_rows = output_csv_full.head()
print(first_rows)

In [None]:
# Flag rows that exactly repeat an earlier row, then report how many there are.
duplicate_rows = output_csv_full.duplicated()
duplicate_count = duplicate_rows.sum()
print("Number of duplicate rows:", duplicate_count)

In [None]:
# Boolean mask of missing cells; summing it yields the number of nulls per column.
rows_with_nulls = output_csv_full.isna()
# The label used to read "Number of duplicate rows:" (copied from the duplicate
# check), which misdescribed these per-column null counts. The counts are printed
# on their own lines so the Series stays aligned.
print("Number of null values per column:", rows_with_nulls.sum(), sep="\n")

In [None]:
# output_csv_full.sort_values(by=['value'], inplace=True, ascending=False)

In [None]:
# Count rows with and without a country code (True = missing).
country_code_missing = output_csv_full['country_code'].isna()
print(country_code_missing.value_counts())

In [None]:
# Drop the rows that have a null in the country_code column (in place),
# then re-run the missing-value count to confirm the result.
output_csv_full.dropna(subset=['country_code'], inplace=True)
remaining_missing = output_csv_full['country_code'].isna()
print(remaining_missing.value_counts())

In [None]:
# Display the rows whose `value` is missing.
value_is_missing = output_csv_full['value'].isna()
output_csv_full[value_is_missing]

In [None]:
# Use 0 instead of null/NaN in the value column, then summarise the column.
output_csv_full['value'] = output_csv_full['value'].fillna(0)
output_csv_full['value'].info()

In [None]:
# time_ref is sliced as text: characters 0-3 become the year, characters 4-5 the month.
time_ref_text = output_csv_full['time_ref'].astype(str)
output_csv_full['year'] = time_ref_text.str[:4].astype(int)
output_csv_full['month'] = time_ref_text.str[4:6].astype(int)

In [None]:
# Assemble a first-of-month timestamp from the year and month columns.
date_parts = output_csv_full[['year', 'month']].assign(day=1)
output_csv_full['date'] = pd.to_datetime(date_parts)

In [None]:
# Drop the helper columns once `date` has been built. Re-running this cell raises
# KeyError because the columns are already gone; only that case is treated as
# benign. Any other exception (or a keyboard interrupt) now propagates instead of
# being hidden by a bare `except:`.
try:
  output_csv_full = output_csv_full.drop(columns=['time_ref', 'year', 'month'])
except KeyError:
  print('Columns already dropped')

Read and clean countries dictionary from csv:

In [None]:
# Load the country lookup table. A missing file is only reported, not raised,
# so `country_classification` stays undefined in that case.
try:
  country_classification = pd.read_csv(
      '/content/international_trade_march/country_classification.csv',
      header=0,  # the first row holds the column names
  )
except FileNotFoundError:
  print('File not found.')
else:
  print('File read successfully')

In [None]:
# Store the country code as a categorical, then summarise the lookup table
# including non-null counts per column.
country_classification['country_code'] = country_classification['country_code'].astype('category')
country_classification.info(show_counts=True)

Merge the trade data with the country classification and drop unmatched rows:

In [None]:
# Left join: keep every trade row and attach the country classification columns by country_code.
merged_data = pd.merge(
    output_csv_full,
    country_classification,
    how='left',
    on='country_code',
)

In [None]:
# Remove unmatched records: rows whose country_label is missing.
unmatched_rows = merged_data.index[merged_data['country_label'].isna()]
merged_data.drop(index=unmatched_rows, inplace=True)

In [None]:
# Replace the index with a fresh 0..n-1 range; the old index is discarded (drop=True).
merged_data = merged_data.reset_index(drop=True)

Read and clean goods classification dictionary from csv:

In [None]:
# Load the goods classification table. A missing file is only reported, not raised,
# so `goods_classification` stays undefined in that case.
try:
  goods_classification = pd.read_csv(
      '/content/international_trade_march/goods_classification.csv',
      header=0,  # the first row holds the column names
  )
except FileNotFoundError:
  print('File not found.')
else:
  print('File read successfully')

Read and clean services classification dictionary from csv:

In [None]:
# Load the services classification table. A missing file is only reported, not raised,
# so `services_classification` stays undefined in that case.
try:
  services_classification = pd.read_csv(
      '/content/international_trade_march/services_classification.csv',
      header=0,  # the first row holds the column names
  )
except FileNotFoundError:
  print('File not found.')
else:
  print('File read successfully')

In [None]:
# Store the service code as a pandas categorical.
services_classification['code'] = services_classification['code'].astype('category')

Merge with the services classification:

In [None]:
# Left join: keep every row of merged_data and attach the services classification columns by code.
merged_data = pd.merge(
    merged_data,
    services_classification,
    how='left',
    on='code',
)

In [None]:
# Fill missing service labels with 'Unknown'.
service_label_missing = merged_data['service_label'].isna()
merged_data.loc[service_label_missing, 'service_label'] = 'Unknown'

In [None]:
# Summarise the merged frame: columns, dtypes and non-null counts per column.
merged_data.info(show_counts=True)

In [None]:
# Keep only the rows booked on the 'Exports' account.
is_export_row = merged_data['account'].eq('Exports')
merged_data_exports = merged_data.loc[is_export_row]

In [None]:
# Keep only the rows booked on the 'Imports' account.
is_import_row = merged_data['account'].eq('Imports')
merged_data_imports = merged_data.loc[is_import_row]

Create an export/import analysis for every month:

In [None]:
# Total value per country and month, separately for exports and imports.
# Date range noted when this was written: 2014-06-01 (min) to 2024-03-01 (max).
monthly_group_keys = ['country_code', 'date']

merged_data_exports_sum = (
    merged_data_exports
    .groupby(monthly_group_keys)['value']
    .sum()
    .reset_index()
    .rename(columns={'value': 'export_value'})
)

merged_data_imports_sum = (
    merged_data_imports
    .groupby(monthly_group_keys)['value']
    .sum()
    .reset_index()
    .rename(columns={'value': 'import_value'})
)

In [None]:
# Build a calendar with one row per month start ("MS") between the first and the
# last date found in merged_data.
start, stop = merged_data.date.min(), merged_data.date.max()
# pd.date_range returns a DatetimeIndex, so the previous `canvas: pd.DataFrame`
# annotation on that intermediate value was wrong; the single-column frame is
# built directly and the annotation now matches the object it describes.
canvas: pd.DataFrame = pd.DataFrame({'date': pd.date_range(start, stop, freq="MS")})

In [None]:
# Exports per country laid over the monthly calendar; months without exports get 0.
data_with_export_sum = canvas.merge(merged_data_exports_sum, on=['date'], how='left')
data_with_export_sum.export_value = data_with_export_sum.export_value.fillna(0)

# Combine exports and imports with an OUTER join on (country_code, date).
# Joining imports onto the export rows with a left join silently dropped every
# country-month that had imports but no exports, so those months were missing
# from the analysis instead of showing export_value = 0.
export_import_sum = merged_data_exports_sum.merge(
    merged_data_imports_sum,
    on=['country_code', 'date'],
    how='outer',
)

# Lay the combined figures over the calendar so every month is present, then
# treat a missing side of the trade as 0.
data_with_export_import_sum = (
    canvas
    .merge(export_import_sum, on=['date'], how='left')
    .fillna({'export_value': 0, 'import_value': 0})
)

In [None]:
# income = exports minus imports for each row.
export_values = data_with_export_import_sum['export_value']
import_values = data_with_export_import_sum['import_value']
data_with_export_import_sum['income'] = export_values - import_values

In [None]:
# Display the rows for country code 'AD'.
is_ad_row = data_with_export_import_sum['country_code'].eq('AD')
data_with_export_import_sum.loc[is_ad_row]

In [None]:
from matplotlib import pyplot as plt

# Line chart of income over time for country code 'AD'.
income_for_ad = data_with_export_import_sum[data_with_export_import_sum['country_code'] == 'AD']
income_plot_options = dict(
    kind='line',
    x='date',
    y='income',
    figsize=(10, 6),
    title='income',
    linestyle='dashdot',
)
income_for_ad.plot(**income_plot_options)

# Hide the top and right borders of the current axes.
current_axes = plt.gca()
current_axes.spines[['top', 'right']].set_visible(False)

In [None]:
# Work on an independent copy so changes to case_test_dataframe do not modify
# data_with_export_import_sum.
case_test_dataframe = data_with_export_import_sum.copy()

In [None]:
# Label each row by income band. np.select takes the first condition that holds,
# so the bands are listed from highest to lowest:
#   income >= 80000          -> 'Very good.'
#   50000 < income < 80000   -> 'Good.'
#   20000 < income <= 50000  -> 'Medium.'
#   income <= 20000          -> 'Low'
# Rows matching no band (only possible for NaN income) get 'None'.
income_column = case_test_dataframe['income']
income_band_conditions = [
    income_column >= 80000,
    income_column > 50000,
    income_column > 20000,
    income_column <= 20000,
]
income_band_labels = ['Very good.', 'Good.', 'Medium.', 'Low']
case_test_dataframe['summary_income'] = np.select(
    income_band_conditions,
    income_band_labels,
    default='None',
)

case_test_dataframe