In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import shutil
import os
from datetime import datetime

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# This only works when the notebook runs on Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Remove the archive and any CSV files left in the working directory by a
# previous run (-f keeps this quiet when nothing matches).
!rm -rf dataset.zip *.csv

# Copy the dataset archive from the mounted Drive into the working directory.
# NOTE(review): the source path is relative, so this assumes the working
# directory is the parent of the `drive` mount point — confirm.
!cp drive/MyDrive/Uni/07/DataMining/dataset.zip ./

# Unzip the dataset into the working directory (-q suppresses the file listing)
!unzip -q dataset.zip

### Question 1

In [None]:
# Question 1 only needs these three columns, so only they are parsed.
# read_csv returns columns in file order regardless of the order given to
# usecols; the trailing selection pins the order used by the rest of the cells.
product_instance_cols = ['ISCONTROLAMVAL', 'ACCT_AC_HOLDING_ID', 'COSTLINE']
product_instance = pd.read_csv(
    'PRODUCTINSTANCE.csv',
    usecols=product_instance_cols,
    low_memory=False,
    encoding='cp1258',
)[product_instance_cols]

In [None]:
# Discard rows with any missing value, then keep only the rows whose control
# flag is exactly 'N' or 'Y'.
product_instance = product_instance.dropna()
has_known_flag = product_instance['ISCONTROLAMVAL'].isin(['N', 'Y'])
product_instance = product_instance[has_known_flag]

In [None]:
# Renumber the rows 0..n-1 and discard the old index.
product_instance = product_instance.reset_index(drop=True)

# Boolean view of the flag: True where ISCONTROLAMVAL is 'Y'.
control_flag = product_instance['ISCONTROLAMVAL']
product_instance = product_instance.assign(IS_CONTROLLED=control_flag.eq('Y'))

# Count each flag value and report the uncontrolled share of the two.
total_controlled = control_flag.eq('Y').sum()
total_uncontrolled = control_flag.eq('N').sum()
ratio = total_uncontrolled / (total_uncontrolled + total_controlled)
print(
    f'Total Controlled Products: {total_controlled}',
    f'Total Uncontrolled Products: {total_uncontrolled}',
    f'Ratio of Uncontrolled Products: {ratio}',
    sep='\n',
)

Total Controlled Products: 95675
Total Uncontrolled Products: 30720
Ratio of Uncontrolled Products: 0.24304758890778907


In [None]:
# Per-COSTLINE breakdown: how many rows are uncontrolled, out of how many,
# sorted so the groups with the largest uncontrolled share come first.
amin_amval_groups = product_instance.groupby(by='COSTLINE')['IS_CONTROLLED']
controlled_per_group = amin_amval_groups.sum()
rows_per_group = amin_amval_groups.count()
amin_amvals = pd.DataFrame({
    # Despite the label, this is a fraction between 0 and 1, not a 0-100 value.
    'Uncontrolled Percentage': 1 - (controlled_per_group / rows_per_group),
    'Uncontrolled': rows_per_group - controlled_per_group,
    'Total': rows_per_group
}).sort_values(by='Uncontrolled Percentage', ascending=False)
amin_amvals

Unnamed: 0_level_0,Uncontrolled Percentage,Uncontrolled,Total
COSTLINE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
105000390,1.000000,3,3
105003106,1.000000,1,1
105003208,1.000000,9,9
105003348,1.000000,8,8
105003590,1.000000,4,4
...,...,...,...
300001364,0.032382,101,3119
300171539,0.012470,63,5052
300005012,0.006491,178,27423
105004497,0.000000,0,725


In [None]:
# Per-ACCT_AC_HOLDING_ID breakdown: how many rows are uncontrolled, out of how
# many, sorted so the groups with the largest uncontrolled share come first.
amin_amval_groups = product_instance.groupby(by='ACCT_AC_HOLDING_ID')['IS_CONTROLLED']
controlled_per_group = amin_amval_groups.sum()
rows_per_group = amin_amval_groups.count()
amin_amvals = pd.DataFrame({
    # Despite the label, this is a fraction between 0 and 1, not a 0-100 value.
    'Uncontrolled Percentage': 1 - (controlled_per_group / rows_per_group),
    'Uncontrolled': rows_per_group - controlled_per_group,
    'Total': rows_per_group
}).sort_values(by='Uncontrolled Percentage', ascending=False)
amin_amvals

Unnamed: 0_level_0,Uncontrolled Percentage,Uncontrolled,Total
ACCT_AC_HOLDING_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.380986,29134,76470
3,0.03868,1231,31825
4,0.025857,328,12685
2,0.005348,2,374
5,0.004963,25,5037
21,0.0,0,4


### Question 2

In [None]:
def find_non_finals(df, cols, statuses=(3000018, 3000006)):
    """Return the rows of ``df`` whose ``C_DOCSTATUS_ID`` is in ``statuses``.

    Rows with a missing value in any column of ``df`` are ignored, and
    ``C_DOCSTATUS_ID`` is compared (and returned) as an integer. The input
    frame is never modified: all cleaning happens on a copy, so the caller's
    table looks the same before and after the call and the calling cell can be
    re-run safely.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing ``C_DOCSTATUS_ID`` and every column named in ``cols``.
    cols : list of str
        Columns to return, in this order.
    statuses : iterable of int, optional
        Status ids to keep. The default is the pair of ids this notebook
        filters on (presumably the "non-final" document states — confirm
        against the status lookup table).

    Returns
    -------
    pandas.DataFrame
        Matching rows restricted to ``cols``, re-indexed from 0.
    """
    # dropna() returns a new frame, leaving the caller's data untouched.
    complete_rows = df.dropna()
    status_ids = complete_rows['C_DOCSTATUS_ID'].astype('int')
    complete_rows = complete_rows.assign(C_DOCSTATUS_ID=status_ids)

    matches = complete_rows[status_ids.isin(list(statuses))]
    # drop=True discards the old row labels instead of adding an 'index' column.
    return matches.reset_index(drop=True)[cols]

In [None]:
# Status, id and document number are all this question needs from INOUT.
cols = ['C_DOCSTATUS_ID', 'INOUT_ID', 'DOCUMENTNO']
# usecols skips parsing every other column of the file; the trailing [cols]
# fixes the column order, since read_csv returns columns in file order.
inout = pd.read_csv('INOUT.csv', usecols=cols, low_memory=False)[cols]
find_non_finals(inout, cols)

Unnamed: 0,C_DOCSTATUS_ID,INOUT_ID,DOCUMENTNO
0,3000006,469639217,36.0
1,3000006,469640047,107.0
2,3000006,469640048,108.0
3,3000006,469640049,109.0
4,3000006,469640796,2992.0
5,3000006,469641285,3481.0
6,3000006,469642220,4411.0
7,3000006,469643504,5695.0
8,3000006,469644634,6744.0
9,3000006,469645454,6976.0


In [None]:
# Status, id and document number are all this question needs from TRANSFER_ITEM.
cols = ['C_DOCSTATUS_ID', 'TRANSFER_ITEM_ID', 'DOCUMENTNO']
# usecols skips parsing every other column of the file; the trailing [cols]
# fixes the column order, since read_csv returns columns in file order.
transfer_item = pd.read_csv('TRANSFER_ITEM.csv', usecols=cols, low_memory=False)[cols]
find_non_finals(transfer_item, cols)

Unnamed: 0,C_DOCSTATUS_ID,TRANSFER_ITEM_ID,DOCUMENTNO
0,3000006,1009561,19412.0
1,3000006,1010053,19820.0
2,3000006,1010067,19830.0
3,3000006,1010167,19895.0
4,3000006,1010188,19897.0
...,...,...,...
77,3000006,1016193,23002.0
78,3000006,1016196,23005.0
79,3000006,1016197,23006.0
80,3000006,1016199,23008.0


### Question 3

In [None]:
# Load tables: only the join key plus one payload column from each file.
# usecols avoids parsing the unused columns; the trailing selection fixes the
# column order, since read_csv returns columns in file order.
inout_cols = ['COM_BPARTNER_ID', 'INOUT_ID']
inoutline_cols = ['PRIMALVALUE', 'INOUT_ID']
inout = pd.read_csv('INOUT.csv', usecols=inout_cols, low_memory=False)[inout_cols]
inoutline = pd.read_csv('INOUTLINE.csv', usecols=inoutline_cols, low_memory=False)[inoutline_cols]

# Drop rows with missing values, then keep the last row seen per INOUT_ID.
inout = inout.dropna().drop_duplicates(subset='INOUT_ID', keep="last")
# NOTE(review): this keeps a single INOUTLINE row per INOUT_ID, so if one
# document can have several lines, only the last line's PRIMALVALUE reaches the
# totals below. Confirm this is intended before relying on the sums.
inoutline = inoutline.dropna().drop_duplicates(subset='INOUT_ID', keep="last")

# Join tables: only INOUT_IDs present in both frames survive the inner join.
joined_tb = pd.merge(inout, inoutline, how='inner', on='INOUT_ID')

# Sum the total value for each person
values = joined_tb.groupby(by='COM_BPARTNER_ID')['PRIMALVALUE'].sum().to_dict()

# Report the results
print('Amin Amval (ID) - Total Value')
print('=============================')
for partner_id, total_value in values.items():
    # int() prints ids and totals without a decimal part (any fraction is
    # truncated); the id is left-aligned in a 15-character column.
    print(f'{int(partner_id):<15} - {int(total_value)}')

Amin Amval (ID) - Total Value
0               - 42300000
1818            - 39240000
105000811       - 27881000
105001127       - 479827650
105001250       - 0
105001419       - 98328632
105002632       - 217728000
105002636       - 73638472
105002817       - 85500000
105002901       - 15799526978
105002922       - 61926845
105003052       - 59864144
105003348       - 82662367
105003455       - 7245941471
105003567       - 318650500
105003577       - 129183261936
105003935       - 533083959
105003986       - 21678836870
105004148       - 102575144
105004462       - 319182940
105004497       - 28686505000
105004794       - 31419526166
105004849       - 76959116
105004868       - 185633170
210032640       - 1520000
210032655       - 3638827792
210619283       - 4183295405
300001360       - 1982492305
300001361       - 4860000
300001364       - 2623393402
300001382       - 24330000
300001392       - 3544586782
300002668       - 2850000
300003210       - 18253167236
300005012       - 397345