In [1]:
import os
import pandas as pd

## Input-Output File Paths

In [2]:
# Despite the `_dir` suffix, these are file paths (relative to the notebook's
# working directory), not directories.
# Raw input exports; judging by the file names they cover 2020-2021 and 2022-2023.
input_file_dir_1 = "./nonfin 2020 2021.txt"
input_file_dir_2 = "./nonfin 2022 2023.txt"
# Single combined, cleaned output file written by the cells below.
output_file_dir = "./nonfin_2020_2023_striped.csv"

## Produce a Stripped CSV File
- Read each row from the input file and write it directly to the output file
- Constant memory usage, O(1), throughout the process, because rows are streamed one at a time
- Each column of the input file is stripped of leading and trailing whitespace

In [3]:
def clean(input_file_dir, output_file_dir, mode):
    """Stream a pipe-delimited text file into a cleaned pipe-delimited file.

    The input is read one line at a time and each cleaned line is written
    immediately, so the whole file is never held in memory.

    Cleaning applied to every row that is written:
      * every field is stripped of leading/trailing whitespace;
      * field 0 (dt) keeps only the text before the first space, i.e. the
        date part without the time;
      * fields 3 (branch_key) and 4 (sumtrx) have all commas removed.

    Blank (empty or whitespace-only) lines are skipped instead of crashing
    on the missing fields.

    Args:
        input_file_dir: Path of the pipe-delimited input file.
        output_file_dir: Path of the output file.
        mode: Mode used to open the output file. With "w" the output is
            overwritten and the input's first line (the header) is kept.
            With "a" the output is appended to and the input's first line
            is dropped, so the combined file has a single header.

    Raises:
        IndexError: If a non-blank row has fewer than five "|"-separated fields.
    """
    with open(input_file_dir, "r") as input_file, open(output_file_dir, mode) as output_file:
        if mode == "a":
            # Appending to an existing file: drop this input's header line.
            next(input_file, None)

        for row in input_file:
            if not row.strip():
                # A blank line has no fields; indexing columns[3] would raise.
                continue

            columns = [data.strip() for data in row.split("|")]  # fields as a list
            columns[0] = columns[0].split(" ")[0]       # dt: keep the date part, drop the time
            columns[3] = columns[3].replace(",", "")    # branch_key: remove commas
            columns[4] = columns[4].replace(",", "")    # sumtrx: remove commas

            output_file.write("|".join(columns) + "\n")

In [4]:
# The first file creates/overwrites the output ("w") and keeps its header;
# the second is appended ("a"), which makes clean() drop its header line.
for source_path, write_mode in (
    (input_file_dir_1, "w"),
    (input_file_dir_2, "a"),
):
    clean(source_path, output_file_dir, write_mode)

## Attempt to Read the CSV
- Only the first 100 rows are read, just to make sure that the format is correct and readable

In [5]:
# Sanity check: load only the first 100 data rows, every column as a string,
# using the first line of the file as the header.
test = pd.read_csv(
    output_file_dir,
    sep="|",
    header=0,
    dtype=str,
    nrows=100,
)
test

Unnamed: 0,dt,trans_type_key,trans_lng_nm,branch_key,sumtrx,branch_address1,branch_address2,region_key,jabo_nonjabotabek,city_key,geography,flag_big_city
0,2020-10-20,435,Pemesanan Warkat,838,3,SENTRA NIAGA HARAPAN INDAH KAV. CP2 NO.1 KOTA ...,17214,9,J,24,JAKARTA,B
1,2021-04-19,727,Cetak Mutasi Harian,259,2,JL. MUWARDI 1/44,11450,10,J,24,,B
2,2021-05-03,482,Buka Blokir Kartu,1074,2,"KOMP. PERUM TAMAN PALEM LESTARI C5 NO. 9 -11, ...",11730,12,J,24,,B
3,2021-03-08,383,Registrasi m-BCA,196,22,JL. TRUNOJOYO 145,68137,7,N,20,,K
4,2021-11-09,436,Penawaran Solusi,1336,1,"JL. RADEN SALEH NO. 39B, KEL. KARANG MULYA, KE...",15157,12,J,24,,B
...,...,...,...,...,...,...,...,...,...,...,...,...
95,2020-01-28,430,Cetak Mutasi Bulanan,327,25,"GEDUNG TANAH ABANG ZONA III LANTAI 3A L03A, JL...",10250,12,J,24,,B
96,2020-01-15,439,Cetak Portofolio Nasabah,30,14,JL. A YANI 91,59317,2,N,295,SEMARANG,K
97,2021-07-23,437,Penggantian Kartu,455,10,JL. NGINDEN SEMOLO / SEMOLOWARU 101 / 18-19,60119,3,N,131,,B
98,2021-02-16,480,Reset Kesalahan PIN,406,3,JL. RAYA DARMA GIRI NO. 88,80511,4,N,30,,B


In [6]:
# Summarise the preview frame: column names, non-null counts and dtypes.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   dt                 100 non-null    object
 1   trans_type_key     100 non-null    object
 2   trans_lng_nm       100 non-null    object
 3   branch_key         100 non-null    object
 4   sumtrx             100 non-null    object
 5   branch_address1    100 non-null    object
 6   branch_address2    100 non-null    object
 7   region_key         100 non-null    object
 8   jabo_nonjabotabek  100 non-null    object
 9   city_key           100 non-null    object
 10  geography          17 non-null     object
 11  flag_big_city      100 non-null    object
dtypes: object(12)
memory usage: 9.5+ KB
