# Getting Started

This netebook contains some minimal setup code to get started with the data analysis, including the installation of the required libraries and data loading fraom files.

### Install some required libraries

Depending on the system configuration it may be necessary to install more libraries.

### Shared functions loading
Load a library of shared functions that may be used for the analsys if the data. If any part of the loading fails, this may be fixed by installing more libraries in the cell above.

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [2]:
def read_transaction_data(DIR_INPUT, BEGIN_DATE, END_DATE):
    
    files = [os.path.join(DIR_INPUT, f) for f in os.listdir(DIR_INPUT) if f>=BEGIN_DATE+'.pkl' and f<=END_DATE+'.pkl']

    frames = []
    for f in files:
        df = pd.read_pickle(f)
        frames.append(df)
        del df
    df_final = pd.concat(frames)
    
    df_final=df_final.sort_values('TRANSACTION_ID')
    df_final.reset_index(drop=True,inplace=True)
    #  Note: -1 are missing values for realpi world data 
    df_final=df_final.replace([-1],0)
    
    #drop the TX_DATETIME column because ai model only works with numbers
    return df_final

### Load data
Using the shared_functions.py library, load the input data into a Pandas dataframe 

In [3]:
# Load data from the 2018-07-25 to the 2018-08-14

DIR_INPUT='../simulated-training-data-raw' 

BEGIN_DATE = "2018-07-25"
END_DATE = "2018-08-14"

print("Load  files...")
transactions_df=read_transaction_data(DIR_INPUT, BEGIN_DATE, END_DATE)
print("Done")


Load  files...
Done


In [4]:
transactions_df

Unnamed: 0,TRANSACTION_ID,TX_DATETIME,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,TX_TIME_SECONDS,TX_TIME_DAYS,TX_FRAUD,TX_FRAUD_SCENARIO
0,1102483,2018-07-25 00:00:29,1111,2328,40.77,9936029,115,0,0
1,1102484,2018-07-25 00:01:08,676,6846,9.62,9936068,115,0,0
2,1102485,2018-07-25 00:01:35,402,4771,81.55,9936095,115,0,0
3,1102486,2018-07-25 00:01:43,4218,863,23.10,9936103,115,0,0
4,1102487,2018-07-25 00:02:26,3711,3599,59.25,9936146,115,0,0
...,...,...,...,...,...,...,...,...,...
201290,1303773,2018-08-14 23:57:03,460,6133,16.72,11750223,135,0,0
201291,1303774,2018-08-14 23:58:24,3101,3229,38.16,11750304,135,0,0
201292,1303775,2018-08-14 23:58:24,4783,7511,69.85,11750304,135,0,0
201293,1303776,2018-08-14 23:58:45,2883,8550,14.99,11750325,135,0,0


In [5]:
transactions_df["TX_DATETIME"] = transactions_df["TX_DATETIME"].astype(np.int64)

In [6]:
train, test = train_test_split(transactions_df, test_size=0.2)

In [7]:
X_train = train[["TRANSACTION_ID","TX_DATETIME","CUSTOMER_ID","TERMINAL_ID","TX_AMOUNT","TX_TIME_SECONDS","TX_TIME_DAYS"]]

In [8]:
X_test = test[["TRANSACTION_ID","TX_DATETIME","CUSTOMER_ID","TERMINAL_ID","TX_AMOUNT","TX_TIME_SECONDS","TX_TIME_DAYS"]]

In [9]:
Y_train = train["TX_FRAUD"]

In [10]:
Y_test = test["TX_FRAUD"]

In [12]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [13]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, Y_train)


In [14]:
y_pred = knn.predict(X_test)

accuracy = accuracy_score(Y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")


Accuracy: 0.9919


In [23]:
# Load data from the 2018-07-25 to the 2018-08-14

DIR_INPUT='../training-data/input' 

BEGIN_DATE = "2018-04-01"
END_DATE = "2018-09-30"

print("Load  files...")
input_training_data=read_transaction_data(DIR_INPUT, BEGIN_DATE, END_DATE)
print("Done")


Load  files...
Done


In [16]:
input_training["TX_DATETIME"] = input_training["TX_DATETIME"].astype(np.int64)

In [17]:
input_training = scaler.fit_transform(input_training)


In [18]:
input_training.shape

(1754155, 7)

In [24]:
y_pred.shape

(1754155,)

In [20]:
y_pred = knn.predict(input_training)

In [21]:
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [25]:
input_training_data["prediction"] = y_pred

In [26]:
input_training_data

Unnamed: 0,TRANSACTION_ID,TX_DATETIME,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,TX_TIME_SECONDS,TX_TIME_DAYS,prediction
0,0,2018-04-01 00:00:31,596,3156,57.16,31,0,0
1,1,2018-04-01 00:02:10,4961,3412,81.51,130,0,0
2,2,2018-04-01 00:07:56,2,1365,146.00,476,0,0
3,3,2018-04-01 00:09:29,4128,8737,64.49,569,0,0
4,4,2018-04-01 00:10:34,927,9906,50.99,634,0,0
...,...,...,...,...,...,...,...,...
1754150,1754150,2018-09-30 23:56:36,161,655,54.24,15810996,182,0
1754151,1754151,2018-09-30 23:57:38,4342,6181,1.23,15811058,182,0
1754152,1754152,2018-09-30 23:58:21,618,1502,6.62,15811101,182,0
1754153,1754153,2018-09-30 23:59:52,4056,3067,55.40,15811192,182,0


In [27]:
filtered_df = input_training_data[input_training_data['prediction'] == 1]

In [28]:
filtered_df

Unnamed: 0,TRANSACTION_ID,TX_DATETIME,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,TX_TIME_SECONDS,TX_TIME_DAYS,prediction
3527,3527,2018-04-01 10:17:43,3774,3059,225.41,37063,0,1
5790,5790,2018-04-01 13:31:48,4944,6050,222.26,48708,0,1
6549,6549,2018-04-01 14:42:02,4625,9102,226.40,52922,0,1
10749,10749,2018-04-02 06:06:00,3770,5671,264.95,108360,1,1
11556,11556,2018-04-02 07:46:45,1863,4841,265.17,114405,1,1
...,...,...,...,...,...,...,...,...
1750457,1750457,2018-09-30 13:43:56,4314,5774,386.35,15774236,182,1
1750861,1750861,2018-09-30 14:19:26,3280,1405,427.50,15776366,182,1
1751380,1751380,2018-09-30 15:05:02,3516,6801,569.40,15779102,182,1
1752248,1752248,2018-09-30 16:36:25,1771,7763,375.95,15784585,182,1


In [29]:
print(sum(y_pred))

3062


In [30]:
ids = filtered_df["TRANSACTION_ID"]

In [31]:
ids

3527          3527
5790          5790
6549          6549
10749        10749
11556        11556
            ...   
1750457    1750457
1750861    1750861
1751380    1751380
1752248    1752248
1753081    1753081
Name: TRANSACTION_ID, Length: 3062, dtype: int64

In [44]:
def write_list_to_text_file(filename, input_list):
    try:
        with open(filename, 'w') as file:
            for item in input_list:
                file.write(str(item) + '\n')
        print(f"Successfully wrote {len(input_list)} items to {filename}")
    except IOError as e:
        print(f"Error writing to {filename}: {e}")

In [33]:
lst = ids.to_numpy()

In [38]:
lst = lst.reshape(1,-1)

In [41]:
lst = lst.tolist()

In [42]:
lst

[[3527,
  5790,
  6549,
  10749,
  11556,
  15257,
  18260,
  23511,
  26719,
  28150,
  30183,
  30439,
  31579,
  33277,
  36625,
  37136,
  38021,
  43829,
  43956,
  44519,
  47090,
  47355,
  49044,
  49535,
  49563,
  50116,
  50568,
  50767,
  50862,
  51196,
  51812,
  52458,
  52729,
  53149,
  55431,
  55761,
  55994,
  56310,
  56659,
  57653,
  59148,
  59566,
  61052,
  61762,
  63254,
  64339,
  64412,
  64505,
  65694,
  65939,
  66864,
  67690,
  67727,
  69251,
  69714,
  70184,
  70294,
  70583,
  70781,
  71314,
  71435,
  72615,
  72912,
  73116,
  73806,
  74616,
  75563,
  75647,
  75947,
  76455,
  76689,
  77379,
  77562,
  77624,
  78714,
  79370,
  79769,
  80385,
  80772,
  82896,
  83939,
  84256,
  84645,
  85920,
  86337,
  86367,
  86742,
  86951,
  87061,
  87218,
  87584,
  88132,
  88978,
  89120,
  89365,
  89521,
  89789,
  90550,
  91322,
  91973,
  92580,
  94232,
  94842,
  94854,
  95160,
  95388,
  95411,
  96052,
  96683,
  97000,
  97838,
  97

In [45]:
write_list_to_text_file("output.txt",lst[0])

Successfully wrote 3062 items to output.txt


In [48]:

# Path to the directory containing the txt files
source_directory = "../training-data/output"
# Path to the destination file
destination_file_path = './solution.txt'

# List all files in the source directory
files = os.listdir(source_directory)

# Filter to keep only .txt files
txt_files = [f for f in files if f.endswith('.txt')]

# Open the destination file
with open(destination_file_path, 'a') as dest_file:
    for txt_file in txt_files:
        # Construct the full path
        file_path = os.path.join(source_directory, txt_file)
        
        # Open the source file and append its content to the destination file
        with open(file_path, 'r') as source_file:
            content = source_file.read()
            dest_file.write(content + '\n')  # Add a newline for separation
