In [204]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the data
train_data = pd.read_csv('data/training.csv')
test_data = pd.read_csv('data/test.csv')


# Separate target from predictors
y = train_data.FraudResult
X = train_data.drop(['FraudResult'], axis=1)


X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                           random_state=0)


# train_data.head(5)


In [205]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [206]:
X_valid.head()

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy
69075,TransactionId_91392,BatchId_117569,AccountId_834,SubscriptionId_3759,CustomerId_1179,UGX,256,ProviderId_1,ProductId_3,airtime,ChannelId_3,2000.0,2000,2019-01-23T11:50:31Z,4
65709,TransactionId_119416,BatchId_24161,AccountId_4127,SubscriptionId_4240,CustomerId_4579,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1110.0,1110,2019-01-19T17:11:53Z,2
5430,TransactionId_124012,BatchId_51521,AccountId_190,SubscriptionId_1897,CustomerId_513,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,2000.0,2000,2018-11-22T06:34:07Z,2
82375,TransactionId_12251,BatchId_126898,AccountId_4840,SubscriptionId_3829,CustomerId_3212,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-1000.0,1000,2019-02-01T17:45:07Z,2
60896,TransactionId_27059,BatchId_17843,AccountId_1579,SubscriptionId_821,CustomerId_1956,UGX,256,ProviderId_6,ProductId_3,airtime,ChannelId_3,1000.0,1000,2019-01-15T12:21:53Z,2


In [207]:
categorical_cols = [cname for cname in X_train.columns if X_train[cname].dtype == "object"]

# Select numerical columns
numerical_cols = [cname for cname in X_train.columns if X_train[cname].dtype in ['int64', 'float64']]

onehot_cols = ['ProductCategory']


In [208]:
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))

])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, onehot_cols)
    ])

In [209]:
from sklearn.metrics import f1_score, recall_score,accuracy_score


def compute_imbalanced_scores(model_preds):
    f1 = f1_score(y_valid, model_preds)
    recall = recall_score(y_valid, model_preds)
    accuracy = accuracy_score(y_valid, model_preds)
    # f1 = (2*recall*accuracy)/(recall + accuracy)
    return f" Accuracy score: {accuracy} % \n F1 score: {f1} % \n Recall score: {recall} "

# Using preprocessing 

In [210]:
underscore_columns = ['TransactionId','BatchId','AccountId','SubscriptionId','CustomerId','ProviderId','ProductId','ChannelId']


for cname in X_train.columns:
    if cname in underscore_columns:
        train_number = X_train[cname].str.split('_').str[1]
        valid_number = X_valid[cname].str.split('_').str[1]
        X_train[cname] =  train_number
        X_valid[cname] = valid_number
X_train.head()

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy
66339,140085,114864,4841,3829,4827,UGX,256,4,6,financial_services,2,-50.0,50,2019-01-20T16:21:05Z,2
87279,64558,39429,3981,910,4431,UGX,256,5,3,airtime,3,1000.0,1000,2019-02-06T18:25:51Z,4
40582,134904,1673,135,3595,457,UGX,256,6,21,utility_bill,3,15000.0,16650,2018-12-26T04:17:14Z,2
58655,95030,133112,4840,3829,582,UGX,256,4,6,financial_services,2,-1000.0,1000,2019-01-12T15:57:39Z,2
87335,75383,7649,4841,3829,3328,UGX,256,4,6,financial_services,2,-50.0,50,2019-02-06T19:05:15Z,2


In [211]:
X_valid.head()

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy
69075,91392,117569,834,3759,1179,UGX,256,1,3,airtime,3,2000.0,2000,2019-01-23T11:50:31Z,4
65709,119416,24161,4127,4240,4579,UGX,256,6,10,airtime,3,1110.0,1110,2019-01-19T17:11:53Z,2
5430,124012,51521,190,1897,513,UGX,256,6,10,airtime,3,2000.0,2000,2018-11-22T06:34:07Z,2
82375,12251,126898,4840,3829,3212,UGX,256,4,6,financial_services,2,-1000.0,1000,2019-02-01T17:45:07Z,2
60896,27059,17843,1579,821,1956,UGX,256,6,3,airtime,3,1000.0,1000,2019-01-15T12:21:53Z,2


### a bit of feature engeneering

In [212]:
def create_credit_debit_cols(amount):
    if amount < 0:
        return pd.Series({'Credit': 1, 'Debit': 0})
    else:
        return pd.Series({'Credit': 0, 'Debit': 1})

In [213]:
# Apply the function to 'Amount' column to create 'Credit' and 'Debit' columns
X_train[['Credit', 'Debit']] = X_train['Amount'].apply(lambda x: create_credit_debit_cols(x))
X_valid[['Credit', 'Debit']] = X_valid['Amount'].apply(lambda x: create_credit_debit_cols(x))

In [None]:
X_train[["StartDate", "StartTime"]] = X_train["TransactionStartTime"].str.split("T", expand=True)     
X_valid[["StartDate", "StartTime"]] = X_valid["TransactionStartTime"].str.split("T", expand=True)  

X_train["StartMonth"] = X_train["StartDate"].str.split("Z").str[0].str.split('-').str[1]
X_valid["StartMonth"] = X_valid["StartDate"].str.split("Z").str[0].str.split('-').str[1]


X_train = X_train.drop('TransactionStartTime',axis=1)
X_valid= X_valid.drop('TransactionStartTime',axis=1)


# feature engeneering using mutual information showed that these two columns have no impact on FraudResult
# added temporary other features starting from BatchId
useless_columns = ['CurrencyCode','CountryCode','StartTime','StartDate','Amount','BatchId','AccountId','SubscriptionId','CustomerId','ProviderId']
X_train = X_train.drop(useless_columns,axis=1)
X_valid= X_valid.drop(useless_columns,axis=1)



In [None]:
X_train.head()

Unnamed: 0,ProductId,ProductCategory,ChannelId,Value,PricingStrategy,Credit,Debit,StartMonth
66339,6,financial_services,2,50,2,1,0,1
87279,3,airtime,3,1000,4,0,1,2
40582,21,utility_bill,3,16650,2,0,1,12
58655,6,financial_services,2,1000,2,1,0,1
87335,6,financial_services,2,50,2,1,0,2


In [None]:
X_valid.head()

Unnamed: 0,ProductId,ProductCategory,ChannelId,Value,PricingStrategy,Credit,Debit,StartMonth
69075,3,airtime,3,2000,4,0,1,1
65709,10,airtime,3,1110,2,0,1,1
5430,10,airtime,3,2000,2,0,1,11
82375,6,financial_services,2,1000,2,1,0,2
60896,3,airtime,3,1000,2,0,1,1


At this point, data has been formated but the ProductCategory still needs one hot encoding to be interpreted by a model

In [None]:
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE, SMOTENC

In [None]:
X_train.head()

Unnamed: 0,ProductId,ProductCategory,ChannelId,Value,PricingStrategy,Credit,Debit,StartMonth
66339,6,financial_services,2,50,2,1,0,1
87279,3,airtime,3,1000,4,0,1,2
40582,21,utility_bill,3,16650,2,0,1,12
58655,6,financial_services,2,1000,2,1,0,1
87335,6,financial_services,2,50,2,1,0,2


# Using only regular SMOTE : working only with numerical features

In [None]:
X_train = X_train.drop('ProductCategory',axis=1)
X_valid = X_valid.drop('ProductCategory',axis=1)

In [None]:
X_valid.head()

Unnamed: 0,ProductId,ChannelId,Value,PricingStrategy,Credit,Debit,StartMonth
69075,3,3,2000,4,0,1,1
65709,10,3,1110,2,0,1,1
5430,10,3,2000,2,0,1,11
82375,6,2,1000,2,1,0,2
60896,3,3,1000,2,0,1,1


In [None]:
sm = SMOTE(random_state=27)

X_train, y_train = sm.fit_resample(X_train, y_train)
X_valid, y_valid = sm.fit_resample(X_valid, y_valid)



In [None]:

model = RandomForestClassifier()
forest_pipeline = Pipeline(steps=[
                            # ('preprocessor', preprocessor),
                            ('model',model)
                            ])
forest_pipeline.fit(X_train,y_train)
forest_preds = forest_pipeline.predict(X_valid)

In [None]:
compute_imbalanced_scores(forest_preds)

' Accuracy score: 0.9763326002722799 % \n F1 score: 0.9758198256031669 % \n Recall score: 0.9551261912242119 '

## preprocess also test data

In [None]:
# print(test_data.columns)
# test_data.head()
for cname in test_data.columns:
    if cname in underscore_columns:
        number = test_data[cname].str.split('_').str[1]
        test_data[cname] =  number


test_data[['Credit', 'Debit']] = test_data['Amount'].apply(lambda x: create_credit_debit_cols(x))


In [None]:
test_data[["StartDate", "StartTime"]] = test_data["TransactionStartTime"].str.split("T", expand=True)     
  
test_data["StartMonth"] = test_data["StartDate"].str.split("Z").str[0].str.split('-').str[1]



test_data = test_data.drop('TransactionStartTime',axis=1)



In [None]:
# not used cols :StartTime,Amount,CurrencyCode,ProductCategory
test_data = test_data.drop(useless_columns,axis=1)
# here since we are trying out regular SMOTE, we need to drop our categorical column : ProdctCategory 
test_data = test_data.drop('ProductCategory',axis=1)





In [None]:
test_data.head()

Unnamed: 0,ProductId,ChannelId,Value,PricingStrategy,Credit,Debit,StartMonth
0,3,3,1000,4,0,1,2
1,15,3,2000,2,0,1,2
2,6,2,50,2,1,0,2
3,10,3,3000,4,0,1,2
4,6,2,60,2,1,0,2


In [None]:
# Submission
transaction_ids = test_data['TransactionId']
# test_data = test_data[numerical_cols]
test_data.head()

real_preds = forest_pipeline.predict(test_data)


preds_df =  pd.DataFrame(
    {'TransactionId': transaction_ids.map(lambda id: "TransactionId_"+id),'FraudResult':real_preds}
)
preds_df
my_submission = preds_df.to_csv("my_submission.csv",index=False)

KeyError: 'TransactionId'

### Scores


53 % :
    <li>preprocessing : remove underscores and prefixes, adding two features credit/debit but dropped some columns : ProductCategory, TransactionStartTime,CurrencyCode and Amount
    <li>SMOTE for oversampling
    <li>RandomForestClassifier (100 estimators) model

# Need to integrate one hot