In [1]:
import numpy as np    # linear algebra
import pandas as pd   # data processing


import os
# Print the full path of every file found under /kaggle/input.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [2]:
pd.options.display.max_columns = None   # no column limit when a DataFrame is displayed

# Getting the data

In [3]:
df = pd.read_csv('/kaggle/input/data-science-circle-challenge/train.csv')

In [4]:
df.shape

In [5]:
df['isFraud'].value_counts() / len(df)       #imbalance

In [6]:
df.isna().sum()

In [7]:
# Dropping columns that have nan values more than 30%
df = df.loc[:,df.isna().mean() < 0.3]

In [8]:
df.head()

### Turning the timestamp into days, months and years

In [9]:
import datetime

Trans_date = df['TransactionDT'].map(datetime.date.fromtimestamp)

In [10]:
months = [i.month for i in Trans_date]
days = [i.day for i in Trans_date]

In [11]:
# all years are the same, so we won't add this to the data
pd.Series([i.year for i in Trans_date]).value_counts()

In [12]:
df.insert(2, 'Months', months)
df.insert(2, 'Days', days)

In [13]:
df.drop(columns=['TransactionID'], inplace=True)

In [14]:
df.head()

# Split the data into numeric and categorical columns so each group can be handled separately

In [15]:
cat_cols = ['ProductCD','card4','card6','P_emaildomain','M6']     #the categorical columns 

In [16]:
df_num = df.drop(columns=cat_cols)

In [17]:
from sklearn.impute import SimpleImputer    #to deal with nan values 

In [18]:
imp_median = SimpleImputer(strategy='median')
imp_mode = SimpleImputer(strategy='most_frequent')

In [19]:
df_num.shape

In [20]:
df_num_imputed = imp_median.fit_transform(df_num)   #filling with median

In [21]:
df_num_imputed = pd.DataFrame(df_num_imputed, columns=df_num.columns, index=df_num.index)          #to_pandas

## Begin with categoricals

In [22]:
df_cat = df[cat_cols]

In [23]:
for col in df_cat.columns:
    df_cat[col] = df_cat.loc[:,col].astype('category')

In [24]:
df_cat.P_emaildomain.value_counts()

In [25]:
#converting the gmail to .com as usual
df_cat.P_emaildomain = df_cat.P_emaildomain.replace('gmail','gmail.com')

In [26]:
# Converting strange domains to "others"  (domains that appears less than 200 times)

other_domains = list(df_cat.P_emaildomain.value_counts()[df_cat.P_emaildomain.value_counts() < 200].index)

In [27]:
df_cat.P_emaildomain = df_cat.P_emaildomain.replace(other_domains,'others')

In [28]:
df_cat_impute = imp_mode.fit_transform(df_cat)  #filling with mode

In [29]:
df_cat_imputed = pd.DataFrame(df_cat_impute,columns=df_cat.columns,index=df_cat.index)     #to pandas

In [30]:
df_cat_imputed.shape

### Fit one label encoder per categorical column so the same encoding can be reused on the test data

In [31]:
from sklearn.preprocessing import LabelEncoder
le1 = LabelEncoder()
le2 = LabelEncoder()
le3 = LabelEncoder()
le4 = LabelEncoder()
le5 = LabelEncoder()

In [32]:
for le, col in zip([le1, le2, le3, le4, le5], df_cat_imputed.columns):
    df_cat_imputed[col] = pd.Series(le.fit_transform(df_cat_imputed.loc[:,col]))    #encoding the columns

In [33]:
# Mapping each label encoder to its specific column
mapping_le = dict(zip(df_cat_imputed.columns, [le1, le2, le3, le4, le5]))
mapping_le

In [34]:
#our clean dataset
visa = pd.concat([df_cat_imputed, df_num_imputed],axis=1)  

# Preparing the training data

In [35]:
from cuml.model_selection import train_test_split  

In [36]:
import cudf
visa = cudf.DataFrame(visa, columns=visa.columns)    #Converting to cudf dataframe for faster processing

In [37]:
visa = visa.astype('float32')

### Removing the outliers

In [38]:
q1 = visa[visa.isFraud==0].quantile(0.05)
q2 = visa[visa.isFraud==0].quantile(0.95)
iqr = q2-q1
mini = q1-1.5*iqr
maxi = q2+1.5*iqr
con = (visa[visa.isFraud==0]<=maxi)&(visa[visa.isFraud==0]>=mini)
con = con.all(axis=1)
visa_iqr = visa[visa.isFraud==0][con]  

In [39]:
visa_new = cudf.concat([visa_iqr,visa[visa.isFraud==1]])   

In [40]:
# Using stratify with Y because it's imbalancing data
x_train, x_test, y_train, y_test = train_test_split(visa_new.drop(columns='isFraud'), visa_new['isFraud'],
                                                    random_state=42,
                                                    test_size = 0.02, stratify=visa_new['isFraud'])

In [41]:
y_train.value_counts() / len(y_train)

# Modeling

In [42]:
from cuml.ensemble import RandomForestClassifier
forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)

In [43]:
from cuml.metrics import roc_auc_score

In [44]:
forest_clf.fit(x_train, y_train)
y_pred = forest_clf.predict(x_test)
roc_auc_score(y_test, y_pred)

In [45]:
import xgboost

In [46]:
x_clf = xgboost.XGBClassifier(tree_method = 'gpu_hist')

In [47]:
x_train.shape

In [48]:
len(cudf.Series(np.random.rand(x_train.shape[0])).astype('float32'))

### Inserting a random column to help with selecting the most important features

In [49]:
x_train.insert(0,'random',cudf.Series(np.random.rand(x_test.shape[0])).astype('float32'))

In [50]:
x_clf.fit(x_train, y_train)

In [51]:
sorted(list(zip(x_clf.feature_importances_, x_train.columns)), reverse=True)

In [57]:
fm = sorted(list(zip(x_clf.feature_importances_, x_train.columns)), reverse=True)
bad_columns = []
for i in fm[133:]:
    bad_columns.append(i[1])

### Upsampling the data

In [59]:
from imblearn.over_sampling import SMOTE      
sm = SMOTE(random_state=42)
x_train_res, y_train_res = sm.fit_resample(x_train.drop(columns=bad_columns).as_matrix(), y_train.to_array())

In [60]:
x_clf.fit(x_train_res, y_train_res)
y_pred = x_clf.predict(x_test.drop(columns=bad_columns[1:]))
roc_auc_score(y_test, y_pred)

# Now we get ready to submit

In [61]:
#The data we want to predict on
test = pd.read_csv('/kaggle/input/data-science-circle-challenge/test.csv')

In [62]:
ID = test['TransactionID']    #The first column of submission

### Preparing the test data

In [63]:
Trans_date = test['TransactionDT'].map(datetime.date.fromtimestamp)

months = [i.month for i in Trans_date]
days = [i.day for i in Trans_date]
test.insert(2, 'Months', months)
test.insert(2, 'Days', days)

In [64]:
valid_col = x_test.drop(columns=bad_columns[1:]).columns

In [65]:
test = test.loc[:,valid_col]

In [66]:
test.head()

In [67]:
test['P_emaildomain'] = ['others' if i in other_domains else i for i in test['P_emaildomain']]
test['P_emaildomain'] = test['P_emaildomain'].replace('gmail','gmail.com')

In [68]:
imp_median.fit(cudf.DataFrame(x_train_res).iloc[:,5:].as_matrix())

In [69]:
# Encoding the categorical columns
test.iloc[:,:5] = test.iloc[:,:5].astype('category')
test.iloc[:,:5] = imp_mode.transform(test.iloc[:,:5])
    
for col in test.iloc[:,:5].columns:
    test[col] = mapping_le[col].transform(test[col])

test.iloc[:,5:] = imp_median.transform(test.iloc[:,5:])

In [70]:
y_proba = x_clf.predict_proba(test)

In [71]:
y_f = pd.DataFrame(y_proba)

In [72]:
final = pd.concat([ID,y_f[1]],axis=1)
final = final.rename(columns={1: "isFraud"})

In [73]:
final.info()

In [74]:
final.to_csv('f.csv',index=False)