In [2]:
import pandas as pd 
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly import tools
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import gc
from datetime import datetime 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn import svm
import lightgbm as lgb
from lightgbm import LGBMClassifier
import xgboost as xgb
pd.set_option('display.max_columns', 100)


RFC_METRIC = 'gini'  # split criterion intended for RandomForestClassifier
NUM_ESTIMATORS = 100 # number of estimators intended for RandomForestClassifier
NO_JOBS = 4 # number of parallel jobs intended for RandomForestClassifier


# TRAIN/VALIDATION/TEST SPLIT
# VALIDATION
VALID_SIZE = 0.20 # validation fraction for a simple train_test_split
TEST_SIZE = 0.20 # test fraction for train_test_split

# CROSS-VALIDATION
NUMBER_KFOLDS = 5 # number of folds for KFold cross-validation



RANDOM_STATE = 2018 # NOTE(review): presumably the seed passed as random_state to splits/models - confirm against usage
MAX_ROUNDS = 1000 # lgb iterations
EARLY_STOP = 50 # lgb early-stopping rounds
OPT_ROUNDS = 1000  # to be adjusted based on the best validation round
VERBOSE_EVAL = 50 # how often the metric result is printed


ModuleNotFoundError: No module named 'lightgbm'

# data understanding
About this file
Training set for Credit Card Transactions
1.index - Unique Identifier for each row
2.trans_date_trans_time - Transaction DateTime
3.cc_num - Credit Card Number of Customer
4.merchant - Merchant Name
5.category - Category of Merchant
6.amt - Amount of Transaction
7.first - First Name of Credit Card Holder
8.last - Last Name of Credit Card Holder
9.gender - Gender of Credit Card Holder
10.street - Street Address of Credit Card Holder
11.city - City of Credit Card Holder
12.state - State of Credit Card Holder
13.zip - Zip of Credit Card Holder
14.lat - Latitude Location of Credit Card Holder
15.long - Longitude Location of Credit Card Holder
16.city_pop - Credit Card Holder's City Population
17.job - Job of Credit Card Holder
18.dob - Date of Birth of Credit Card Holder
19.trans_num - Transaction Number
20.unix_time - UNIX Time of transaction
21.merch_lat - Latitude Location of Merchant
22.merch_long - Longitude Location of Merchant
23.is_fraud - Fraud Flag <--- Target Class

In [None]:
df=pd.read_csv('fraudTest.csv',index_col=0)

In [None]:
df

In [None]:
df.isnull().sum()

In [None]:
df.shape

In [None]:
df['is_fraud'].value_counts()

In [None]:
df1=pd.read_csv('fraudTrain.csv',index_col=0)

In [None]:
df1

# data dictionary

In [None]:
df.info()

In [None]:
df1.shape

In [None]:
df1.shape

# Variable Categorization

In [None]:
df1_num=df1.select_dtypes(include=[np.number])

In [None]:
df1_num

In [None]:
df1_cat=df1.select_dtypes(exclude=[np.number])

In [None]:
df1_cat

# count of missing/ null values, redundant columns

In [None]:
df1.isnull().sum()

In [None]:
df1.head(2)

# Data Exploration

# relationship between variables

In [None]:
# Pairwise correlation of the numeric columns only. df1 also holds text columns
# (merchant, category, ...), and DataFrame.corr() raises on those with pandas >= 2.0.
df1.select_dtypes(include=[np.number]).corr()

In [None]:
plt.rcParams['figure.figsize']=(20,20)

In [None]:
# Annotated heatmap of the numeric-column correlations. Text columns are excluded
# explicitly because DataFrame.corr() raises on them with pandas >= 2.0.
sns.heatmap(df1.select_dtypes(include=[np.number]).corr(), annot=True);

# 1.Multi collinearity

In [None]:
df1_n=df1_num.drop(['cc_num','is_fraud'],axis=1)
df1_n

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# Variance inflation factor of every numeric feature in df1_n.
X = df1_n
# variance_inflation_factor regresses one column on all the others WITHOUT adding
# an intercept itself, so the design matrix needs an explicit constant column;
# otherwise the reported VIFs are uncentred and overstated.
design = add_constant(X)
vif = pd.DataFrame()
vif["VIF_Factor"] = [variance_inflation_factor(design.values, col_idx)
                     for col_idx in range(design.shape[1])]
vif["Features"] = design.columns
# The intercept is only a helper column, not a feature to report.
vif = vif[vif["Features"] != "const"].reset_index(drop=True)
vif

In [None]:
vif.sort_values('VIF_Factor',ascending=False)

# 2. Distribution of variables
    

In [None]:
df1_num.drop(['cc_num','is_fraud'],axis=1).hist()

# 3.Presence of outliers and its treatment

In [None]:
df1_n.columns

In [None]:
# Numeric features checked for outliers (the columns of df1_n).
outlier_cols = ['amt', 'zip', 'lat', 'long', 'city_pop', 'unix_time', 'merch_lat',
                'merch_long']

n_rows, n_cols = 2, 4  # 2 x 4 grid: one panel per feature listed above

fig = plt.figure(figsize=(14, 10))

for position, col in enumerate(outlier_cols, start=1):
    plt.subplot(n_rows, n_cols, position)
    plt.title('{}, subplot: {}{}{}'.format(col, n_rows, n_cols, position))
    plt.xlabel(col)
    # Pass the series as x= explicitly: a positional first argument is read as `x`
    # by older seaborn releases and as `data` by newer ones, which changes the
    # orientation of the box.
    sns.boxplot(x=df1_n[col], color='red')

plt.show()

# univariate analysis for numerical columns

In [None]:
# Card holder latitude: density curve (left) and violin plot (right).
fig, (ax_kde, ax_violin) = plt.subplots(1, 2, figsize=(10, 4))

# The left panel is a KDE, so it is titled as a density plot rather than a box plot.
sns.kdeplot(data=df1_n['lat'], ax=ax_kde)
ax_kde.set_title('credit card holder latitude density plot')

sns.violinplot(data=df1_n['lat'], ax=ax_violin)
ax_violin.set_title('credit card holder latitude violin plot')

plt.show()

In [None]:
# Card holder longitude: density curve (left) and violin plot (right).
fig, (ax_kde, ax_violin) = plt.subplots(1, 2, figsize=(10, 4))

# The left panel is a KDE, so it is titled as a density plot rather than a box plot.
sns.kdeplot(data=df1_n['long'], ax=ax_kde)
ax_kde.set_title('credit card holder longitude density plot')

sns.violinplot(data=df1_n['long'], ax=ax_violin)
ax_violin.set_title('credit card holder longitude violin plot')

plt.show()

In [None]:
# Transaction amount: density curve (left) and violin plot (right).
fig, (ax_kde, ax_violin) = plt.subplots(1, 2, figsize=(10, 4))

# The left panel is a KDE, so it is titled as a density plot rather than a box plot.
sns.kdeplot(data=df1_n['amt'], ax=ax_kde)
ax_kde.set_title('Transaction Amount density plot')

# Both panels must show 'amt'; the violin plot used to draw the 'lat' column
# under a transaction-amount title.
sns.violinplot(data=df1_n['amt'], ax=ax_violin)
ax_violin.set_title('transaction amount violin plot')

plt.show()

In [None]:
# Merchant latitude: density curve (left) and violin plot (right).
fig, (ax_kde, ax_violin) = plt.subplots(1, 2, figsize=(10, 4))

# The left panel is a KDE, so it is titled as a density plot rather than a box plot.
sns.kdeplot(data=df1_n['merch_lat'], ax=ax_kde)
ax_kde.set_title('merchant latitude density plot')

sns.violinplot(data=df1_n['merch_lat'], ax=ax_violin)
ax_violin.set_title('merchant latitude violin plot')

plt.show()

In [None]:
# Merchant longitude: density curve (left) and violin plot (right).
fig, (ax_kde, ax_violin) = plt.subplots(1, 2, figsize=(10, 4))

# The left panel is a KDE, so it is titled as a density plot rather than a box plot.
sns.kdeplot(data=df1_n['merch_long'], ax=ax_kde)
ax_kde.set_title('merchant longitude density plot')

sns.violinplot(data=df1_n['merch_long'], ax=ax_violin)
ax_violin.set_title('merchant longitude violin plot')

plt.show()

In [None]:
# Transaction UNIX time: density curve (left) and violin plot (right).
fig, (ax_kde, ax_violin) = plt.subplots(1, 2, figsize=(10, 4))

# The left panel is a KDE, so it is titled as a density plot rather than a box plot.
sns.kdeplot(data=df1_n['unix_time'], ax=ax_kde)
ax_kde.set_title('unix time density plot')

sns.violinplot(data=df1_n['unix_time'], ax=ax_violin)
ax_violin.set_title('unix time violin plot')

plt.show()

In [None]:
# City population of the card holder: density curve (left) and violin plot (right).
fig, (ax_kde, ax_violin) = plt.subplots(1, 2, figsize=(10, 4))

# The left panel is a KDE, so it is titled as a density plot rather than a box plot.
sns.kdeplot(data=df1_n['city_pop'], ax=ax_kde)
ax_kde.set_title('city population density plot')

sns.violinplot(data=df1_n['city_pop'], ax=ax_violin)
ax_violin.set_title('city population violin plot')

plt.show()

In [None]:
df1_n.columns

# univariate analysis for categorical columns

# top 10 merchants out of 800

In [None]:
df1_cat['merchant'].value_counts().head(10)

In [None]:
# Top 10 merchants by number of transactions.
# The counts are kept in their own variable: assigning them to `df` would
# overwrite the test set loaded from fraudTest.csv that later cells still need.
merchant_counts = df1["merchant"].value_counts().head(10)

trace = go.Bar(
    x=merchant_counts.index, y=merchant_counts.values,
    name="Top 10 merchants by number of transactions",
    marker=dict(color="Red"),
    text=merchant_counts.values
)
data = [trace]
layout = dict(title='merchant univariate analysis',
              xaxis=dict(title='merchant', showticklabels=True),
              yaxis=dict(title='count'),
              hovermode='closest', width=600)
fig = dict(data=data, layout=layout)
iplot(fig, filename='class')

# category of merchants

In [None]:
df1_cat['category'].value_counts().head()

In [None]:
# Top 10 merchant categories by number of transactions.
# The counts are kept in their own variable: assigning them to `df` would
# overwrite the test set loaded from fraudTest.csv that later cells still need.
category_counts = df1["category"].value_counts().head(10)

trace = go.Bar(
    x=category_counts.index, y=category_counts.values,
    name="Top 10 merchant categories by number of transactions",
    marker=dict(color="Red"),
    text=category_counts.values
)
data = [trace]
layout = dict(title='category univariate analysis',
              xaxis=dict(title='category', showticklabels=True),
              yaxis=dict(title='count'),
              hovermode='closest', width=600)
fig = dict(data=data, layout=layout)
iplot(fig, filename='class')

# top 10 cities of credit card holder

In [None]:
df1_cat['city'].value_counts().head(10)

In [None]:
# Top 10 card holder cities by number of transactions.
# The counts are kept in their own variable: assigning them to `df` would
# overwrite the test set loaded from fraudTest.csv that later cells still need.
city_counts = df1["city"].value_counts().head(10)

trace = go.Bar(
    x=city_counts.index, y=city_counts.values,
    name="Top 10 card holder cities by number of transactions",
    marker=dict(color="Red"),
    text=city_counts.values
)
data = [trace]
layout = dict(title='city univariate analysis',
              xaxis=dict(title='city', showticklabels=True),
              yaxis=dict(title='count'),
              hovermode='closest', width=600)
fig = dict(data=data, layout=layout)
iplot(fig, filename='class')

# top 10 state of credit card holder

In [None]:
df1_cat['state'].value_counts().head(10)

In [None]:
# Top 10 card holder states by number of transactions.
# The counts are kept in their own variable: assigning them to `df` would
# overwrite the test set loaded from fraudTest.csv that later cells still need.
state_counts = df1["state"].value_counts().head(10)

trace = go.Bar(
    x=state_counts.index, y=state_counts.values,
    name="Top 10 card holder states by number of transactions",
    marker=dict(color="Red"),
    text=state_counts.values
)
data = [trace]
layout = dict(title='state univariate analysis',
              xaxis=dict(title='state', showticklabels=True),
              yaxis=dict(title='count'),
              hovermode='closest', width=600)
fig = dict(data=data, layout=layout)
iplot(fig, filename='class')

# top 10 jobs of credit card holder

In [None]:
df1_cat['job'].value_counts().head(50)

In [None]:
# Top 10 card holder jobs by number of transactions.
# The counts are kept in their own variable: assigning them to `df` would
# overwrite the test set loaded from fraudTest.csv that later cells still need.
job_counts = df1["job"].value_counts().head(10)

trace = go.Bar(
    x=job_counts.index, y=job_counts.values,
    name="Top 10 card holder jobs by number of transactions",
    marker=dict(color="Red"),
    text=job_counts.values
)
data = [trace]
layout = dict(title='job univariate analysis',
              xaxis=dict(title='job', showticklabels=True),
              yaxis=dict(title='count'),
              hovermode='closest', width=600)
fig = dict(data=data, layout=layout)
iplot(fig, filename='class')

# statistical insights

In [None]:
df1.describe()

In [None]:
# Standard deviation of the numeric columns only; DataFrame.std() raises on the
# text columns of df1 with pandas >= 2.0.
df1.select_dtypes(include=[np.number]).std()

# checking the data imbalance ratio

In [None]:
target0 = df1.loc[df1['is_fraud']==0]
target1 = df1.loc[df1['is_fraud']==1]

In [None]:
# Class balance of the target: absolute counts and share of all rows in percent.
# Both are combined into one table because a cell only displays its last expression.
fraud_counts = df1['is_fraud'].value_counts()
pd.DataFrame({'count': fraud_counts,
              'percent': (fraud_counts / len(df1['is_fraud'])) * 100})

In [None]:
round(len(target0)/len(target1),2)

In [None]:
m=df1['is_fraud'].value_counts()

In [None]:
# Pie chart of the is_fraud class shares, built from `m` (the value counts computed in the previous cell).
import plotly.express as px
import numpy  # NOTE(review): not used in this cell - confirm nothing later relies on it before removing
 
# Slice sizes are the class counts; slice labels are the class values from m's index
random_x = m.values
names = m.index
 
fig = px.pie(values=random_x, names=names,title='Credit card transaction -data imbalance')
fig.show()

In [None]:
# Number of transactions per target class (0 = not fraud, 1 = fraud).
# The counts are kept in their own variable: assigning them to `df` would
# overwrite the test set loaded from fraudTest.csv that later cells still need.
class_counts = df1["is_fraud"].value_counts()

trace = go.Bar(
    x=class_counts.index, y=class_counts.values,
    name="Credit Card Fraud Class - data unbalance (Not fraud = 0, Fraud = 1)",
    marker=dict(color="Red"),
    text=class_counts.values
)
data = [trace]
layout = dict(title='Credit Card Fraud Class - data unbalance (Notfraud= 0,Fraud =1)',
              xaxis=dict(title='Class', showticklabels=True),
              yaxis=dict(title='Number of transactions'),
              hovermode='closest', width=600)
fig = dict(data=data, layout=layout)
iplot(fig, filename='class')

# inference:
    The df1 dataframe (the training data) is highly imbalanced:
    'fraud transactions make up about 0.57% of the rows and non-fraud transactions about 99.4%'

We will separately analyse the data based on the target variable for a better understanding.

# bivariate analysis 

# gender percentage

In [None]:
df_gender=(df1['gender'].value_counts()/len(df1['gender']))*100

In [None]:
# Number of transactions per card holder gender.
# The counts are kept in their own variable: assigning them to `df` would
# overwrite the test set loaded from fraudTest.csv, and the dob/age cells
# further down read df['dob'].
gender_counts = df1["gender"].value_counts()

trace = go.Bar(
    x=gender_counts.index, y=gender_counts.values,
    name="Credit Card Transaction - Gender Wise Analysis",
    marker=dict(color="Green"),
    text=gender_counts.values
)
data = [trace]
layout = dict(title="Credit Card Transaction - Gender Wise Analysis",
              xaxis=dict(title='Gender', showticklabels=True),
              yaxis=dict(title='Number of transactions'),
              hovermode='closest', width=600)
fig = dict(data=data, layout=layout)
iplot(fig, filename='class')

In [None]:
pd.crosstab(target0['gender'],target0['is_fraud'])

In [None]:
target0['gender'].value_counts()

In [None]:
target0.groupby("gender")["is_fraud"].count()

In [None]:
(target0.groupby("gender")["is_fraud"].count())/(len(df1['is_fraud']))*100

In [None]:
(target1.groupby("gender")["is_fraud"].count())/(len(df1['is_fraud']))*100

In [None]:
plt.rcParams['figure.figsize']=(15,5)

In [None]:
plt.subplot(121)
sns.countplot(x='is_fraud',hue='gender',data=target0,palette='Set2')
plt.title('Gender distribution in not fraud transactions')
plt.ylabel('Not fraud transactions')
plt.subplot(122)
sns.countplot(x='is_fraud',hue='gender',data=target1,palette='rocket')
plt.title('Gender distribution in fraud transactions')
plt.ylabel('Fraud transactions')

# insights 

*it seems that female clients made more transactions than male clients
*54.45% of all transactions are non-fraud transactions by female clients, while 44.96% are non-fraud transactions by male clients
*0.28% of all transactions are fraud transactions by female clients and 0.29% are fraud transactions by male clients

# binning dob 

In [None]:
import datetime

In [None]:
df1['dob']=pd.to_datetime(df1['dob'])

In [None]:
df.head(2)

In [None]:
df['dob']=pd.to_datetime(df['dob'])

In [None]:
df1['dob']

In [None]:
df1['year']=pd.DatetimeIndex(df1['dob']).year

In [None]:
df['year']=pd.DatetimeIndex(df['dob']).year

In [None]:
# Despite its name, 'year' holds the card holder's approximate age (reference year 2022)
# from here on. It is derived straight from dob instead of from the column's own
# previous value, so re-running this cell gives the same result.
df1['year'] = 2022 - pd.to_datetime(df1['dob']).dt.year

In [None]:
# Despite its name, 'year' holds the card holder's approximate age (reference year 2022)
# from here on. It is derived straight from dob instead of from the column's own
# previous value, so re-running this cell gives the same result.
df['year'] = 2022 - pd.to_datetime(df['dob']).dt.year

In [None]:
df1.info()

In [None]:
df1['year'].unique()

In [None]:
def function_mrp(a):
    """Return the age-group label for an age ``a`` given in years.

    Upper bounds are inclusive: up to 25 is 'Very young age', above 25 up to 35
    is 'Young age', above 35 up to 45 is 'Middle age'. Every other value falls
    through to 'Senior citizen'.
    """
    upper_bounds = (
        (25, 'Very young age'),
        (35, 'Young age'),
        (45, 'Middle age'),
    )
    # Bands are checked in ascending order, so the first bound that fits wins.
    for limit, label in upper_bounds:
        if a <= limit:
            return label
    return 'Senior citizen'

In [None]:
df1['age_group'] = df1['year'].apply(function_mrp)

In [None]:
df['age_group'] = df['year'].apply(function_mrp)

In [None]:
df.head(2)

In [None]:
target0 = df1.loc[df1['is_fraud']==0]
target1 = df1.loc[df1['is_fraud']==1]

In [None]:
df1

In [None]:
target0

In [None]:
plt.figure(figsize=(15,6))
plt.subplot(121)
sns.countplot(x='is_fraud',hue='age_group',data=target0,palette='Set2')
plt.title('Age group vs not fraud transactions')
plt.ylabel('Not fraud transactions')
plt.subplot(122)
sns.countplot(x='is_fraud',hue='age_group',data=target1,palette='Set2')
plt.title('Age group vs Fraud transactions')
plt.ylabel('Fraud transactions')
plt.show()

Insights
*senior citizens (above 45 under the age bins used here) are the largest group among both fraud and non-fraud transactions
*the senior citizen age group also has the most fraud transactions
*while the middle age group and the very young age group have fewer fraud transactions

# job distribution based on target0 and target1

In [None]:
# Top 10 card holder jobs among the non-fraud transactions (target0).
# The counts are kept in their own variable: assigning them to `df` would
# overwrite the test set that later cells still need.
job_counts_not_fraud = target0["job"].value_counts().head(10)

trace = go.Bar(
    x=job_counts_not_fraud.index, y=job_counts_not_fraud.values,
    name="Credit Card Transaction - job Wise Analysis target0",
    marker=dict(color="blue"),
    text=job_counts_not_fraud.values
)
data = [trace]
layout = dict(title="Credit Card Transaction - job Wise Analysis target0",
              xaxis=dict(title='job', showticklabels=True),
              yaxis=dict(title='Number of transactions'),
              hovermode='closest', width=600)
fig = dict(data=data, layout=layout)
iplot(fig, filename='class')

#inference
1.jobs such as film editor, agriculture consultant and financial trader appear most often among the non-fraud credit card transactions

In [None]:
# Top 10 card holder jobs among the fraud transactions (target1).
# The counts are kept in their own variable: assigning them to `df` would
# overwrite the test set that later cells still need.
job_counts_fraud = target1["job"].value_counts().head(10)

trace = go.Bar(
    x=job_counts_fraud.index, y=job_counts_fraud.values,
    name="Credit Card Transaction - job Wise Analysis target1",
    marker=dict(color="red"),
    text=job_counts_fraud.values
)
data = [trace]
layout = dict(title="job category wise analysis  for fraud transaction",
              xaxis=dict(title='job categories', showticklabels=True),
              yaxis=dict(title='Number of transactions'),
              hovermode='closest', width=600)
fig = dict(data=data, layout=layout)
iplot(fig, filename='class')

# inference
1.jobs such as materials engineer, podiatrist and energy engineer appear most often among the fraud credit card transactions

# merchant category vs fraud or not fraud

In [None]:
df_category=pd.crosstab(df1['category'],df1['is_fraud'])

In [None]:
df_c=df_category.reset_index()

In [None]:
df_c.rename(columns={'is_fraud':'index',0:'not_fraud',1:'fraud'},inplace=True)

In [None]:
df_c=df_c.sort_values('fraud',ascending=False)

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Non-fraud (left) and fraud (right) transaction counts per merchant category,
# read from df_c (the category crosstab built in the cells above).
# Each panel gets its own y-axis: fraud is a tiny share of all transactions, so
# on a shared axis the fraud bars are squashed flat against the non-fraud scale.
fig = make_subplots(rows=1, cols=2, shared_yaxes=False,
                    subplot_titles=('Merchant category vs not fraud counts',
                                    "Merchant  category vs fraud counts"))

fig.add_trace(go.Bar(x=df_c['category'], y=df_c['not_fraud'],
                     marker=dict(color=df_c['not_fraud'], coloraxis="coloraxis")),
              1, 1)

fig.add_trace(go.Bar(x=df_c['category'], y=df_c['fraud'],
                     marker=dict(color=df_c['fraud'], coloraxis="coloraxis")),
              1, 2)

fig['layout']['xaxis']['title'] = 'merchant categories'
fig['layout']['xaxis2']['title'] = 'merchant categories'
fig['layout']['yaxis']['title'] = 'not fraud counts'
fig['layout']['yaxis2']['title'] = 'fraud counts'

# NOTE(review): both traces share one colour axis, so fraud bars all sit at the
# low end of the colour scale - confirm whether that is intended.
fig.update_layout(coloraxis=dict(colorscale='viridis'), showlegend=False)
fig.show()

# insights 
*merchant categories such as gas_transport, home, grocery_pos and shopping_pos have a high number of non-fraud transactions
*merchant categories such as grocery_pos, shopping_net, misc_net and shopping_pos have a high number of fraud transactions

# cities of credit card holder vs fraud or not fraud counts 

In [None]:
df1['city'].value_counts().head(10)

In [None]:
df_city=pd.crosstab(df1['city'],df1['is_fraud'])

In [None]:
df_city.reset_index(inplace=True)

In [None]:
df_city.rename(columns={0:'not_fraud',1:'fraud'},inplace=True)

In [None]:
df_city=df_city.sort_values('fraud',ascending=False)

In [None]:
df_city = df_city.head(10)

In [None]:
df_city

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Non-fraud (left) and fraud (right) transaction counts for the cities held in
# df_city (the city crosstab reduced to 10 rows in the cells above).
# Each panel gets its own y-axis: fraud is a tiny share of all transactions, so
# on a shared axis the fraud bars are squashed flat against the non-fraud scale.
fig = make_subplots(rows=1, cols=2, shared_yaxes=False,
                    subplot_titles=('cities of credit card holder vs not fraud counts',
                                    "cities of credit card holder vs fraud counts"))

fig.add_trace(go.Bar(x=df_city['city'], y=df_city['not_fraud'],
                     marker=dict(color=df_city['not_fraud'], coloraxis="coloraxis")),
              1, 1)

fig.add_trace(go.Bar(x=df_city['city'], y=df_city['fraud'],
                     marker=dict(color=df_city['fraud'], coloraxis="coloraxis")),
              1, 2)

fig['layout']['xaxis']['title'] = 'cities of credit card holder'
fig['layout']['xaxis2']['title'] = 'cities of credit card holder'
fig['layout']['yaxis']['title'] = 'not fraud counts'
fig['layout']['yaxis2']['title'] = 'fraud counts'

# NOTE(review): both traces share one colour axis, so fraud bars all sit at one
# end of the colour scale - confirm whether that is intended.
fig.update_layout(coloraxis=dict(colorscale='Bluered_r'), showlegend=False)
fig.show()

# insights:
1.we can clearly see that San Antonio has the highest number of non-fraud transactions, whereas Houston has the highest number of fraud transactions

In [None]:
# Top 10 card holder cities among the non-fraud transactions (target0).
# The counts are kept in their own variable: assigning them to `df` would
# overwrite the test set that later cells still need.
city_counts_not_fraud = target0["city"].value_counts().head(10)

trace = go.Bar(
    x=city_counts_not_fraud.index, y=city_counts_not_fraud.values,
    name='cities of credit card holder vs not fraud counts',
    marker=dict(color="green"),
    text=city_counts_not_fraud.values
)
data = [trace]
layout = dict(title='cities of credit card holder vs not fraud counts',
              xaxis=dict(title='city', showticklabels=True),
              yaxis=dict(title='Number of transactions'),
              hovermode='closest', width=600)
fig = dict(data=data, layout=layout)
iplot(fig, filename='class')

In [None]:
# Top 10 card holder cities among the fraud transactions (target1).
# The counts are kept in their own variable: assigning them to `df` would
# overwrite the test set that later cells still need.
city_counts_fraud = target1["city"].value_counts().head(10)

trace = go.Bar(
    x=city_counts_fraud.index, y=city_counts_fraud.values,
    name='cities of credit card holder vs  fraud counts',
    marker=dict(color="red"),
    text=city_counts_fraud.values
)
data = [trace]
layout = dict(title='cities of credit card holder vs fraud counts',
              xaxis=dict(title='city', showticklabels=True),
              yaxis=dict(title='Number of transactions'),
              hovermode='closest', width=600)
fig = dict(data=data, layout=layout)
iplot(fig, filename='class')

# top 10 merchants vs fraud transactions and not fraud transactions

In [None]:
plt.rcParams['figure.figsize']=(15,8)

In [None]:
# Top 10 merchants among the non-fraud transactions (target0).
# The counts are kept in their own variable: assigning them to `df` would
# overwrite the test set that later cells still need.
merchant_counts_not_fraud = target0["merchant"].value_counts().head(10)

trace = go.Bar(
    x=merchant_counts_not_fraud.index, y=merchant_counts_not_fraud.values,
    name=' top 10 merchants vs not fraud transactions',
    marker=dict(color="green"),
    text=merchant_counts_not_fraud.values
)
data = [trace]
layout = dict(title='top 10 merchants vs not fraud transactions',
              xaxis=dict(title='merchant', showticklabels=True),
              yaxis=dict(title='Number of non fraud transactions'),
              hovermode='closest', width=600)
fig = dict(data=data, layout=layout)
iplot(fig, filename='class')

In [None]:
# Top 10 merchants among the fraud transactions (target1).
# The counts are kept in their own variable: assigning them to `df` would
# overwrite the test set that later cells still need.
merchant_counts_fraud = target1["merchant"].value_counts().head(10)

trace = go.Bar(
    x=merchant_counts_fraud.index, y=merchant_counts_fraud.values,
    name=' top 10 merchants vs  fraud transactions',
    marker=dict(color="Red"),
    text=merchant_counts_fraud.values
)
data = [trace]
layout = dict(title='top 10 merchants vs  fraud transactions',
              xaxis=dict(title='merchant', showticklabels=True),
              yaxis=dict(title='Number of fraud transactions'),
              hovermode='closest', width=600)
fig = dict(data=data, layout=layout)
iplot(fig, filename='class')

# insights 

*merchants like kilback llc, schumm plc and cormier llc have a high number of non-fraud transactions
*merchants like rau and sons, kozey boehm and cormier llc have a high number of fraud transactions

# top 10 states vs fraud transactions and not fraud transactions

In [None]:
# Top 10 card holder states among the non-fraud transactions (target0).
# The counts are kept in their own variable: assigning them to `df` would
# overwrite the test set that later cells still need.
state_counts_not_fraud = target0["state"].value_counts().head(10)

# The trace name and x-axis title say "states": this chart plots states,
# but both labels previously read "merchant(s)".
trace = go.Bar(
    x=state_counts_not_fraud.index, y=state_counts_not_fraud.values,
    name='top 10 states vs not fraud transactions',
    marker=dict(color="green"),
    text=state_counts_not_fraud.values
)
data = [trace]
layout = dict(title='top 10 states vs not fraud transactions',
              xaxis=dict(title='States', showticklabels=True),
              yaxis=dict(title='Number of non fraud transactions'),
              hovermode='closest', width=600)
fig = dict(data=data, layout=layout)
iplot(fig, filename='class')

In [None]:
# Top 10 card holder states among the fraud transactions (target1).
# The counts are kept in their own variable: assigning them to `df` would
# overwrite the test set, and the timestamp cells further down read
# df['trans_date_trans_time'].
state_counts_fraud = target1["state"].value_counts().head(10)

# The trace name says "states": this chart plots states, but the name
# previously read "merchants".
trace = go.Bar(
    x=state_counts_fraud.index, y=state_counts_fraud.values,
    name='top 10 states vs fraud transactions',
    marker=dict(color="Red"),
    text=state_counts_fraud.values
)
data = [trace]
layout = dict(title='Top 10 states vs  Fraud transactions',
              xaxis=dict(title='States', showticklabels=True),
              yaxis=dict(title='Number of fraud transactions'),
              hovermode='closest', width=600)
fig = dict(data=data, layout=layout)
iplot(fig, filename='class')

# insights 

*states TX, NY and PA have high fraud transactions as well as high non-fraud transactions

# amt vs non fraud transaction distribution and amt vs fraud transaction distribution

In [None]:
# Distribution of the transaction amount for non-fraud (left, Target 0) and
# fraud (right, Target 1) transactions.
sns.set(style="darkgrid")
fig, (ax_not_fraud, ax_fraud) = plt.subplots(1, 2, figsize=(40, 20))

panels = (
    (ax_not_fraud, target0['amt'], 'g', 'Non fraud transactions'),  # Target 0
    (ax_fraud, target1['amt'], 'r', 'Fraud Transactions'),          # Target 1
)
for ax, amounts, colour, y_label in panels:
    # histplot(kde=True, stat='density') replaces the deprecated sns.distplot
    # and draws the same histogram-plus-density picture. bins=50 matches the
    # cap distplot applied to its bin count.
    sns.histplot(amounts, color=colour, kde=True, stat='density', bins=50, ax=ax)
    ax.set_xlabel('amt', fontsize=30, fontweight="bold")
    ax.set_ylabel(y_label, fontsize=30, fontweight="bold")
    ax.tick_params(axis='x', labelrotation=90, labelsize=30)
    ax.tick_params(axis='y', labelrotation=360, labelsize=30)

plt.show();

In [None]:
target0['amt'].std()

In [None]:
target1['amt'].std()

In [None]:
target0['amt'].skew() # skewness of the non-fraud amounts; author's note: highly positively skewed

# insight

Dist. plot highlights the curve shape which is wider for Target 1 in comparison to Target 0 which is narrower with well-defined edges.

In [None]:
df1['trans_date_trans_time']

In [None]:
df1['trans_date_trans_time']=pd.to_datetime(df1['trans_date_trans_time'])

In [None]:
df['trans_date_trans_time']=pd.to_datetime(df['trans_date_trans_time'])

In [None]:
# Calendar features derived from the transaction timestamp of the training set.
txn_time = df1['trans_date_trans_time']

df1['weekday_no'] = txn_time.dt.dayofweek
df1['week_day'] = txn_time.dt.day_name()
# Series.dt.week was removed in pandas 2.0; isocalendar().week is its documented
# replacement. It returns UInt32, so cast back to the int64 that .dt.week produced
# (assumes no missing timestamps - the cast raises on NaT).
df1['week_no'] = txn_time.dt.isocalendar().week.astype('int64')
df1['day_no'] = txn_time.dt.day
df1['min_day'] = txn_time.dt.minute
df1['hr_day'] = txn_time.dt.hour
df1['month_name'] = txn_time.dt.month_name()
df1['month'] = txn_time.dt.month
# NOTE: 'year' held the age derived from dob until here; it now becomes the
# transaction year (age_group was already computed from the age above).
df1['year'] = txn_time.dt.year
df1['year_dayno'] = txn_time.dt.dayofyear

In [None]:
# Calendar features derived from the transaction timestamp of the test set.
txn_time = df['trans_date_trans_time']

df['weekday_no'] = txn_time.dt.dayofweek
df['week_day'] = txn_time.dt.day_name()
# Series.dt.week was removed in pandas 2.0; isocalendar().week is its documented
# replacement. It returns UInt32, so cast back to the int64 that .dt.week produced
# (assumes no missing timestamps - the cast raises on NaT).
df['week_no'] = txn_time.dt.isocalendar().week.astype('int64')
df['day_no'] = txn_time.dt.day
df['min_day'] = txn_time.dt.minute
df['hr_day'] = txn_time.dt.hour
df['month_name'] = txn_time.dt.month_name()
df['month'] = txn_time.dt.month
# NOTE: 'year' held the age derived from dob until here; it now becomes the
# transaction year.
df['year'] = txn_time.dt.year
df['year_dayno'] = txn_time.dt.dayofyear

In [None]:
df.head(2)

In [None]:
df1

In [None]:
df1['week_day'].value_counts()

In [None]:
df1['month_name'].value_counts().sort_values(ascending=False)

In [None]:
# Number of transactions per weekday (all transactions of the training set).
# The counts are kept in their own variable: assigning them to `df` would
# overwrite the test set that was just given its date features.
weekday_counts = df1["week_day"].value_counts()

trace = go.Bar(
    x=weekday_counts.index, y=weekday_counts.values,
    name=' week days vs  no of transactions',
    marker=dict(color="Red"),
    text=weekday_counts.values
)
data = [trace]
# The y-axis counts every transaction in df1, not only the fraudulent ones.
layout = dict(title='week days vs no of transactions',
              xaxis=dict(title='weekdays', showticklabels=True),
              yaxis=dict(title='Number of transactions'),
              hovermode='closest', width=600)
fig = dict(data=data, layout=layout)
iplot(fig, filename='class')

In [None]:
# Number of transactions per month (all transactions of the training set), most frequent first.
# The counts are kept in their own variable: assigning them to `df` would
# overwrite the test set that was just given its date features.
month_counts = df1['month_name'].value_counts().sort_values(ascending=False)

trace = go.Bar(
    x=month_counts.index, y=month_counts.values,
    name=' month vs  no of transactions',
    marker=dict(color="Red"),
    text=month_counts.values
)
data = [trace]
# The y-axis counts every transaction in df1, not only the fraudulent ones.
layout = dict(title='month vs no of transactions',
              xaxis=dict(title='month', showticklabels=True),
              yaxis=dict(title='Number of transactions'),
              hovermode='closest', width=600)
fig = dict(data=data, layout=layout)
iplot(fig, filename='class')

# Bivariate Analysis : Numerical and Categorical wrt target variables

In [None]:
df1

# merchant vs amount transaction vs target variable

In [None]:
df_merch_amt=df1.pivot_table(index='merchant',columns='is_fraud',values='amt',aggfunc='sum')

In [None]:
df_merch_amt.reset_index(inplace=True)

In [None]:
df_merch_amt.rename(columns={0:'not_fraud',1:'fraud'},inplace=True)

In [None]:
# Keep the 10 merchants with the largest fraud amount. A plain head(10) returns
# the first 10 rows in the pivot table's index order, not the top merchants.
# Sorting first matches the job / category / state cells below.
df_merch_amt = df_merch_amt.sort_values('fraud', ascending=False).head(10)

In [None]:
df_merch_amt

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Summed non-fraud (left) and fraud (right) transaction amount for the merchants
# held in df_merch_amt (the pivot table built in the cells above).
# Each panel gets its own y-axis: fraud is a tiny share of all transactions, so
# on a shared axis the fraud bars are squashed flat against the non-fraud scale.
fig = make_subplots(rows=1, cols=2, shared_yaxes=False,
                    subplot_titles=('merchant non-fraud transaction amount',
                                    "merchant fraud transaction amount"))

fig.add_trace(go.Bar(x=df_merch_amt['merchant'], y=df_merch_amt['not_fraud'],
                     marker=dict(color=df_merch_amt['not_fraud'], coloraxis="coloraxis")),
              1, 1)

fig.add_trace(go.Bar(x=df_merch_amt['merchant'], y=df_merch_amt['fraud'],
                     marker=dict(color=df_merch_amt['fraud'], coloraxis="coloraxis")),
              1, 2)

fig['layout']['xaxis']['title'] = 'merchant'
fig['layout']['xaxis2']['title'] = 'merchant'
fig['layout']['yaxis']['title'] = 'not fraud transaction amount'
fig['layout']['yaxis2']['title'] = 'fraud transaction amount'

fig.update_layout(coloraxis=dict(colorscale='Bluered_r'), showlegend=False)
fig.show()

# job vs amount transaction vs target variable

In [None]:
df_job_amt=df1.pivot_table(index='job',columns='is_fraud',values='amt',aggfunc='sum')
df_job_amt.reset_index(inplace=True)
df_job_amt.rename(columns={0:'not_fraud',1:'fraud'},inplace=True)
df_job_amt=df_job_amt.sort_values('fraud',ascending=False).head(10)
df_job_amt

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Summed non-fraud (left) and fraud (right) transaction amount for the jobs
# held in df_job_amt (top 10 by fraud amount, built in the previous cell).
# Each panel gets its own y-axis: fraud is a tiny share of all transactions, so
# on a shared axis the fraud bars are squashed flat against the non-fraud scale.
fig = make_subplots(rows=1, cols=2, shared_yaxes=False,
                    subplot_titles=('job non-fraud transaction amount',
                                    "job fraud transaction amount"))

fig.add_trace(go.Bar(x=df_job_amt['job'], y=df_job_amt['not_fraud'],
                     marker=dict(color=df_job_amt['not_fraud'], coloraxis="coloraxis")),
              1, 1)

fig.add_trace(go.Bar(x=df_job_amt['job'], y=df_job_amt['fraud'],
                     marker=dict(color=df_job_amt['fraud'], coloraxis="coloraxis")),
              1, 2)

fig['layout']['xaxis']['title'] = 'jobs'
fig['layout']['xaxis2']['title'] = 'jobs'
fig['layout']['yaxis']['title'] = 'not fraud transaction amount'
fig['layout']['yaxis2']['title'] = 'fraud transaction amount'

fig.update_layout(coloraxis=dict(colorscale='Bluered_r'), showlegend=False)
fig.show()

# month vs amount transaction vs target variable

In [None]:
df1['month_name'].value_counts()

In [None]:
df_month_amt=df1.pivot_table(index='month_name',columns='is_fraud',values='amt',aggfunc='sum')
df_month_amt.reset_index(inplace=True)
df_month_amt.rename(columns={0:'not_fraud',1:'fraud'},inplace=True)
df_month_amt=df_month_amt.head(12)
df_month_amt

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Summed non-fraud (left) and fraud (right) transaction amount per month,
# read from df_month_amt (built in the previous cell).
# Each panel gets its own y-axis: fraud is a tiny share of all transactions, so
# on a shared axis the fraud bars are squashed flat against the non-fraud scale.
fig = make_subplots(rows=1, cols=2, shared_yaxes=False,
                    subplot_titles=('month wise non-fraud transaction amount',
                                    "month wise fraud transaction amount"))

fig.add_trace(go.Bar(x=df_month_amt['month_name'], y=df_month_amt['not_fraud'],
                     marker=dict(color=df_month_amt['not_fraud'], coloraxis="coloraxis")),
              1, 1)

fig.add_trace(go.Bar(x=df_month_amt['month_name'], y=df_month_amt['fraud'],
                     marker=dict(color=df_month_amt['fraud'], coloraxis="coloraxis")),
              1, 2)

fig['layout']['xaxis']['title'] = 'months'
fig['layout']['xaxis2']['title'] = 'months'
fig['layout']['yaxis']['title'] = 'not fraud transaction amount'
fig['layout']['yaxis2']['title'] = 'fraud transaction amount'

fig.update_layout(coloraxis=dict(colorscale='Bluered_r'), showlegend=False)
fig.show()

# Merchant Category vs amt transaction vs target variable 

In [None]:
df_mercat_amt=df1.pivot_table(index='category',columns='is_fraud',values='amt',aggfunc='sum')
df_mercat_amt.reset_index(inplace=True)
df_mercat_amt.rename(columns={0:'not_fraud',1:'fraud'},inplace=True)
df_mercat_amt=df_mercat_amt.sort_values('fraud',ascending=False).head(10)
df_mercat_amt

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Summed non-fraud (left) and fraud (right) transaction amount per merchant
# category, read from df_mercat_amt (top 10 by fraud amount, built in the previous cell).
# Each panel gets its own y-axis: fraud is a tiny share of all transactions, so
# on a shared axis the fraud bars are squashed flat against the non-fraud scale.
fig = make_subplots(rows=1, cols=2, shared_yaxes=False,
                    subplot_titles=('merchant category wise non-fraud transaction amount',
                                    "merchant category wise fraud transaction amount"))

fig.add_trace(go.Bar(x=df_mercat_amt['category'], y=df_mercat_amt['not_fraud'],
                     marker=dict(color=df_mercat_amt['not_fraud'], coloraxis="coloraxis")),
              1, 1)

fig.add_trace(go.Bar(x=df_mercat_amt['category'], y=df_mercat_amt['fraud'],
                     marker=dict(color=df_mercat_amt['fraud'], coloraxis="coloraxis")),
              1, 2)

fig['layout']['xaxis']['title'] = 'merchant category'
fig['layout']['xaxis2']['title'] = 'merchant category'
fig['layout']['yaxis']['title'] = 'not fraud transaction amount'
fig['layout']['yaxis2']['title'] = 'fraud transaction amount'

fig.update_layout(coloraxis=dict(colorscale='viridis'), showlegend=False)
fig.show()

# state vs amt transaction vs target variable 

In [None]:
# Total transaction amount per state and fraud class;
# keep the 10 states with the largest fraud amount.
df_state_amt = (
    df1.pivot_table(index='state', columns='is_fraud', values='amt', aggfunc='sum')
    .reset_index()
    .rename(columns={0: 'not_fraud', 1: 'fraud'})
    .sort_values('fraud', ascending=False)
    .head(10)
)
df_state_amt

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Two bar charts sharing a y-axis: non-fraud (left) and fraud (right) amounts per state.
fig = make_subplots(
    rows=1, cols=2, shared_yaxes=True,
    subplot_titles=('State wise non-fraud transaction amount', "State wise fraud transaction amount"),
)

# One trace per class; each bar is coloured by its own value on the shared colour axis.
for panel_col, amount_col in enumerate(['not_fraud', 'fraud'], start=1):
    fig.add_trace(
        go.Bar(x=df_state_amt['state'], y=df_state_amt[amount_col],
               marker=dict(color=df_state_amt[amount_col], coloraxis="coloraxis")),
        row=1, col=panel_col,
    )

# Axis titles, colour scale and legend visibility set in a single layout update.
fig.update_layout(
    xaxis_title='State',
    xaxis2_title='State',
    yaxis_title='not fraud transaction amount',
    yaxis2_title='fraud transaction amount',
    coloraxis=dict(colorscale='thermal'),
    showlegend=False,
)
fig.show()

# city vs amt transaction vs target variable 

In [None]:
# Total transaction amount per city and fraud class;
# keep the 10 cities with the largest fraud amount.
df_city_amt = (
    df1.pivot_table(index='city', columns='is_fraud', values='amt', aggfunc='sum')
    .reset_index()
    .rename(columns={0: 'not_fraud', 1: 'fraud'})
    .sort_values('fraud', ascending=False)
    .head(10)
)
df_city_amt

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Two bar charts sharing a y-axis: non-fraud (left) and fraud (right) amounts per city.
fig = make_subplots(
    rows=1, cols=2, shared_yaxes=True,
    subplot_titles=('city wise non-fraud transaction amount', "city wise fraud transaction amount"),
)

# One trace per class; each bar is coloured by its own value on the shared colour axis.
for panel_col, amount_col in enumerate(['not_fraud', 'fraud'], start=1):
    fig.add_trace(
        go.Bar(x=df_city_amt['city'], y=df_city_amt[amount_col],
               marker=dict(color=df_city_amt[amount_col], coloraxis="coloraxis")),
        row=1, col=panel_col,
    )

# Axis titles, colour scale and legend visibility set in a single layout update.
fig.update_layout(
    xaxis_title='city',
    xaxis2_title='city',
    yaxis_title='not fraud transaction amount',
    yaxis2_title='fraud transaction amount',
    coloraxis=dict(colorscale='plasma'),
    showlegend=False,
)
fig.show()

# Gender vs amt transaction amount vs target variable

In [None]:
# Total transaction amount per gender and fraud class, largest fraud amount first
# (at most 10 rows are kept).
df_gender_amt = (
    df1.pivot_table(index='gender', columns='is_fraud', values='amt', aggfunc='sum')
    .reset_index()
    .rename(columns={0: 'not_fraud', 1: 'fraud'})
    .sort_values('fraud', ascending=False)
    .head(10)
)
df_gender_amt

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Two bar charts sharing a y-axis: non-fraud (left) and fraud (right) amounts per gender.
fig = make_subplots(
    rows=1, cols=2, shared_yaxes=True,
    subplot_titles=('gender wise non-fraud transaction amount', "gender wise fraud transaction amount"),
)

# One trace per class; each bar is coloured by its own value on the shared colour axis.
for panel_col, amount_col in enumerate(['not_fraud', 'fraud'], start=1):
    fig.add_trace(
        go.Bar(x=df_gender_amt['gender'], y=df_gender_amt[amount_col],
               marker=dict(color=df_gender_amt[amount_col], coloraxis="coloraxis")),
        row=1, col=panel_col,
    )

# Axis titles, colour scale and legend visibility set in a single layout update.
fig.update_layout(
    xaxis_title='gender',
    xaxis2_title='gender',
    yaxis_title='not fraud transaction amount',
    yaxis2_title='fraud transaction amount',
    coloraxis=dict(colorscale='jet'),
    showlegend=False,
)
fig.show()

# month vs amount vs target variable trend graph

In [None]:
df1.head(2)

In [None]:
df1.pivot_table(index=['year','month_name'],columns=['is_fraud'],values='amt',aggfunc='sum').plot(kind='line')

In [None]:
df_trend=df1.pivot_table(index=['year','month_name','month'],columns=['is_fraud'],values='amt',aggfunc='sum')

In [None]:
df_trend.reset_index(inplace=True)

In [None]:
df_trend.rename(columns={0:'not_fraud transaction amount',1:'fraud transaction amount'},inplace=True)

In [None]:
df_trend

In [None]:
df_trend=df_trend.sort_values('month')


In [None]:
import plotly.express as px

# Fraud transaction amount per month name, one line per year.
# The pasted Plotly sample line `df = px.data.gapminder()...` was removed: its result
# was never plotted, and it overwrote the notebook-level `df` that later cells
# drop columns from and use as the test frame.
fig = px.line(df_trend, x="month_name", y='fraud transaction amount', color='year')

fig.show()

# insight 
 From the dataset:
 - In 2019, the fraud transaction amount peaked in December and was lowest in June.
 - In 2020, the fraud transaction amount peaked in May and was lowest in April.


# statistical Significance

In [None]:
df1.head()

# dropping few columns which will not help in analysis

# For categorical columns (two or more levels) tested against the binary categorical target, we use the chi-square test of independence

In [None]:
# Working copy for the significance tests, without identifiers, raw date/time
# fields and name/address text columns.
df2 = df1.drop(
    columns=[
        'trans_date_trans_time', 'cc_num', 'first', 'last', 'street', 'zip',
        'dob', 'trans_num', 'unix_time', 'week_day', 'month_name',
    ]
)

In [None]:
df2.shape

In [None]:
df2.head()

#our target variable is is_fraud categorical variable

In [None]:
# Categorical columns (kept as comments: plain text in a code cell is a SyntaxError)
#  1. merchant
#  2. category
#  3. gender
#  4. city
#  5. state
#  6. job
#  7. year
#  8. age_group
#  9. weekday_no
# 10. week_no
# 11. day_no
# 12. month

In [None]:
from scipy import stats

# merchant vs is_fraud

In [None]:
#null: merchant and is_fraud are independent
#alter:merchant and is_fraud are dependent

In [None]:
stats.chi2_contingency(pd.crosstab(df2['merchant'],df2['is_fraud']))

The p-value is less than the significance level, so we reject the null hypothesis,
concluding that merchant and is_fraud are dependent.

# category vs is_fraud

In [None]:
#null: category and is_fraud are independent
#alter:category and is_fraud are dependent

In [None]:
stats.chi2_contingency(pd.crosstab(df2['category'],df2['is_fraud']))

In [None]:
#pvalue is less than significance level we reject null hypothesis
concluding that category and is_fraud are dependent

# Gender vs is_fraud

In [None]:
#null: gender and is_fraud are independent
#alter:gender and is_fraud are dependent

In [None]:
stats.chi2_contingency(pd.crosstab(df2['gender'],df2['is_fraud']))

In [None]:
#pvalue is less than significance level we reject null hypothesis
concluding that gender and is_fraud are dependent

# city vs is_fraud

In [None]:
#null: gender and is_fraud are independent
#alter:gender and is_fraud are dependent

In [None]:
stats.chi2_contingency(pd.crosstab(df2['city'],df2['is_fraud']))

In [None]:
#pvalue is less than significance level we reject null hypothesis
concluding that city and is_fraud are dependent

# state vs is_fraud

In [None]:
#null: state and is_fraud are independent
#alter:state and is_fraud are dependent

In [None]:
stats.chi2_contingency(pd.crosstab(df2['state'],df2['is_fraud']))

In [None]:
#pvalue is less than significance level we reject null hypothesis
concluding that state and is_fraud are dependent

# job vs is_fraud

In [None]:
#null: job and is_fraud are independent
#alter:job and is_fraud are dependent
stats.chi2_contingency(pd.crosstab(df2['job'],df2['is_fraud']))

In [None]:
#pvalue is less than significance level we reject null hypothesis
concluding that job and is_fraud are dependent

# year vs is_fraud

In [None]:
#null: yEAR and is_fraud are independent
#alter:year and is_fraud are dependent
stats.chi2_contingency(pd.crosstab(df2['year'],df2['is_fraud']))

In [None]:
#pvalue is less than significance level we reject null hypothesis
concluding that year and is_fraud are dependent

# age_group vs is_fraud

In [None]:
#null: age_group and is_fraud are independent
#alter:age_group and is_fraud are dependent
stats.chi2_contingency(pd.crosstab(df2['age_group'],df2['is_fraud']))

In [None]:
#pvalue is less than significance level we reject null hypothesis
concluding that age_group and is_fraud are dependent

# weekday_no vs is_fraud

In [None]:
#null:weekday_no and is_fraud are independent
#alter:weekday_no and is_fraud are dependent
stats.chi2_contingency(pd.crosstab(df2['weekday_no'],df2['is_fraud']))


In [None]:
#pvalue is less than significance level we reject null hypothesis
concluding that weekday_no and is_fraud are dependent

# week_no vs is_fraud

In [None]:
#null:week_no and is_fraud are independent
#alter:week_no and is_fraud are dependent
stats.chi2_contingency(pd.crosstab(df2['week_no'],df2['is_fraud']))

In [None]:
#pvalue is less than significance level we reject null hypothesis
concluding that week_no and is_fraud are dependent

# day_no vs is_fraud

In [None]:
#null:day_no and is_fraud are independent
#alter:day_no and is_fraud are dependent
stats.chi2_contingency(pd.crosstab(df2['day_no'],df2['is_fraud']))

In [None]:
#pvalue is less than significance level we reject null hypothesis
concluding that day_no and is_fraud are dependent

# month vs is_fraud

In [None]:
#null:month  and is_fraud are independent
#alter:month  and is_fraud are dependent
stats.chi2_contingency(pd.crosstab(df2['month'],df2['is_fraud']))

In [None]:
#pvalue is less than significance level we reject null hypothesis
concluding that month and is_fraud are dependent

In [None]:
# Numerical columns (kept as comments: plain text in a code cell is a SyntaxError)
# 1. amt
# 2. lat
# 3. long
# 4. merch_lat
# 5. merch_long
# 6. city_pop
# 7. year_dayno
# 8. min_day
# 9. hr_day

# feature Engineering

# 1.-	Whether any transformations required

In [None]:
# A Yeo-Johnson transformation is required for our dataset to reduce skewness

# 2.dropping the redundant columns 

In [None]:
# Correlation heatmap over the numeric columns only. df1 still holds text columns
# (e.g. merchant, category), and DataFrame.corr() raises on those in pandas >= 2.0.
plt.figure(figsize = (30,30))
sns.heatmap(df1.select_dtypes(include=[np.number]).corr(), annot = True, cmap="GnBu", fmt='.2f')
plt.show()

From the correlation matrix, column pairs with high multicollinearity are marked in dark blue;
for each such pair we can drop one of the two columns based on its VIF score.

In [None]:
# Columns noted from the heatmap (presumably the highly correlated ones — confirm).
# Kept as comments: plain text in a code cell is a SyntaxError.
# 1. zip
# 2. lat
# 3. long
# 4. unix_time
# 5. month
# 6. year_dayno
# 7. week_no

In [None]:
df1_no=df1.select_dtypes(include=[np.number])

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Variance inflation factor of every numeric column, computed against all the others.
X = df1_no
vif = pd.DataFrame({
    "VIF_Factor": [variance_inflation_factor(X.values, col_pos) for col_pos in range(X.shape[1])],
    "Features": X.columns,
})
print(vif)

In [None]:
df_mb=df1.copy()

In [None]:
df_mb.head(2)

In [None]:
# Remove the columns that are not used as model inputs.
# `columns=` replaces the positional axis argument (`, 1`), which pandas 2.0 rejects.
df_mb.drop(columns=['trans_date_trans_time', 'cc_num', 'merchant', 'street', 'city', 'state','first','last',
          'zip', 'lat','long', 'job', 'trans_num', 'unix_time', 'month_name', 'year_dayno','dob','week_day','week_no'],inplace=True)

In [None]:
# Remove the same columns from `df` as were removed from the training frame.
# `columns=` replaces the positional axis argument (`, 1`), which pandas 2.0 rejects.
df.drop(columns=['trans_date_trans_time', 'cc_num', 'merchant', 'street', 'city', 'state','first','last',
          'zip', 'lat','long', 'job', 'trans_num', 'unix_time', 'month_name', 'year_dayno','dob','week_day','week_no'],inplace=True)

# 3.scaling the data

In [None]:
df_target=df_mb['is_fraud']

In [None]:
dft_target=df['is_fraud']

In [None]:
df_mb['weekday_no'].unique()

In [None]:
df_mb

In [None]:
df_num=df_mb[['amt','city_pop','merch_lat','merch_long','day_no','min_day','hr_day']]

In [None]:
from sklearn.preprocessing import PowerTransformer

In [None]:
PT = PowerTransformer()

In [None]:
PT_Yeo = PowerTransformer(method='yeo-johnson')

In [None]:
df_yeo= PT_Yeo.fit_transform(df_num)

In [None]:
# Wrap the transformed values in a labelled frame. Reusing df_num's index keeps
# later index-aligned column assignments and pd.concat matched to the source rows.
df_yeo1=pd.DataFrame(df_yeo,columns=df_num.columns,index=df_num.index)

In [None]:
df_yeo1

In [None]:
df.head(2)

In [None]:
dft_num=df[['amt','city_pop','merch_lat','merch_long','day_no','min_day','hr_day']]

In [None]:
dft_yeo= PT_Yeo.transform(dft_num)

In [None]:
# Wrap the transformed values in a labelled frame. Reusing dft_num's index keeps
# later index-aligned column assignments and pd.concat matched to the source rows.
dft_yeo1=pd.DataFrame(dft_yeo,columns=dft_num.columns,index=dft_num.index)

In [None]:
dft_yeo1

In [None]:
df_yeo1['weekday_no']=df_mb['weekday_no']

In [None]:
# Take the feature from `df` (the frame dft_yeo1 was built from), not from the
# training frame `df_mb`. .to_numpy() assigns by position, row for row.
dft_yeo1['weekday_no']=df['weekday_no'].to_numpy()

In [None]:
df_yeo1['month']=df_mb['month']

In [None]:
# Take the feature from `df` (the frame dft_yeo1 was built from), not from the
# training frame `df_mb`. .to_numpy() assigns by position, row for row.
dft_yeo1['month']=df['month'].to_numpy()

In [None]:
df_yeo1

In [None]:
dft_yeo1

In [None]:
df_encoded=pd.get_dummies(df_mb[['category','gender','year','age_group']],drop_first=True)

In [None]:
dft_encoded=pd.get_dummies(df[['category','gender','year','age_group']],drop_first=True)

In [None]:
df_cat=df_encoded.drop('year',axis=1)

In [None]:
dft_cat=dft_encoded.drop('year',axis=1)

In [None]:
df_cat

In [None]:
dft_cat

In [None]:
df_final=pd.concat([df_yeo1,df_cat],axis=1)

In [None]:
dft_final=pd.concat([dft_yeo1,dft_cat],axis=1)

In [None]:
df_final


In [None]:
dft_final

# Assumptions

In [None]:
# Model assumptions (kept as comments: plain text in a code cell is a SyntaxError)
# 1. Independent rows
# 2. Log(odds) is linear
# 3. No multicollinearity
# 4. Lack of strongly influential outliers

# base model

# train test split

In [None]:
# Imported here because this is the first cell that uses `sm`; the notebook's own
# import sits in a later cell, so a fresh top-to-bottom run raised NameError.
import statsmodels.api as sm

# Training design matrix: the engineered features plus an intercept column.
# Cast to float so boolean dummy columns (pandas >= 2.0) are accepted by statsmodels.
X=df_final
xtrain=sm.add_constant(X.astype(float))
ytrain=df_target

In [None]:
# Imported here so this cell runs even though the notebook's own import sits in a later cell.
import statsmodels.api as sm

# Test design matrix built the same way as the training one.
# has_constant='add' always adds the intercept column, even if a feature is constant here.
# Cast to float so boolean dummy columns (pandas >= 2.0) are accepted by statsmodels.
X=dft_final
xtest=sm.add_constant(X.astype(float), has_constant='add')
ytest=dft_target

In [None]:
import statsmodels.api as sm

In [None]:
logreg=sm.Logit(ytrain,xtrain).fit()
print(logreg.summary())

In [None]:
ytrain_prob=logreg.predict(xtrain)
ypred_train=[0 if x < 0.5 else 1 for x in ytrain_prob]

In [None]:
ytest_prob=logreg.predict(xtest)
ypred_test=[0 if x < 0.5 else 1 for x in ytest_prob]

In [None]:
from sklearn.metrics import roc_curve

In [None]:
fpr,tpr,thresholds=roc_curve(ytrain,ytrain_prob)

In [None]:
fpr,tpr,thresholds=roc_curve(ytest,ytest_prob)

In [None]:
roc_auc_score(ytrain, ytrain_prob)

In [None]:
roc_auc_score(ytest, ytest_prob)

In [None]:
# ROC curve of the logistic model on the training data.
fpr, tpr, thresholds = roc_curve(ytrain, ytrain_prob)
plt.plot(fpr, tpr)

# Both axes span the full [0, 1] rate range.
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])

# Diagonal reference line (dashed red).
plt.plot([0, 1], [0, 1],'r--')

plt.title('ROC curve for fraud transaction (train data set)', fontsize = 15)
plt.xlabel('False positive rate (1-Specificity)', fontsize = 15)
plt.ylabel('True positive rate (Sensitivity)', fontsize = 15)

# Annotate the AUC rounded to 4 digits. `s` must be a string: the tuple passed
# previously was rendered as its repr, e.g. "('AUC Score:', 0.85)".
plt.text(x = 0.02, y = 0.9, s = f'AUC Score: {round(roc_auc_score(ytrain, ytrain_prob), 4)}')

plt.grid(True)

In [None]:
# ROC curve of the logistic model on the test data.
fpr, tpr, thresholds = roc_curve(ytest, ytest_prob)
plt.plot(fpr, tpr)

# Both axes span the full [0, 1] rate range.
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])

# Diagonal reference line (dashed red).
plt.plot([0, 1], [0, 1],'r--')

plt.title('ROC curve for fraud transaction (test dataset)', fontsize = 15)
plt.xlabel('False positive rate (1-Specificity)', fontsize = 15)
plt.ylabel('True positive rate (Sensitivity)', fontsize = 15)

# Annotate the AUC rounded to 4 digits. `s` must be a string: the tuple passed
# previously was rendered as its repr, e.g. "('AUC Score:', 0.85)".
plt.text(x = 0.02, y = 0.9, s = f'AUC Score: {round(roc_auc_score(ytest, ytest_prob), 4)}')

plt.grid(True)

In [None]:
from sklearn.metrics import confusion_matrix,classification_report,cohen_kappa_score
from matplotlib.colors import ListedColormap

In [None]:
# Confusion matrix of the thresholded training predictions, labelled for display.
cm = confusion_matrix(ytrain, ypred_train)
conf_matrix = pd.DataFrame(
    data=cm,
    index=['Actual:0', 'Actual:1'],
    columns=['Predicted:0', 'Predicted:1'],
)

# Single-colour heatmap: integer counts in large text, thin cell borders, no colour bar.
heatmap_options = dict(
    annot=True,
    fmt='d',
    cmap=ListedColormap(['lightskyblue']),
    cbar=False,
    linewidths=0.1,
    annot_kws={'size': 25},
)
sns.heatmap(conf_matrix, **heatmap_options)

# Enlarge tick labels on both axes.
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)

plt.title('Confusion matrix for train dataset')
plt.show()

In [None]:
# Confusion matrix of the thresholded test predictions, labelled for display.
cm = confusion_matrix(ytest, ypred_test)
conf_matrix = pd.DataFrame(
    data=cm,
    index=['Actual:0', 'Actual:1'],
    columns=['Predicted:0', 'Predicted:1'],
)

# Single-colour heatmap: integer counts in large text, thin cell borders, no colour bar.
heatmap_options = dict(
    annot=True,
    fmt='d',
    cmap=ListedColormap(['lightskyblue']),
    cbar=False,
    linewidths=0.1,
    annot_kws={'size': 25},
)
sns.heatmap(conf_matrix, **heatmap_options)

# Enlarge tick labels on both axes.
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)

plt.title('Confusion matrix for test dataset')
plt.show()

In [None]:
acc_table = classification_report(ytrain, ypred_train)

# print the table

print('Train Dataset Classification report:')
print(acc_table)

In [None]:
acc_table = classification_report(ytest, ypred_test)

# print the table
print('Test Dataset Classification report:')
print(acc_table)

In [None]:
df_odds = pd.DataFrame(np.exp(logreg.params), columns= ['Odds']) 

# print the dataframe
df_odds.sort_values('Odds',ascending=False).head(5)

In [None]:
df_odds.values

In [None]:
df_odds.index

In [None]:
# Print each coefficient's odds value next to its own feature name.
# (The earlier nested loop paired every odds value with every feature name,
# printing the full cross product.)
for feature, odds in df_odds['Odds'].items():
    print(
        'odds_', feature, ' =', odds,
        ' it implies that the odds of detecting a fraud transaction  increases by a factor of ', odds,
        ' due to one unit increase in the', feature, ' keeping other variables constant'
    )

In [None]:
# Cohen's kappa between the true labels and the logistic model's predictions.
# NOTE(review): the call previously used `y` and `ypred`, which no visible cell defines;
# the test labels/predictions are assumed to be the intended pair — confirm.
kappa = cohen_kappa_score(ytest, ypred_test)

# print the kappa value
print('kappa value:',kappa)

In [None]:
print('AIC:', logreg.aic)

# xgboost

In [None]:
from xgboost import XGBClassifier

In [None]:
x_train=df_final
ytrain=df_target

In [None]:
x_tests=dft_final
ytest=dft_target

In [None]:
xg=XGBClassifier(max_depth=200,gamma=0.2,learning_rate=0.2)

In [None]:
model=xg.fit(x_train,ytrain)

In [None]:
ytrain_pred=model.predict(x_train)
ytest_pred=model.predict(x_tests)

In [None]:
acc_table = classification_report(ytrain, ytrain_pred)

# print the table

print('Train Dataset Classification report:')
print(acc_table)

In [None]:
# AUC needs a continuous score: use the predicted probability of class 1
# rather than the hard 0/1 labels in ytrain_pred.
roc_auc_score(ytrain, model.predict_proba(x_train)[:, 1])

In [None]:
acc_table = classification_report(ytest, ytest_pred)

# print the table

print('Test Dataset Classification report:')
print(acc_table)

In [None]:
# AUC needs a continuous score: use the predicted probability of class 1
# rather than the hard 0/1 labels in ytest_pred.
roc_auc_score(ytest, model.predict_proba(x_tests)[:, 1])

In [None]:
#confusion_matrix

In [None]:
# Confusion matrix of the XGBoost training predictions, labelled for display.
cm = confusion_matrix(ytrain, ytrain_pred)
conf_matrix = pd.DataFrame(
    data=cm,
    index=['Actual:0', 'Actual:1'],
    columns=['Predicted:0', 'Predicted:1'],
)

# Single-colour heatmap: integer counts in large text, thin cell borders, no colour bar.
heatmap_options = dict(
    annot=True,
    fmt='d',
    cmap=ListedColormap(['lightskyblue']),
    cbar=False,
    linewidths=0.1,
    annot_kws={'size': 25},
)
sns.heatmap(conf_matrix, **heatmap_options)

plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.title('Confusion matrix for train dataset')
plt.show()

In [None]:
# Confusion matrix of the XGBoost test predictions, labelled for display.
cm = confusion_matrix(ytest, ytest_pred)
conf_matrix = pd.DataFrame(
    data=cm,
    index=['Actual:0', 'Actual:1'],
    columns=['Predicted:0', 'Predicted:1'],
)

# Single-colour heatmap: integer counts in large text, thin cell borders, no colour bar.
heatmap_options = dict(
    annot=True,
    fmt='d',
    cmap=ListedColormap(['lightskyblue']),
    cbar=False,
    linewidths=0.1,
    annot_kws={'size': 25},
)
sns.heatmap(conf_matrix, **heatmap_options)

plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.title('Confusion matrix for test dataset')
plt.show()

In [None]:
from sklearn.model_selection import  KFold,cross_val_score

In [None]:
# 10-fold shuffled cross-validation of the XGBoost estimator, scored by ROC AUC.
kf = KFold(n_splits=10, shuffle=True, random_state=0)
scores = cross_val_score(estimator=xg, X=x_train, y=ytrain, scoring='roc_auc', cv=kf)

# Reported "bias" is 1 - mean fold AUC; "variance" is the sample std (ddof=1) of fold AUCs.
print('Bias Error:', 1 - scores.mean())
print('Variance Error:', scores.std(ddof=1))

In [None]:
#roc_curve
from sklearn.metrics import roc_auc_score,roc_curve

In [None]:
# ROC analysis needs continuous scores, so use the predicted probability of the
# positive class; hard 0/1 predictions reduce the curve to a single operating point.
ytrain_score = model.predict_proba(x_train)[:, 1]
fpr, tpr, thresholds = roc_curve(ytrain, ytrain_score)
plt.plot(fpr, tpr)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
# Diagonal reference line (dashed red).
plt.plot([0, 1], [0, 1],'r--')
plt.title('ROC curve for fraud transaction (train data set)', fontsize = 15)
plt.xlabel('False positive rate (1-Specificity)', fontsize = 15)
plt.ylabel('True positive rate (Sensitivity)', fontsize = 15)

# `s` must be a string; a tuple is rendered as its repr on the plot.
plt.text(x = 0.02, y = 0.9, s = f'AUC Score: {round(roc_auc_score(ytrain, ytrain_score), 4)}')
plt.grid(True)

In [None]:
# ROC analysis needs continuous scores, so use the predicted probability of the
# positive class; hard 0/1 predictions reduce the curve to a single operating point.
ytest_score = model.predict_proba(x_tests)[:, 1]
fpr, tpr, thresholds = roc_curve(ytest, ytest_score)
plt.plot(fpr, tpr)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
# Diagonal reference line (dashed red).
plt.plot([0, 1], [0, 1],'r--')
plt.title('ROC curve for fraud transaction (test dataset)', fontsize = 15)
plt.xlabel('False positive rate (1-Specificity)', fontsize = 15)
plt.ylabel('True positive rate (Sensitivity)', fontsize = 15)
# `s` must be a string; a tuple is rendered as its repr on the plot.
plt.text(x = 0.02, y = 0.9, s = f'AUC Score: {round(roc_auc_score(ytest, ytest_score), 4)}')
plt.grid(True)

In [None]:
plt.figure(figsize=(15, 8))

# Pair each training column with the fitted model's importance score, largest first.
important_features = pd.DataFrame(
    {'Features': x_train.columns, 'Importance': model.feature_importances_}
).sort_values(by='Importance', ascending=False)

# Horizontal bars: one per feature, length = importance.
sns.barplot(data=important_features, x='Importance', y='Features')
plt.title('Feature Importance', fontsize=15)
plt.xlabel('Importance', fontsize=15)
plt.ylabel('Features', fontsize=15)
plt.show()

In [None]:
important_features

In [None]:
########################33

# Gradient boost

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
x_train=df_final
ytrain=df_target

In [None]:
x_tests=dft_final
ytest=dft_target

In [None]:
gb=GradientBoostingClassifier(max_depth=200,learning_rate=0.2)

In [None]:
model1=gb.fit(x_train,ytrain)

In [None]:
ytrain_pred1=model1.predict(x_train)
ytest_pred1=model1.predict(x_tests)

In [None]:
acc_table = classification_report(ytrain, ytrain_pred1)

# print the table

print('Train Dataset Classification report:')
print(acc_table)

In [None]:
acc_table = classification_report(ytest, ytest_pred1)

# print the table

print('Test Dataset Classification report:')
print(acc_table)

In [None]:
#confusion matrix

In [None]:
# Confusion matrix of the gradient-boosting training predictions, labelled for display.
cm = confusion_matrix(ytrain, ytrain_pred1)
conf_matrix = pd.DataFrame(
    data=cm,
    index=['Actual:0', 'Actual:1'],
    columns=['Predicted:0', 'Predicted:1'],
)

# Single-colour heatmap: integer counts in large text, thin cell borders, no colour bar.
heatmap_options = dict(
    annot=True,
    fmt='d',
    cmap=ListedColormap(['lightskyblue']),
    cbar=False,
    linewidths=0.1,
    annot_kws={'size': 25},
)
sns.heatmap(conf_matrix, **heatmap_options)

plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.title('Confusion matrix for train dataset')
plt.show()

In [None]:
# Confusion matrix of the gradient-boosting test predictions, labelled for display.
cm = confusion_matrix(ytest, ytest_pred1)
conf_matrix = pd.DataFrame(
    data=cm,
    index=['Actual:0', 'Actual:1'],
    columns=['Predicted:0', 'Predicted:1'],
)

# Single-colour heatmap: integer counts in large text, thin cell borders, no colour bar.
heatmap_options = dict(
    annot=True,
    fmt='d',
    cmap=ListedColormap(['lightskyblue']),
    cbar=False,
    linewidths=0.1,
    annot_kws={'size': 25},
)
sns.heatmap(conf_matrix, **heatmap_options)

plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.title('Confusion matrix for test dataset')
plt.show()

In [None]:
#bias , variance error

In [None]:
#roc_curve

In [None]:
# ROC analysis needs continuous scores, so use the predicted probability of the
# positive class; hard 0/1 predictions reduce the curve to a single operating point.
ytrain_score1 = model1.predict_proba(x_train)[:, 1]
fpr, tpr, thresholds = roc_curve(ytrain, ytrain_score1)

plt.plot(fpr, tpr)

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
# Diagonal reference line (dashed red).
plt.plot([0, 1], [0, 1],'r--')
plt.title('ROC curve for fraud transaction (train data set)', fontsize = 15)
plt.xlabel('False positive rate (1-Specificity)', fontsize = 15)
plt.ylabel('True positive rate (Sensitivity)', fontsize = 15)
# `s` must be a string; a tuple is rendered as its repr on the plot.
plt.text(x = 0.02, y = 0.9, s = f'AUC Score: {round(roc_auc_score(ytrain, ytrain_score1), 4)}')
plt.grid(True)

In [None]:
# ROC analysis needs continuous scores, so use the predicted probability of the
# positive class; hard 0/1 predictions reduce the curve to a single operating point.
ytest_score1 = model1.predict_proba(x_tests)[:, 1]
fpr, tpr, thresholds = roc_curve(ytest, ytest_score1)
plt.plot(fpr, tpr)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
# Diagonal reference line (dashed red).
plt.plot([0, 1], [0, 1],'r--')
plt.title('ROC curve for fraud transaction (test dataset)', fontsize = 15)
plt.xlabel('False positive rate (1-Specificity)', fontsize = 15)
plt.ylabel('True positive rate (Sensitivity)', fontsize = 15)
# `s` must be a string; a tuple is rendered as its repr on the plot.
plt.text(x = 0.02, y = 0.9, s = f'AUC Score: {round(roc_auc_score(ytest, ytest_score1), 4)}')
plt.grid(True)

In [None]:
#########################################