# ⚑ Importing Libraries

In [None]:
# Basic Libraries
import numpy as np
import pandas as pd
import seaborn as sns
import random
import plotly.graph_objects as go
import matplotlib.pyplot as plt
%matplotlib inline
# Models Libraries
import cufflinks as cf
import plotly.express as px
import plotly.offline as py
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

# Evaluation Metric Libraries
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report, accuracy_score, precision_score, recall_score,f1_score, roc_auc_score

# Warnings
import warnings
warnings.filterwarnings('ignore')

# ⚑ Reading and exploring the dataset

In [None]:
# Load the transaction CSV into the working DataFrame `df`.
# NOTE(review): this is an absolute, machine-specific Windows path, so the cell
# only runs where that exact file exists — consider a project-relative path.
df = pd.read_csv(r'C:\Users\etsh2\Downloads\archive (8)\Transaction.csv')

In [None]:
# Dimensions of the freshly loaded data as (rows, columns).
df.shape 

In [None]:
# Preview the first rows of the data.
df.head() 

In [None]:
# Preview the last rows of the data.
df.tail()

In [None]:
# Data type of each column.
df.dtypes

In [None]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Deliberately dirty the data so the cleaning steps have something to remove:
# take a 15% sample of the rows, blank out 10% of that sample, and append the
# whole sample to the original frame.
sample_df = df.sample(frac=0.15)  # 15% of the rows, returned as a new frame

null_rows = int(0.1 * len(sample_df))  # how many sampled rows to blank out (10%)
null_indices = random.sample(range(len(sample_df)), null_rows)  # distinct row positions

# One vectorised positional assignment instead of a Python loop that performs a
# separate .iloc write for every selected row.
sample_df.iloc[null_indices, :] = np.nan

# The appended rows are either exact copies of existing rows or entirely NaN.
df = pd.concat([df, sample_df])

In [None]:
# First ten rows after the sampled rows were appended.
df.head(10)

In [None]:
# Last twenty rows; the appended sample sits at the end of the frame.
df.tail(20)

In [None]:
# Duplicate-row count after the append.
df.duplicated().sum()

In [None]:
# Missing values per column after the append.
df.isnull().sum()

# ⚑ Handling Missing Values

In [None]:
# Drop every row that contains at least one missing value and report the result.
has_missing = df.isnull().values.any()
if not has_missing:
    print("There aren't missing values in the dataset.")
else:
    print('There are missing values in the dataset\n')
    df.dropna(inplace=True)
    print('Shape : ', df.shape)

In [None]:
# Drop rows that exactly repeat an earlier row and report the result.
has_duplicates = df.duplicated().any()
if not has_duplicates:
    print("There aren't duplicate values in the dataset.")
else:
    print('There are duplicate values in the dataset\n')
    df.drop_duplicates(inplace=True)
    print('Shape after removing duplicates: ', df.shape)

In [None]:
# Duplicate-row count after cleaning.
df.duplicated().sum()

In [None]:
# Missing values per column after cleaning.
df.isnull().sum()

In [None]:
# Dimensions of the cleaned data as (rows, columns).
df.shape

# ⚑ Data Visualization

# Histogram Distribution of Amounts

In [None]:
import plotly.express as px

# Histogram of the `amount` column over the whole cleaned frame.
histogram_options = dict(x='amount', title='Distribution of Transaction Amounts')
fig = px.histogram(df, **histogram_options)
fig.show()

# Pie Chart for Transaction Type

In [None]:
# Pie chart of how many rows fall into each transaction type.
# The counts are stored under `transaction_type_counts` rather than `type`, so
# the built-in `type()` is not shadowed for the rest of the session.
transaction_type_counts = df["type"].value_counts()

Names = transaction_type_counts.index    # transaction type labels
Value = transaction_type_counts.values   # number of rows per label

# `names` and `values` are passed as arrays, so no data-frame argument is needed.
figure = px.pie(values=Value,
                names=Names,
                title="Distribution of Transaction Type")
figure.show()

# Scatter Plot between oldbalanceOrg and newbalanceOrig

In [None]:
# Scatter of `oldbalanceOrg` against `newbalanceOrig` on a fixed 80,000-row sample.
sample_df = df.sample(n=80000, random_state=42)
scatter_options = dict(
    x='oldbalanceOrg',
    y='newbalanceOrig',
    title='Scatter Plot of Old Balance Orig vs New Balance Orig',
)
scatter_fig = px.scatter(sample_df, **scatter_options)
scatter_fig.show()

In [None]:
import plotly.express as px

# Scatter of `oldbalanceOrg` against `amount` on a fixed 80,000-row sample,
# coloured by fraud status, with one marker symbol per `step` value.
sample_df = df.sample(n=80000, random_state=42)

# `color_discrete_map` is keyed by the text labels, but `isFraud` still holds
# 0/1 at this point in the notebook. Recode it on a plotting copy so the colours
# apply; values that are already labels pass through `replace` unchanged.
plot_df = sample_df.assign(
    isFraud=sample_df['isFraud'].replace({0: 'NO Fraud', 1: 'Fraud'})
)

fig = px.scatter(plot_df, x='oldbalanceOrg', y='amount',
                 color='isFraud', color_discrete_map={'Fraud': 'Yellow', 'NO Fraud':'Blue'},
                 size_max=100, title='Relationship between Old Balance Org and Amount',
                 symbol='step')

# Axis titles name the columns actually plotted (the y axis is `amount`).
fig.update_layout(xaxis_title='Old Balance Org', yaxis_title='Amount')

fig.show()


In [None]:
# Scatter of `amount` over `step`, coloured by `isFraud`, on a fixed 100,000-row sample.
sample_df = df.sample(n=100000, random_state=42)
axis_labels = {'amount': 'Transaction Amount', 'oldbalanceOrg': 'Old Balance Org'}
fig = px.scatter(
    sample_df,
    x='step',
    y='amount',
    color='isFraud',
    labels=axis_labels,
    title='Transaction Amount vs. Step vs. isFraud',
)

# Fixed figure size in pixels.
fig.update_layout(width=800, height=600)
fig.show()

# Box Plot

In [None]:
# Largest `oldbalanceOrg` value within a fixed 1,500-row sample.
sample_df = df.sample(n=1500, random_state=42)
old_balances = sample_df['oldbalanceOrg']
max_oldbalanceOrg = old_balances.max()
print("Max value in 'oldbalanceOrg':", max_oldbalanceOrg)


In [None]:
# Side-by-side box plots of the amount and balance columns on a fixed 1,500-row sample.
sample_df = df.sample(n=1500, random_state=42)
box_columns = ['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']
fig = px.box(sample_df, y=box_columns)
fig.show()

In [None]:
import pandas as pd

# Smallest `amount` among the rows of the current `sample_df` whose `isFraud` is 0.
# The mask is built from the sample itself rather than from the full `df`, so it
# has exactly the sample's rows and does not rely on index alignment.
filtered_df = sample_df.loc[sample_df['isFraud'] == 0].head(1500)
min_value = filtered_df['amount'].min()
min_value

In [None]:
import plotly.express as px

# Box plot of `amount` for each `isFraud` value on a fixed 1,500-row sample.
sample_df = df.sample(n=1500, random_state=42)
box_options = dict(x="isFraud", y="amount")
fig = px.box(sample_df, **box_options)
fig.update_layout(title_text="Box Plot of Amount by Fraud Status")
fig.show()

# Bubble Plot

In [None]:
# Bubble plot on a fixed 20,000-row sample: `oldbalanceDest` against
# `newbalanceDest`, marker size from `amount`, colour from `isFraud`.
sample_df = df.sample(n=20000, random_state=42)
# The title names the Dest columns that are actually on the axes.
bubble_fig = px.scatter(sample_df, x='oldbalanceDest', y='newbalanceDest', 
                        size='amount', color='isFraud',
                        title='Bubble Plot of Old Balance Dest vs New Balance Dest with Amount and Fraud')
bubble_fig.show()

# Area Plot

In [None]:
# Area plot of `amount` over `step`, one coloured area per `isFraud` value,
# drawn from a fixed 50,000-row sample.
sample_df = df.sample(n=50000, random_state=42)

area_options = dict(
    x='step',
    y='amount',
    title='Area Plot of Amount over Steps',
    color='isFraud',
)
fig_area = px.area(sample_df, **area_options)
fig_area.show()


In [None]:
import plotly.graph_objects as go

# Pooled histogram of the numeric values in the data.
# `df.values` is a 2-D array that also carries text columns such as `type`,
# which is not a usable 1-D `x` for a histogram. Keep only the numeric columns
# and flatten them; a fixed-size sample (as in the other plotting cells) keeps
# the figure small enough to render.
numeric_sample = df.select_dtypes(include='number').sample(n=50000, random_state=42)
fig = go.Figure(data=[go.Histogram(x=numeric_sample.to_numpy().ravel(), nbinsx=50)])

# Titles for the figure and both axes.
fig.update_layout(title='Histogram of Data', xaxis_title='Value', yaxis_title='Frequency')

# Figure size in pixels.
fig.update_layout(width=25*72, height=20*72)

fig.show()

# ⚑ Analysis and exploration of categories of the "type" feature

In [None]:
# Distinct labels present in the "type" column.
df['type'].unique()

In [None]:
# Number of rows per "type" label.
df['type'].value_counts()

In [None]:
# Line plot of how often each transaction type occurs in a fixed 50,000-row sample.
sample_df = df.sample(n=50000, random_state=42)

# value_counts() -> two-column frame with explicit column names for plotting.
counts_per_type = sample_df['type'].value_counts()
type_counts = counts_per_type.reset_index()
type_counts.columns = ['type', 'count']

line_options = dict(x='type', y='count', title='Line Plot of Type Counts')
fig = px.line(type_counts, **line_options)
fig.show()


# ⚑ Label Encoding

In [None]:
# Encode the transaction type labels as integer codes.
# The result is assigned back to the column: calling `replace(..., inplace=True)`
# on the `df['type']` selection is a chained in-place update, which pandas does
# not guarantee will reach `df` itself.
type_codes = {"CASH_OUT": 1, "PAYMENT": 2, "CASH_IN": 3, "TRANSFER": 4, "DEBIT": 5}
df['type'] = df['type'].replace(type_codes)

In [None]:
# Number of rows per encoded "type" value.
df['type'].value_counts()

In [None]:
# Swap the 0/1 values of `isFraud` for readable labels (used by the charts below).
fraud_labels = {0: "NO Fraud", 1: "Fraud"}
df["isFraud"] = df["isFraud"].replace(fraud_labels)

In [None]:
# Number of rows per `isFraud` label.
df['isFraud'].value_counts()

In [None]:
import plotly.express as px

# Pie chart of the `isFraud` label counts; the counts are computed once and reused.
fraud_label_counts = df['isFraud'].value_counts()
fig = px.pie(
    fraud_label_counts,
    names=fraud_label_counts.index,
    values=fraud_label_counts,
    title='Distribution of isFraud',
)
fig.show()

# ⚑ Analysis and exploration of categories of the "isFraud" feature

In [None]:
# Distinct values present in the `isFraud` column.
df['isFraud'].unique()

In [None]:
# Number of rows per `isFraud` value.
df['isFraud'].value_counts()

In [None]:
# Count rows per `isFraud` label ('NO Fraud' / 'Fraud').
fraud_counts = df['isFraud'].value_counts()

# A label that does not occur has a count of 0. Falling back to hard-coded
# totals would report numbers that are not in the data being analysed.
no_fraud_count = fraud_counts.get('NO Fraud', 0)
fraud_count = fraud_counts.get('Fraud', 0)

print(f"Number of 'NO Fraud': {no_fraud_count}")
print(f"Number of 'Fraud': {fraud_count}")


In [None]:
# Grouped bar chart comparing the two class counts computed in the previous cell
# (`no_fraud_count`, `fraud_count`).
categories = ['isFraud']
values1 = [no_fraud_count]
values2 = [fraud_count]

fig = go.Figure()
fig.add_trace(go.Bar(x=categories, y=values1, name='NO Fraud'))
fig.add_trace(go.Bar(x=categories, y=values2, name='Fraud'))
# Plotly text uses <br> for a line break; a "\n" in the title is not rendered as one.
fig.update_layout(title='Target Counts <br> (isn\'t Fraud = 0 || is Fraud = 1)', xaxis_title='Categories', yaxis_title='Values',
                  barmode='group')
fig.show()

# ⚑ Dropping unnecessary features

In [None]:
# Remove the two name columns from the frame in place.
name_columns = ['nameOrig', 'nameDest']
df.drop(columns=name_columns, inplace=True)

In [None]:
# Remaining column names after the drop.
df.columns

In [None]:
# Dimensions after the drop as (rows, columns).
df.shape

# ⚑ Correlation

In [None]:
# Turn the `isFraud` labels back into 0/1 for the correlation and modelling steps.
# The explicit integer cast keeps the column numeric without relying on
# `replace` to downcast an object column implicitly (deprecated in recent
# pandas), and fails loudly if any other value is present.
df["isFraud"] = df["isFraud"].replace({"NO Fraud": 0, "Fraud": 1}).astype(int)

In [None]:
# Preview the first rows after re-encoding `isFraud`.
df.head()

In [None]:
# Pairwise correlations of all columns, then the `isFraud` column of that
# matrix ordered from the largest value to the smallest.
correlation = df.corr()
fraud_correlation = correlation['isFraud']
correlation_values = fraud_correlation.sort_values(ascending=False)
correlation_values

In [None]:
# Interactive heatmap of the full correlation matrix.
corr_matrix = df.corr()
column_names = corr_matrix.columns

heatmap = go.Heatmap(z=corr_matrix.values, x=column_names, y=column_names)
layout = go.Layout(title='Correlation Matrix', width=600, height=400)

fig = go.Figure(data=[heatmap], layout=layout)
py.iplot(fig)

In [None]:
# Static, annotated heatmap of the correlation matrix with square cells.
corr_matrix = df.corr()
plt.figure(figsize=(9, 9))
heatmap_style = dict(annot=True, cmap='coolwarm', square=True)
sns.heatmap(corr_matrix, **heatmap_style)
plt.show()

# ⚑ Bar Chart for <b>correlation</b>

In [None]:
# Bar chart of each column's correlation with `isFraud`, taken from
# `correlation_values` (one bar per column, in that Series' order).
data = dict(
    Column=correlation_values.index,
    Correlation=correlation_values.values,
)
df_plot = pd.DataFrame(data)

bar_options = dict(x='Column', y='Correlation', title='Correlation with isFraud')
fig = px.bar(df_plot, **bar_options)
fig.show()

# ⚑ Splitting the data and target

In [None]:
# Feature matrix: the four predictor columns, in this order.
x = np.array(df[["type", "amount", "oldbalanceOrg", "newbalanceOrig"]])
# Target as a 1-D vector. Selecting with a single bracket avoids the (n, 1)
# column-vector shape, which scikit-learn estimators warn about and reshape.
y = np.array(df["isFraud"])

# ⚑ Data Partitioning / Train Test Split

In [None]:
# Hold out 30% of the rows for testing, with a fixed seed.
# The split is stratified on the target so both parts keep the same share of
# fraud rows; with a rare positive class an unstratified split can leave the
# test set with a different share. `np.ravel` lets this work whether `y` is 1-D
# or a single-column 2-D array.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42, shuffle=True, stratify=np.ravel(y)
)

# Shapes of the resulting splits
print('X_train shape is ' , X_train.shape)
print('X_test shape is ' , X_test.shape)
print('y_train shape is ' , y_train.shape)
print('y_test shape is ' , y_test.shape)

In [None]:
# Class balance of the held-out labels.
# `np.ravel` accepts both the original array and an already-converted Series
# (which has no `.flatten`), so the cell can be re-run without failing.
y_test = pd.Series(np.ravel(y_test))
value_counts = y_test.value_counts()
print(value_counts)

# ⚑ Data Scaling / Feature Scaling

In [None]:
# Standardise the features. The scaler's statistics come from the training
# split only and are then applied to both splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit(X_train).transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ⚑ Applying "Logistic Regression" Algorithm

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Logistic regression fitted on the standardised features from the scaling
# step. The raw columns have very different ranges, which the default solver
# handles poorly, and the scaled arrays are otherwise never used.
model_LR = LogisticRegression()

# np.ravel guarantees the 1-D target shape the estimator expects.
model_LR.fit(X_train_scaled, np.ravel(y_train))

# Predict on inputs scaled with the same fitted scaler as the training data.
y_pred_LR = model_LR.predict(X_test_scaled)

Train_Accuracy = accuracy_score(y_train, model_LR.predict(X_train_scaled))
Test_Accuracy = accuracy_score(y_test, y_pred_LR)

print(f'Training accuracy: {Train_Accuracy*100:.2f} %')
print(f'Testing accuracy: {Test_Accuracy*100:.2f} %')

# ⚑ The Evaluation

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Annotated heatmap of the confusion matrix for the logistic-regression test predictions.
cm_LR = confusion_matrix(y_test, y_pred_LR)

plt.figure(figsize=(6, 6))
ax = sns.heatmap(cm_LR, annot=True, cmap="Reds", fmt="d")
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted labels")
ax.set_ylabel("True labels")

plt.show()


In [None]:
# Per-class precision / recall / F1 for the logistic-regression test predictions.
# Keep the report text in the variable; `print` returns None, so assigning its
# result would store nothing.
classification_report_LR = classification_report(y_test, y_pred_LR)
print(classification_report_LR)

# ⚑ Applying "Decision Tree" Algorithm

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Decision tree with default settings, fitted on the raw training features.
Model_DT = DecisionTreeClassifier().fit(X_train, y_train)

# Predictions for both splits; the test predictions are reused by the evaluation cells.
train_pred_DT = Model_DT.predict(X_train)
y_pred_DT = Model_DT.predict(X_test)

Train_Accuracy = accuracy_score(y_train, train_pred_DT)
Test_Accuracy = accuracy_score(y_test, y_pred_DT)

for split_name, score in (('Training', Train_Accuracy), ('Testing', Test_Accuracy)):
    print(f'{split_name} accuracy: {score*100:.2f} %')

# ⚑ The Evaluation

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Annotated heatmap of the confusion matrix for the decision-tree test predictions.
cm_DT = confusion_matrix(y_test, y_pred_DT)

plt.figure(figsize=(6, 6))
ax = sns.heatmap(cm_DT, annot=True, cmap="Reds", fmt="d")
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted labels")
ax.set_ylabel("True labels")

plt.show()


In [None]:
# Per-class precision / recall / F1 for the decision-tree test predictions.
# Keep the report text in the variable; `print` returns None, so assigning its
# result would store nothing.
classification_report_DT = classification_report(y_test, y_pred_DT)
print(classification_report_DT)

# ⚑ Applying "Naive Bayes" Algorithm

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Gaussian Naive Bayes with default settings, fitted on the raw training features.
Model_NB = GaussianNB()

# Train the model
Model_NB.fit(X_train, y_train)

# Make predictions
y_pred_NB = Model_NB.predict(X_test)

# Calculate accuracies. The test score is computed from this model's own
# predictions (`y_pred_NB`); no plain `y_pred` is created by this notebook.
Train_Accuracy = accuracy_score(y_train, Model_NB.predict(X_train))
Test_Accuracy = accuracy_score(y_test, y_pred_NB)

# Print accuracies
print(f'Training accuracy: {Train_Accuracy*100:.2f} %')
print(f'Testing accuracy: {Test_Accuracy*100:.2f} %')

# ⚑ The Evaluation

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Annotated heatmap of the confusion matrix for the Naive Bayes test predictions.
cm_NB = confusion_matrix(y_test, y_pred_NB)

plt.figure(figsize=(6, 6))
ax = sns.heatmap(cm_NB, annot=True, cmap="Reds", fmt="d")
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted labels")
ax.set_ylabel("True labels")

plt.show()


In [None]:
# Per-class precision / recall / F1 for the Naive Bayes test predictions.
# Keep the report text in the variable; `print` returns None, so assigning its
# result would store nothing.
classification_report_NB = classification_report(y_test, y_pred_NB)
print(classification_report_NB)

# ⚑ Applying "RandomForestClassifier" Algorithm

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Random forest with default settings, fitted on the raw training features.
rfc = RandomForestClassifier().fit(X_train, y_train)

# Predictions for both splits; the test predictions are reused by the evaluation cells.
train_pred_rfc = rfc.predict(X_train)
y_pred_rfc = rfc.predict(X_test)

Train_Accuracy_rfc = accuracy_score(y_train, train_pred_rfc)
Test_Accuracy_rfc = accuracy_score(y_test, y_pred_rfc)

for split_name, score in (('Training', Train_Accuracy_rfc), ('Testing', Test_Accuracy_rfc)):
    print(f'{split_name} accuracy: {score*100:.2f} %')

# ⚑ The Evaluation

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Annotated heatmap of the confusion matrix for the random-forest test predictions.
cm_rfc = confusion_matrix(y_test, y_pred_rfc)

plt.figure(figsize=(6, 6))
ax = sns.heatmap(cm_rfc, annot=True, cmap="Reds", fmt="d")
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted labels")
ax.set_ylabel("True labels")

plt.show()


In [None]:
# Per-class precision / recall / F1 for the random-forest test predictions.
# Stored under its own name (the decision tree's `classification_report_DT` is
# left alone) and as the report text rather than the None returned by `print`.
classification_report_rfc = classification_report(y_test, y_pred_rfc)
print(classification_report_rfc)

# ⚑ Applying "KNN" Algorithm

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# k-nearest neighbours with default settings, fitted on the standardised
# features from the scaling step. Neighbours are found by distance, so on the
# raw columns the features with the largest ranges would dominate the result.
knn = KNeighborsClassifier()

# np.ravel guarantees the 1-D target shape the estimator expects.
knn.fit(X_train_scaled, np.ravel(y_train))

# Predict on inputs scaled with the same fitted scaler as the training data.
y_pred_knn = knn.predict(X_test_scaled)

Train_Accuracy_knn = accuracy_score(y_train, knn.predict(X_train_scaled))
Test_Accuracy_knn = accuracy_score(y_test, y_pred_knn)

print(f'Training accuracy: {Train_Accuracy_knn*100:.2f} %')
print(f'Testing accuracy: {Test_Accuracy_knn*100:.2f} %')

# ⚑ The Evaluation

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Annotated heatmap of the confusion matrix for the KNN test predictions.
cm_knn = confusion_matrix(y_test, y_pred_knn)

plt.figure(figsize=(6, 6))
ax = sns.heatmap(cm_knn, annot=True, cmap="Reds", fmt="d")
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted labels")
ax.set_ylabel("True labels")

plt.show()


In [None]:
# Per-class precision / recall / F1 for the KNN test predictions.
# Stored under its own name (the decision tree's `classification_report_DT` is
# left alone) and as the report text rather than the None returned by `print`.
classification_report_knn = classification_report(y_test, y_pred_knn)
print(classification_report_knn)

# ⚑ Applying "XGB" Algorithm

In [None]:
import xgboost as xgb
from sklearn.metrics import accuracy_score

# XGBoost classifier with default settings, fitted on the raw training features.
xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_train, y_train)

# Predictions for both splits; the test predictions are reused by the evaluation cells.
train_pred_xgb = xgb_model.predict(X_train)
y_pred_xgb = xgb_model.predict(X_test)

Train_Accuracy_xgb = accuracy_score(y_train, train_pred_xgb)
Test_Accuracy_xgb = accuracy_score(y_test, y_pred_xgb)

for split_name, score in (('Training', Train_Accuracy_xgb), ('Testing', Test_Accuracy_xgb)):
    print(f'{split_name} accuracy: {score*100:.2f} %')

# ⚑ The Evaluation

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Annotated heatmap of the confusion matrix for the XGBoost test predictions.
cm_xgb = confusion_matrix(y_test, y_pred_xgb)

plt.figure(figsize=(6, 6))
ax = sns.heatmap(cm_xgb, annot=True, cmap="Reds", fmt="d")
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted labels")
ax.set_ylabel("True labels")

plt.show()


In [None]:
# Per-class precision / recall / F1 for the XGBoost test predictions.
# Stored under its own name (the decision tree's `classification_report_DT` is
# left alone) and as the report text rather than the None returned by `print`.
classification_report_xgb = classification_report(y_test, y_pred_xgb)
print(classification_report_xgb)

# Pick the best model

In [None]:
# Compare the models on held-out accuracy (percent).
# Each score is computed from that model's own test predictions. The shared
# `Train_Accuracy` variable is overwritten by each of the first three model
# cells, so reusing it would give those rows the same number. Training accuracy
# would also favour models that simply memorise the training split.
test_predictions = {
    'Logistic Regression': y_pred_LR,
    'Decision Tree': y_pred_DT,
    'Naive Bayes': y_pred_NB,
    'RandomForestClassifier': y_pred_rfc,
    'KNN': y_pred_knn,
    'XGB': y_pred_xgb,
}

evaluation = pd.DataFrame({
    'Model': list(test_predictions),
    'Accuracy': [accuracy_score(y_test, preds) * 100 for preds in test_predictions.values()],
})

evaluation

In [None]:
import plotly.express as px

# Bar chart of the `Accuracy` column of `evaluation`, one bar per model.
bar_options = dict(
    x='Model',
    y='Accuracy',
    title='Models',
    labels={'Accuracy': 'Accuracy', 'Model': 'Models'},
    color_discrete_sequence=px.colors.sequential.Viridis,
)
fig = px.bar(evaluation, **bar_options)
fig.show()


# Prediction for new data

In [None]:
# Classify one unseen transaction. The values follow the column order of the
# training matrix: type code, amount, oldbalanceOrg, newbalanceOrig.
# The decision tree is used because it is fitted on these raw (unscaled)
# features; no object named `model` is created anywhere in the visible notebook.
new_data = [[2,9839.64,170136.0,160296.36]]
if Model_DT.predict(new_data)[0] == 0:
    print('not Fraud')
else:
    print('is Fraud')

# Cluster

In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Elbow method: fit K-Means for each candidate K and record the inertia
# (within-cluster sum of squared distances).
K_values = range(1, 4)
sse = []

for K in K_values:
    # A fixed seed makes the SSE curve reproducible between runs. The
    # standardised features are used because K-Means is distance-based and the
    # raw columns have very different ranges.
    kmeans = KMeans(n_clusters=K, random_state=42)
    kmeans.fit(X_train_scaled)
    sse.append(kmeans.inertia_)

plt.plot(K_values, sse)
plt.xlabel('Number of clusters (K)')
plt.ylabel('Sum of Squared Errors (SSE)')
plt.title('Elbow Method')
plt.show()

In [None]:
# Scatter of the frame's fourth column against its fifth (both selected by
# position), with points coloured by `isFraud`.
x_values = df.iloc[:, 3]
y_values = df.iloc[:, 4]
fraud_flags = df["isFraud"].array

plt.figure(figsize=(8, 6))
sns.scatterplot(x=x_values, y=y_values, hue=fraud_flags)
plt.title('Scatter Plot of Clusters')
plt.show()

In [None]:
# Pair plot of all remaining columns. A pair plot draws one panel per pair of
# columns, so it is built from a fixed-size random sample (as the other
# plotting cells do) rather than from every row of the frame.
pairplot_sample = df.sample(n=min(len(df), 5000), random_state=42)
sns.pairplot(pairplot_sample)
plt.show()