<a href="https://colab.research.google.com/github/Lauri-Litovuo/AI-Machine-Learning-Training/blob/main/Medium02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# import required libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from collections import Counter


#installing plotly
!pip install plotly
import plotly.graph_objects as go
import plotly.express as px

from imblearn.over_sampling import SMOTE



In [None]:
# Read data
file = 'drive/My Drive/Colab/Transactions.csv'
df = pd.read_csv(file)

print(df.head())

   Unnamed: 0.1  Unnamed: 0  TRANSACTION_ID          TX_DATETIME  CUSTOMER_ID  \
0             0           0               0  2023-01-01 00:00:31          596   
1             1           1               1  2023-01-01 00:02:10         4961   
2             2           2               2  2023-01-01 00:07:56            2   
3             3           3               3  2023-01-01 00:09:29         4128   
4             4           4               4  2023-01-01 00:10:34          927   

   TERMINAL_ID  TX_AMOUNT  TX_TIME_SECONDS  TX_TIME_DAYS  \
0         3156     533.07               31             0   
1         3412     808.56              130             0   
2         1365    1442.94              476             0   
3         8737     620.65              569             0   
4         9906     490.66              634             0   

                 TX_FRAUD_SCENARIO  
0           Legitimate Transaction  
1           Legitimate Transaction  
2  Fraudulent Transaction Internet  
3   

In [None]:
# Data exploration
column_names = list(df.columns)
print(column_names)
dimensions = df.shape
print("shape", dimensions)
num_rows = df.shape[0]
print(num_rows)
print(df.info())
fraud_scenarios = df['TX_FRAUD_SCENARIO'].unique()
print(fraud_scenarios)

['Unnamed: 0.1', 'Unnamed: 0', 'TRANSACTION_ID', 'TX_DATETIME', 'CUSTOMER_ID', 'TERMINAL_ID', 'TX_AMOUNT', 'TX_TIME_SECONDS', 'TX_TIME_DAYS', 'TX_FRAUD_SCENARIO']
shape (1754155, 10)
1754155
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1754155 entries, 0 to 1754154
Data columns (total 10 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Unnamed: 0.1       int64  
 1   Unnamed: 0         int64  
 2   TRANSACTION_ID     int64  
 3   TX_DATETIME        object 
 4   CUSTOMER_ID        int64  
 5   TERMINAL_ID        int64  
 6   TX_AMOUNT          float64
 7   TX_TIME_SECONDS    int64  
 8   TX_TIME_DAYS       int64  
 9   TX_FRAUD_SCENARIO  object 
dtypes: float64(1), int64(7), object(2)
memory usage: 133.8+ MB
None
['Legitimate Transaction' 'Fraudulent Transaction Internet'
 'Fraudulent Transaction retailer' 'Fraudulent Transaction DAB']


In [None]:
# Formatting the counts
def format_large_number(number):
    if number >= 1_000_000:
        return f"{number / 1_000_000:.6f}M"
    elif number >= 10_000:
        return f"{number / 1_000:.3f}k"
    else:
        return str(number)

def plot_fraud_scenario_counts(data_frame):
    # Getting the values and sorting them alphabetically
    fraud_counts = data_frame['TX_FRAUD_SCENARIO'].value_counts()
    fraud_counts = fraud_counts.sort_index()
    x_values = fraud_counts.index.tolist()
    y_values = fraud_counts.values.tolist()


    # Creating the bar plot with different colors for each bar
    colors = ['#636EFA', '#EF553B', '#00CC96', '#AB63FA']
    bars = []
    for i in range(len(x_values)):
        formatted_count = format_large_number(y_values[i])
        bars.append(go.Bar(
          x=[x_values[i]],
          y=[y_values[i]],
          name=x_values[i],
          marker_color=colors[i % len(colors)],
          hoverinfo='skip',
          hovertemplate=f"Fraud Scenario= {x_values[i]}<br>Number of Transactions= {formatted_count}<extra></extra>",
          ))

    # Making the barplot
    fig = go.Figure(data=bars)
    fig.update_layout(
                  xaxis_title='Fraud Scenario',
                  yaxis_title='Number of Transactions',
                  showlegend=True)
    fig.show()

# Calling the function to plot fraud scenario counts
plot_fraud_scenario_counts(df)


In [None]:
def categorize_transaction(scenario):
  if scenario in ['Fraudulent Transaction Internet', 'Fraudulent Transaction retailer', 'Fraudulent Transaction DAB']:
    return 'Fraudulent Transaction(Internet + DAB + Retailer)'
  elif scenario == 'Legitimate Transaction':
    return 'Legitimate Transaction'
  else:
    return scenario

def plot_data_imbalance(data_frame, target_column):
  data_frame['Transaction_Category'] = data_frame[target_column].apply(categorize_transaction)
  #print(data_frame['Transaction_Category'])
  category_counts = data_frame['Transaction_Category'].value_counts().reset_index()
  category_counts.columns = ['Transaction_Category', 'Count']
  fig = px.pie(category_counts,
                 names='Transaction_Category',
                 values='Count')
  fig.show()

plot_data_imbalance(df, 'TX_FRAUD_SCENARIO')

In [None]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1754155 entries, 0 to 1754154
Data columns (total 11 columns):
 #   Column                Dtype  
---  ------                -----  
 0   Unnamed: 0.1          int64  
 1   Unnamed: 0            int64  
 2   TRANSACTION_ID        int64  
 3   TX_DATETIME           object 
 4   CUSTOMER_ID           int64  
 5   TERMINAL_ID           int64  
 6   TX_AMOUNT             float64
 7   TX_TIME_SECONDS       int64  
 8   TX_TIME_DAYS          int64  
 9   TX_FRAUD_SCENARIO     object 
 10  Transaction_Category  object 
dtypes: float64(1), int64(7), object(3)
memory usage: 147.2+ MB
None


In [None]:
def change_scenario_to_numeric(scenario):
  if scenario == 'Legitimate Transaction':
    return 0
  else:
    return 1

def group_transactions_by_month(data_frame):
  data_frame['TX_DATETIME_COPY'] = data_frame['TX_DATETIME']
  data_frame['TX_DATETIME_COPY'] = pd.to_datetime(data_frame['TX_DATETIME_COPY'])
  data_frame['Month'] = data_frame['TX_DATETIME_COPY'].dt.month
  monthly_fraud_counts = data_frame.groupby(['Month', 'Transaction_Type']).size().reset_index(name='Count')
  #print(data_frame.head())
  #print(monthly_fraud_counts)
  return monthly_fraud_counts

def visualize_monthly_transactions(df):
  df['Transaction_Type'] = df['Transaction_Category'].apply(change_scenario_to_numeric)
  monthly_counts = group_transactions_by_month(df)

  fig = px.bar(
    monthly_counts,
    x='Month',
    y='Count',
    color='Transaction_Type',
    barmode='stack',
    title="Distribution of Transactions and Fraud Scenario per Month",
    labels={"Month": "Month", "Count": "Number of Transactions"},
    hover_name='Transaction_Type',
    hover_data={'Count': True}
    )
    # Show the plot
  fig.show()


# Call the function to visualize monthly transactions and fraud scenarios
visualize_monthly_transactions(df)

In [None]:
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,TRANSACTION_ID,TX_DATETIME,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,TX_TIME_SECONDS,TX_TIME_DAYS,TX_FRAUD_SCENARIO,Transaction_Category,Transaction_Type,TX_DATETIME_COPY,Month
0,0,0,0,2023-01-01 00:00:31,596,3156,533.07,31,0,Legitimate Transaction,Legitimate Transaction,0,2023-01-01 00:00:31,1
1,1,1,1,2023-01-01 00:02:10,4961,3412,808.56,130,0,Legitimate Transaction,Legitimate Transaction,0,2023-01-01 00:02:10,1
2,2,2,2,2023-01-01 00:07:56,2,1365,1442.94,476,0,Fraudulent Transaction Internet,Fraudulent Transaction(Internet + DAB + Retailer),1,2023-01-01 00:07:56,1
3,3,3,3,2023-01-01 00:09:29,4128,8737,620.65,569,0,Legitimate Transaction,Legitimate Transaction,0,2023-01-01 00:09:29,1
4,4,4,4,2023-01-01 00:10:34,927,9906,490.66,634,0,Legitimate Transaction,Legitimate Transaction,0,2023-01-01 00:10:34,1


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1754155 entries, 0 to 1754154
Data columns (total 14 columns):
 #   Column                Dtype         
---  ------                -----         
 0   Unnamed: 0.1          int64         
 1   Unnamed: 0            int64         
 2   TRANSACTION_ID        int64         
 3   TX_DATETIME           object        
 4   CUSTOMER_ID           int64         
 5   TERMINAL_ID           int64         
 6   TX_AMOUNT             float64       
 7   TX_TIME_SECONDS       int64         
 8   TX_TIME_DAYS          int64         
 9   TX_FRAUD_SCENARIO     object        
 10  Transaction_Category  object        
 11  Transaction_Type      int64         
 12  TX_DATETIME_COPY      datetime64[ns]
 13  Month                 int32         
dtypes: datetime64[ns](1), float64(1), int32(1), int64(8), object(3)
memory usage: 180.7+ MB


In [None]:
numeric_df = df.select_dtypes(include=['number'])
numeric_df.head()
#print(numeric_df.info())

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,TRANSACTION_ID,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,TX_TIME_SECONDS,TX_TIME_DAYS,Transaction_Type,Month
0,0,0,0,596,3156,533.07,31,0,0,1
1,1,1,1,4961,3412,808.56,130,0,0,1
2,2,2,2,2,1365,1442.94,476,0,1,1
3,3,3,3,4128,8737,620.65,569,0,0,1
4,4,4,4,927,9906,490.66,634,0,0,1


In [None]:
def balance_data_with_smote(df, target_column):
  # Getting the X and y
  feature_columns = [col for col in df.columns if col not in [target_column, 'TX_FRAUD_SCENARIO', 'Transaction_Category']]
  X=df[feature_columns]
  y=df[target_column]

  # Applying SMOTE
  smote = SMOTE(random_state=42)
  X_resampled, y_resampled = smote.fit_resample(X, y)

  # Making the balanced dataset and returning it
  balanced_df = pd.DataFrame(X_resampled, columns=X.columns)
  balanced_df[target_column] = y_resampled
  return balanced_df


# Balance the dataset using SMOTE based on the 'TX_FRAUD_SCENARIO' column
balanced_df = balance_data_with_smote(numeric_df, 'Transaction_Type')
balanced_df.describe()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,TRANSACTION_ID,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,TX_TIME_SECONDS,TX_TIME_DAYS,Month,Transaction_Type
count,3036372.0,3036372.0,3036372.0,3036372.0,3036372.0,3036372.0,3036372.0,3036372.0,3036372.0,3036372.0
mean,878610.5,878610.5,878610.5,2511.075,4987.835,919.2404,7917061.0,91.13206,3.550648,0.5
std,506081.8,506081.8,506081.8,1417.973,2861.126,1958.037,4562477.0,52.80579,1.738114,0.5
min,0.0,0.0,0.0,0.0,0.0,0.0,31.0,0.0,1.0,0.0
25%,440303.0,440303.0,440303.0,1276.0,2503.0,347.34,3952942.0,45.0,2.0,0.0
50%,878837.5,878837.5,878837.5,2515.0,4980.0,926.15,7911678.0,91.0,4.0,0.5
75%,1317011.0,1317011.0,1317011.0,3746.0,7480.0,1225.837,11873850.0,137.0,5.0,1.0
max,1754154.0,1754154.0,1754154.0,4999.0,9999.0,647837.5,15811200.0,182.0,7.0,1.0


In [None]:
def plot_balanced_data_imbalance(data_frame, target_column):
  category_counts = data_frame[target_column].value_counts().reset_index()
  category_counts.columns = [target_column, 'Count']
  category_counts[target_column] = category_counts[target_column].map({1: 'Fraudulent Transaction', 0: 'Legitimate Transaction'})
  fig = px.pie(category_counts,
                 names='target_column',
                 values='Count')

  fig.show()
# Call the function to display the class imbalance in the 'TX_FRAUD_SCENARIO' column
plot_data_imbalance(balanced_df, 'Transaction_Type')

In [None]:
# It's up to you to continue
# Remember to look in the module subject for more information on what to do next and on the benefits of this module, the following exercises are no longer guided.

In [None]:
balanced_df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,TRANSACTION_ID,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,TX_TIME_SECONDS,TX_TIME_DAYS,Month,Transaction_Type,Transaction_Category
0,0,0,0,596,3156,533.07,31,0,1,0,0
1,1,1,1,4961,3412,808.56,130,0,1,0,0
2,2,2,2,2,1365,1442.94,476,0,1,1,1
3,3,3,3,4128,8737,620.65,569,0,1,0,0
4,4,4,4,927,9906,490.66,634,0,1,0,0


In [209]:
for col in balanced_df.columns:
  if col == 'Transaction_Type':
    balanced_df.drop('Transaction_Type', axis=1, inplace=True)
df_final = balanced_df.drop(['Unnamed: 0', 'Unnamed: 0.1', 'TRANSACTION_ID'], axis=1)
df_final.describe()

Unnamed: 0,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,TX_TIME_SECONDS,TX_TIME_DAYS,Month,Transaction_Category
count,3036372.0,3036372.0,3036372.0,3036372.0,3036372.0,3036372.0,3036372.0
mean,2511.075,4987.835,919.2404,7917061.0,91.13206,3.550648,0.5
std,1417.973,2861.126,1958.037,4562477.0,52.80579,1.738114,0.5
min,0.0,0.0,0.0,31.0,0.0,1.0,0.0
25%,1276.0,2503.0,347.34,3952942.0,45.0,2.0,0.0
50%,2515.0,4980.0,926.15,7911678.0,91.0,4.0,0.5
75%,3746.0,7480.0,1225.837,11873850.0,137.0,5.0,1.0
max,4999.0,9999.0,647837.5,15811200.0,182.0,7.0,1.0


In [217]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
import xgboost as xgb

X = df_final.loc[:, df_final.columns != 'Transaction_Category']
y = df_final['Transaction_Category']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=50)

#model = LogisticRegression()
#model.fit(X_train, y_train)
#prediction_lr = model.predict(X_test)
#accuracy = metrics.accuracy_score(prediction_lr, y_test)
#print("Accuracy:", accuracy)

#Decision tree classifier
#clf = DecisionTreeClassifier()
#clf.fit(X_train, y_train)
#prediction = clf.predict(X_test)
#accuracy = metrics.accuracy_score(prediction, y_test)
#print("Accuracy:", accuracy)

xgb_clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
xgb_clf.fit(X_train, y_train)
prediction = xgb_clf.predict(X_test)
accuracy = metrics.accuracy_score(y_test, prediction)
print("Accuracy:", accuracy)

print("Decision tree acc: Accuracy: 0.9661449184992623")
print("Decision tree acc: Accuracy: 0.9662492281091762")
print("Decision tree acc: Accuracy: 0.9664501255609074")
print("Xgboost tree acc: Accuracy: 0.979867440615866")
print("Xgboost tree acc: Accuracy: 0.9795194266844657")


Parameters: { "use_label_encoder" } are not used.




Accuracy: 0.9795194266844657
Decision tree acc: Accuracy: 0.9661449184992623
Decision tree acc: Accuracy: 0.9662492281091762
Decision tree acc: Accuracy: 0.9664501255609074
Xgboost tree acc: Accuracy: 0.979867440615866
