In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [2]:
train_data = pd.read_csv('/home/hariom/Downloads/CreditCardDataset/fraudTrain.csv')

In [3]:
train_data.shape

(1296675, 23)

In [4]:
train_data.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [5]:
train_data.isnull().sum()

Unnamed: 0               0
trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
street                   0
city                     0
state                    0
zip                      0
lat                      0
long                     0
city_pop                 0
job                      0
dob                      0
trans_num                0
unix_time                0
merch_lat                0
merch_long               0
is_fraud                 0
dtype: int64

In [6]:
train_data['is_fraud'].value_counts()

is_fraud
0    1289169
1       7506
Name: count, dtype: int64

In [7]:
legit = train_data[train_data.is_fraud==0]
fraud = train_data[train_data.is_fraud==1]
print(legit.shape)
print(fraud.shape)

(1289169, 23)
(7506, 23)


In [8]:
legit_sample = legit.sample(n=8000)
print(legit_sample.shape)

(8000, 23)


In [9]:
new_dataset = pd.concat([legit_sample,fraud],axis=0)
new_dataset.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
1169021,1169021,2020-05-03 09:31:04,180084576295055,"fraud_Greenholt, Jacobi and Gleason",gas_transport,58.7,David,Kirby,M,133 Alyssa Plains,...,28.7724,-96.4793,911,Archaeologist,1970-01-18,bb986dd72fbbab1bcbd7c245bb890b47,1367573464,29.078686,-97.140056,0
531087,531087,2019-08-17 00:01:03,4488941175228467,fraud_Stamm-Rodriguez,misc_pos,54.36,Jacqueline,Washington,F,138 Stephen Ridges Apt. 978,...,28.9254,-81.6801,11432,Private music teacher,1986-03-31,77f9a9a9954c2eed519610da889733b0,1345161663,28.407506,-81.077576,0
405201,405201,2019-07-02 19:40:06,344342339068828,fraud_Jakubowski Inc,home,47.21,Ruth,Fuller,F,37732 Joe Courts Apt. 752,...,31.3826,-81.4312,5989,Tax adviser,1967-05-05,806a9a2b04815b3a53a583d201de264e,1341258006,32.091268,-81.418997,0
1082091,1082091,2020-03-25 02:59:02,3565196229855512,fraud_Reichel Inc,entertainment,4.86,Adriana,Harvey,F,715 Joy Prairie,...,40.4815,-92.9951,3805,"Investment banker, corporate",1950-09-15,1dc8f60f88b3f5015d90e0057402eb29,1364180342,40.503919,-92.585961,0
798223,798223,2019-12-03 17:26:24,30551643947183,"fraud_Towne, Greenholt and Koepp",shopping_net,3.74,Morgan,Smith,F,1441 Bradley Place,...,35.1836,-81.4552,5621,Toxicologist,1973-11-14,7b0c516377e32d4be8f61ad890106c8b,1354555584,34.261597,-81.23974,0


In [10]:
new_dataset.shape

(15506, 23)

In [11]:
X = new_dataset.drop(columns=['Unnamed: 0','cc_num','trans_num','dob','street','gender','job','is_fraud','merchant','category','first','last','city','state','trans_date_trans_time'] ,axis=1)
Y = new_dataset['is_fraud']
print("Shape of X after dropping columns:", X.shape)

Shape of X after dropping columns: (15506, 8)


In [12]:
print("Shape of X before vectorization:", X.shape)
print("Shape of Y:", Y.shape)

Shape of X before vectorization: (15506, 8)
Shape of Y: (15506,)


In [13]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [14]:
print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

(12404, 8) (3102, 8) (12404,) (3102,)


In [15]:
model = LogisticRegression()
model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [16]:
Y_train_pred = model.predict(X_train)
Y_test_pred = model.predict(X_test)

In [17]:
train_accuracy = accuracy_score(Y_train, Y_train_pred)
test_accuracy = accuracy_score(Y_test, Y_test_pred)

In [18]:
print(f'Training Accuracy: {train_accuracy:.2f}')
print(f'Test Accuracy: {test_accuracy:.2f}')

Training Accuracy: 0.86
Test Accuracy: 0.86


In [19]:
def predict_fraud(user_input):
    
    input_df = pd.DataFrame([user_input])
    
   
    input_df = pd.get_dummies(input_df, drop_first=True)
    
    
    input_df = input_df.reindex(columns=X.columns, fill_value=0)
    
    
    prediction = model.predict(input_df)
    
   
    return "Fraud" if prediction[0] == 1 else "Legit"


def get_user_input():
    
    user_input = {}
    
    for feature in X.columns:
        value = input(f"Enter value for {feature}: ")
        
        
        if value.replace('.', '', 1).isdigit():
            value = float(value)  
        user_input[feature] = value
    
    return user_input

In [20]:
user_input = get_user_input()
result = predict_fraud(user_input)
print(f"The transaction is: {result}")

The transaction is: Fraud
