In [103]:
import numpy as np

In [104]:
import pandas as pd

In [105]:
import joblib

In [106]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix

In [107]:
# Load the raw transaction records; the path is resolved relative to the working directory.
df = pd.read_csv(filepath_or_buffer="transactions.csv")

In [None]:
from ydata_profiling import ProfileReport

# Build an exploratory profile of the raw frame and write it out as a
# standalone HTML page. To view it inline in the notebook instead, call
# profile.to_notebook_iframe().
profile = ProfileReport(
    df,
    explorative=True,
    title="Transaction Categorization Report",
)
profile.to_file("transaction_categorization_report.html")


In [108]:
# Row and column counts of the freshly loaded frame.
print(df.shape)

(400000, 11)


In [109]:
# Preview the first ten raw rows.
# NOTE(review): the rendered preview exposes the name and user_id columns —
# confirm the data is synthetic before sharing the notebook with outputs.
df.head(10)

Unnamed: 0,transaction_id,user_id,name,transaction_date,category,amount,payment_method,location,age,sex,transaction_details
0,1,U0229,Ashok Basnet,2024-12-17 08:13:14,Groceries,1411.4,eSewa,Janakpur,27,Male,User Ashok Basnet made a Groceries transaction...
1,2,U0210,Sita Lama,2025-02-27 22:22:23,Mobile Recharge,4466.29,Cash,Pokhara,56,Female,User Sita Lama made a Mobile Recharge transact...
2,3,U0865,Rina Shrestha,2024-12-21 19:49:16,Groceries,513.79,eSewa,Janakpur,24,Female,User Rina Shrestha made a Groceries transactio...
3,4,U0055,Rina Thapa,2025-05-15 20:40:05,Healthcare,3594.3,Debit Card,Janakpur,41,Female,User Rina Thapa made a Healthcare transaction ...
4,5,U0860,Manisha Shrestha,2025-05-07 19:02:59,Utilities,2966.87,Debit Card,Butwal,57,Female,User Manisha Shrestha made a Utilities transac...
5,6,U0014,Sita Basnet,2024-12-24 23:44:18,Travel,840.31,Khalti,Janakpur,42,Female,User Sita Basnet made a Travel transaction of ...
6,7,U0570,Sandhya Adhikari,2025-01-13 20:12:54,Utilities,4788.2,Khalti,Nepalgunj,59,Female,User Sandhya Adhikari made a Utilities transac...
7,8,U0190,Kabita Thapa,2025-02-22 04:12:21,Dining,1826.95,Khalti,Birgunj,22,Male,User Kabita Thapa made a Dining transaction of...
8,9,U0542,Bimal Rai,2024-12-06 11:39:01,Clothing,3662.17,IME Pay,Dharan,53,Female,User Bimal Rai made a Clothing transaction of ...
9,10,U0776,Puja Gurung,2025-02-24 22:11:29,Groceries,1501.23,Bank Transfer,Bharatpur,33,Female,User Puja Gurung made a Groceries transaction ...


In [110]:
pip install scikit-learn


Note: you may need to restart the kernel to use updated packages.


In [111]:
# Remove the identifier and free-text columns; none of them is used as a model
# input. errors="ignore" keeps this cell re-runnable: a second execution no
# longer raises KeyError because the columns are already gone. The result is
# rebound to `df` instead of mutating the frame in place.
df = df.drop(
    columns=['name', 'transaction_id', 'transaction_details', 'user_id'],
    errors="ignore",
)


In [112]:
# Check that the identifier columns are gone.
df.head()

Unnamed: 0,transaction_date,category,amount,payment_method,location,age,sex
0,2024-12-17 08:13:14,Groceries,1411.4,eSewa,Janakpur,27,Male
1,2025-02-27 22:22:23,Mobile Recharge,4466.29,Cash,Pokhara,56,Female
2,2024-12-21 19:49:16,Groceries,513.79,eSewa,Janakpur,24,Female
3,2025-05-15 20:40:05,Healthcare,3594.3,Debit Card,Janakpur,41,Female
4,2025-05-07 19:02:59,Utilities,2966.87,Debit Card,Butwal,57,Female


In [113]:
# Number of distinct values in each categorical column.
df[['category', 'payment_method', 'location']].nunique()

category          12
payment_method     7
location          12
dtype: int64

In [114]:
# Distinct labels of every categorical column, in order of first appearance.
{
    column: df[column].drop_duplicates().tolist()
    for column in ('category', 'payment_method', 'location')
}


{'category': ['Groceries',
  'Mobile Recharge',
  'Healthcare',
  'Utilities',
  'Travel',
  'Dining',
  'Clothing',
  'Education',
  'Entertainment',
  'Transportation',
  'Rent',
  'Others'],
 'payment_method': ['eSewa',
  'Cash',
  'Debit Card',
  'Khalti',
  'IME Pay',
  'Bank Transfer',
  'Credit Card'],
 'location': ['Janakpur',
  'Pokhara',
  'Butwal',
  'Nepalgunj',
  'Birgunj',
  'Dharan',
  'Bharatpur',
  'Itahari',
  'Hetauda',
  'Kathmandu',
  'Lalitpur',
  'Biratnagar']}

In [115]:
# Parse the timestamp strings into datetimes.
df['transaction_date'] = pd.to_datetime(df['transaction_date'])

# Expose each calendar component of the timestamp as its own column
# (weekday is 0 for Monday through 6 for Sunday).
for part in ('year', 'month', 'day', 'hour', 'weekday'):
    df[part] = getattr(df['transaction_date'].dt, part)

In [116]:
# Model inputs: demographic, payment, location and calendar columns.
X = df.loc[:, ['age', 'amount', 'payment_method', 'location', 'year', 'month', 'day', 'hour', 'weekday', 'sex']]
# Prediction target: the category label of each transaction.
y = df.loc[:, 'category']

In [117]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the category
# proportions equal in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
    stratify=y,
)

In [118]:
# Column groups for preprocessing: numeric columns are standardized,
# categorical columns are one-hot encoded.
numeric_features = ['age', 'amount', 'year', 'month', 'day', 'hour', 'weekday']
categorical_features = ['payment_method', 'location', 'sex']

preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numeric_features),
    # handle_unknown='ignore': a category not seen during fit encodes as all zeros
    ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), categorical_features),
])

# Full model: preprocessing followed by a logistic-regression classifier.
pipeline = Pipeline(steps=[
    ('prep', preprocessor),
    ('clf', LogisticRegression(max_iter=1000, solver='lbfgs')),
])

In [119]:
from sklearn.base import clone

# Preview the encoded feature matrix on a throwaway copy of the preprocessor.
# Pandas output is requested on the clone only, not through the global
# set_config, so the pipeline's own preprocessor stays unfitted and keeps
# emitting arrays. Otherwise the classifier would be trained on DataFrame
# column names that a fresh session loading the saved model does not
# reproduce, which triggers feature-name warnings at predict time.
X_encoded = clone(preprocessor).set_output(transform="pandas").fit_transform(X_train)
print(X_encoded.head())

        num__age  num__amount  num__year  num__month  num__day  num__hour  \
91302  -0.327822     0.825801   0.855321   -1.248914 -0.878557  -0.506945   
381793 -0.781805    -0.198884  -1.169152    1.390406 -1.220317   0.502701   
179491 -0.327822     1.211886   0.855321   -0.984982  0.488485   0.935407   
50621   1.488110    -0.141542   0.855321   -0.984982 -0.878557   0.791171   
91440   1.336783    -0.205792   0.855321   -0.193186 -0.422876   1.223877   

        num__weekday  cat__payment_method_Bank Transfer  \
91302      -0.493297                                0.0   
381793     -0.993514                                0.0   
179491      0.006920                                1.0   
50621       1.007354                                0.0   
91440      -1.493731                                0.0   

        cat__payment_method_Cash  cat__payment_method_Credit Card  ...  \
91302                        0.0                              0.0  ...   
381793                       0.0  

In [120]:
# Fit the whole pipeline on the training split, then predict the held-out rows.
# fit() returns the fitted pipeline, so the two calls can be chained.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

In [121]:
# Peek at the raw (un-encoded) training rows; the index shows the shuffled original row labels.
X_train.head()

Unnamed: 0,age,amount,payment_method,location,year,month,day,hour,weekday,sex
91302,36,3724.31,eSewa,Butwal,2025,1,8,8,2,Male
381793,30,2264.72,Khalti,Itahari,2024,11,5,15,1,Male
179491,36,4274.26,Bank Transfer,Bharatpur,2025,2,20,18,3,Female
50621,60,2346.4,Khalti,Nepalgunj,2025,2,8,17,5,Male
91440,58,2254.88,IME Pay,Nepalgunj,2025,5,12,20,0,Female


In [122]:
# Size of the training partition (rows, feature columns).
X_train.shape

(320000, 10)

In [123]:
# Share of held-out rows whose category is predicted correctly.
# NOTE(review): in the recorded run this is 0.12, which matches the share of the
# largest class (Groceries, 9392 of 80000 test rows) — i.e. no better than
# always predicting that class.
accuracy = pipeline.score(X=X_test, y=y_test)
print(f"Model Accuracy: {accuracy:.2f}")

Model Accuracy: 0.12


In [124]:
# Persist the fitted pipeline (preprocessing + classifier) to disk.
joblib.dump(value=pipeline, filename='trained_logisticRegressionModel.pkl')
print("Model saved succesfully Manish")

Model saved succesfully Manish


In [125]:
# One hand-written transaction carrying the same columns the model was trained on.
new_data = pd.DataFrame([dict(
    payment_method='eSewa',
    location='Kathmandu',
    sex='Male',
    age=20,
    year=2025,
    month=5,
    day=24,
    hour=15,
    weekday=5,
    amount=3700,
)])

# Predict the category with the in-memory pipeline and show the raw result array.
result = pipeline.predict(new_data)
print(result)


['Groceries']


In [126]:
import joblib

# Write the pipeline again to the same path, this time with compression level 3.
# The call is the cell's last expression, so the list of written files is displayed.
joblib.dump(pipeline, filename='trained_logisticRegressionModel.pkl', compress=3)


['trained_logisticRegressionModel.pkl']

In [127]:
# Reload the persisted pipeline from disk.
# joblib.load unpickles arbitrary objects — only load files from a trusted source.
model = joblib.load(filename='trained_logisticRegressionModel.pkl')

# Single example row; the column names must match those used during training.
new_data = pd.DataFrame([dict(
    payment_method='eSewa',
    location='Janakpur',
    sex='Female',
    age=30,
    year=2025,
    month=5,
    day=24,
    hour=15,
    weekday=5,
    amount=3500,
)])

# Predict with the reloaded model and show the label for the one row.
prediction = model.predict(new_data)
print("Predicted Category:", prediction[0])

Predicted Category: Groceries


In [128]:
from sklearn.metrics import classification_report

# Per-category precision, recall and F1 for the held-out predictions.
print(classification_report(y_true=y_test, y_pred=y_pred))


                 precision    recall  f1-score   support

       Clothing       0.07      0.01      0.01      5789
         Dining       0.08      0.01      0.02      7222
      Education       0.08      0.01      0.02      5330
  Entertainment       0.09      0.01      0.02      6638
      Groceries       0.12      0.90      0.22      9392
     Healthcare       0.08      0.02      0.03      5578
Mobile Recharge       0.07      0.00      0.00      7301
         Others       0.07      0.01      0.02      5702
           Rent       0.07      0.02      0.03      7495
 Transportation       0.10      0.02      0.04      7250
         Travel       0.09      0.02      0.03      4959
      Utilities       0.11      0.03      0.04      7344

       accuracy                           0.12     80000
      macro avg       0.09      0.09      0.04     80000
   weighted avg       0.09      0.12      0.05     80000

