In [25]:
import json
import os
import numpy as np
import pandas as pd
from PIL import Image
from math import atan2, degrees
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier 
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report,accuracy_score

In [26]:
json_path = 'filtered-data.json'
image_folder = 'Images'

In [27]:
with open('filtered-data.json', 'r') as file:
    json_data = json.load(file)

In [28]:
def get_image_size(image_path):
    with Image.open(image_path) as img:
        return img.size

In [29]:
def calculate_slope(x_coords, y_coords):
    if len(x_coords) >= 2 and len(y_coords) >= 2:
        return degrees(atan2(y_coords[1] - y_coords[0], x_coords[1] - x_coords[0]))
    return 0

In [30]:
# Extract numeric features including X_Y coordinates
def extract_features_with_coordinates(json_data, image_folder):
    feature_list = []

    for doc_id, doc_info in json_data.items():
        image_path = os.path.join(image_folder, doc_info['path']) + '.jpeg'
        if not os.path.exists(image_path):
            continue
        
        # Get image size
        image_width, image_height = get_image_size(image_path)
        
        label_dict = doc_info['llm']['response']
        category_mapping = {v: k for k, v in label_dict.items()}

        
        for text, coords in doc_info['ocr'].items():
            x_coords = np.array([coord['x']  for coord in coords])
            y_coords = np.array([coord['y']  for coord in coords])
            
            # Calculate width, height, and slope
            width = np.ptp(x_coords)  # np.ptp gives the range (max - min)
            height = np.ptp(y_coords)
            slope = calculate_slope(x_coords, y_coords)
            
            # Append features, including X_Y coordinates
            feature_list.append({
                'document_type': doc_info['type'],
                'min_x': np.min(x_coords),
                'max_x': np.max(x_coords),
                'min_y': np.min(y_coords),
                'max_y': np.max(y_coords),
                'width': width,
                'height': height,
                'slope': slope,
                'image_width': image_width,
                'image_height': image_height,
                'text': text,
                'category' : category_mapping.get(text,'unknown')
            })
    
    return pd.DataFrame(feature_list)  # Return a Pandas DataFrame for easier manipulation



In [None]:
features_df = extract_features_with_coordinates(json_data, image_folder)
features_df.head()

In [10]:
features_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 384 entries, 0 to 383
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   document_type  384 non-null    object 
 1   min_x          384 non-null    float64
 2   max_x          384 non-null    float64
 3   min_y          384 non-null    float64
 4   max_y          384 non-null    float64
 5   width          384 non-null    float64
 6   height         384 non-null    float64
 7   slope          384 non-null    float64
 8   image_width    384 non-null    int64  
 9   image_height   384 non-null    int64  
 10  text           384 non-null    object 
 11  category       384 non-null    object 
dtypes: float64(7), int64(2), object(3)
memory usage: 36.1+ KB


In [32]:
features_df.to_csv('extracted_features_1.csv', index=False)

### Making model for category classificaion

In [33]:
data = pd.read_csv('extracted_features_1.csv')

In [34]:
data['category'].value_counts()

unknown                    344
english_surname              5
english_document_number      5
english_gender               4
english_first_name           3
nationality                  3
country                      3
english_dob                  3
native_first_name            2
native_surname               2
place_of_birth               2
middle_name                  2
english_expiry_date          1
donor                        1
country_code                 1
english_issue_date           1
country_of_stay              1
version                      1
Name: category, dtype: int64

In [35]:
# Encode categorical variables
label_encoder_doc_type = LabelEncoder()
label_encoder_category = LabelEncoder()
data['document_type'] = label_encoder_doc_type.fit_transform(data['document_type'])
data['category'] = label_encoder_category.fit_transform(data['category'])

In [36]:
label_encoder_doc_type.classes_

array(['botswana_none_idcard_design1', 'chile_none_idcard_design1', 'estonia_none_idcard_design2', 'japan_none_drivinglicense_design1', 'newzealand_none_drivinglicense_design2', 'nigeria_none_passport_design3', 'pakistan_none_idcard_design2'], dtype=object)

In [37]:
# Select features
X = data.drop(['text','category'],axis=1)
y = data['category']

In [38]:
# Normalize numerical features
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

In [40]:
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
#print(classification_report(y_test, y_pred, target_names=label_encoder_category.classes_))

Accuracy: 0.8961038961038961


In [41]:
# saving model for later use
import joblib
joblib.dump(label_encoder_doc_type, 'label_encoder_doc_type.pkl')
joblib.dump(label_encoder_category, 'label_encoder_category.pkl')
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(model, 'classification_model.pkl')

['classification_model.pkl']

In [42]:
def predict_single(features_list):
    # Extract the document_type from the list and encode it
    document_type = features_list[0]
    document_type_encoded = label_encoder_doc_type.transform([document_type])[0]
    
    # Replace the document_type in the list with its encoded value
    features_list[0] = document_type_encoded
    
    # Convert the list to a numpy array and reshape it for the scaler
    features_array = np.array(features_list).reshape(1, -1)
    
    # Scale the features
    features_scaled = scaler.transform(features_array)
    
    # Make a prediction
    prediction = model.predict(features_scaled)
    
    # Decode the predicted label
    predicted_category = label_encoder_category.inverse_transform(prediction)[0]
    
    # Print the predicted category name
    print(f"Predicted Category: {predicted_category}")


In [43]:
# Example usage
example_features = list(features_df.iloc[4])[:-2]
predict_single(example_features)

Predicted Category: country


### predicting doctype from the data

In [None]:
X = features_df[['width', 'height', 'slope', 'relative_width', 'relative_height']]
y = features_df['document_type']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = SVC(kernel='linear')
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)
print(classification_report(y_test, y_pred))