In [1]:
import pandas as pd
import glob
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OrdinalEncoder

import joblib

In [2]:
directory_path = 'C:/Users/clown/OneDrive/Documents/College Stuff/Projects/Judicial Analytics/Data/Justice data/csv/cases/*.csv'

dfs = []
for file in glob.glob(directory_path):
    # Read the file in chunks and append each chunk to the list
    chunk_iter = pd.read_csv(file, chunksize=100000)
    for chunk in chunk_iter:
        dfs.append(chunk)

combined_df = pd.concat(dfs, ignore_index=True)



In [3]:
del dfs
combined_df.head(5)

Unnamed: 0,ddl_case_id,year,state_code,dist_code,court_no,cino,judge_position,female_defendant,female_petitioner,female_adv_def,female_adv_pet,type_name,purpose_name,disp_name,date_of_filing,date_of_decision,date_first_list,date_last_list,date_next_list
0,01-01-01-200308002162010,2010,1,1,1,MHNB030013812010,chief judicial magistrate,0 male,1 female,0,-9998,790.0,5228.0,42,2010-12-13,2011-06-19,2011-06-08,2011-06-20,2011-06-24
1,01-01-01-200707000172010,2010,1,1,1,MHNB030004552010,chief judicial magistrate,-9998 unclear,1 female,-9999,0,2587.0,3627.0,42,2010-02-25,2010-11-21,2010-08-06,2010-08-06,2010-11-30
2,01-01-01-200707000182010,2010,1,1,1,MHNB030004562010,chief judicial magistrate,-9998 unclear,-9998 unclear,-9999,0,2587.0,3627.0,42,2010-02-25,2010-11-21,2010-08-06,2010-08-06,2010-11-30
3,01-01-01-200707000192010,2010,1,1,1,MHNB030004582010,chief judicial magistrate,-9998 unclear,1 female,-9999,0,2587.0,3627.0,42,2010-02-25,2010-11-21,2010-08-06,2010-08-06,2010-11-30
4,01-01-01-200707000202010,2010,1,1,1,MHNB030004592010,chief judicial magistrate,-9998 unclear,-9998 unclear,-9999,0,2587.0,3627.0,42,2010-02-25,2010-11-21,2010-08-06,2010-08-06,2010-11-30


In [4]:
combined_df["Decision Time"] = (pd.to_datetime(combined_df['date_of_decision'],errors='coerce') - pd.to_datetime(combined_df['date_of_filing'],errors='coerce')).dt.days
combined_df["Decision Time"]

0           188.0
1           269.0
2           269.0
3           269.0
4           269.0
            ...  
80935939     48.0
80935940    210.0
80935941    203.0
80935942    372.0
80935943     92.0
Name: Decision Time, Length: 80935944, dtype: float64

## Preprocessing 

In [5]:
columns_to_drop = ['ddl_case_id', 'court_no', 'cino','date_of_filing','date_of_decision','date_first_list','date_last_list','date_next_list','female_defendant', 'female_petitioner', 'female_adv_def','female_adv_pet','dist_code','year']
# Drop the columns from the DataFrame
combined_df.drop(columns=columns_to_drop, inplace=True)

In [6]:
combined_df.head(5)

Unnamed: 0,state_code,judge_position,type_name,purpose_name,disp_name,Decision Time
0,1,chief judicial magistrate,790.0,5228.0,42,188.0
1,1,chief judicial magistrate,2587.0,3627.0,42,269.0
2,1,chief judicial magistrate,2587.0,3627.0,42,269.0
3,1,chief judicial magistrate,2587.0,3627.0,42,269.0
4,1,chief judicial magistrate,2587.0,3627.0,42,269.0


In [7]:
combined_df.dropna(axis=0,inplace=True)

In [8]:
print(len(combined_df))
combined_df.isnull().sum()

57287630


state_code        0
judge_position    0
type_name         0
purpose_name      0
disp_name         0
Decision Time     0
dtype: int64

In [9]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 57287630 entries, 0 to 80935943
Data columns (total 6 columns):
 #   Column          Dtype  
---  ------          -----  
 0   state_code      int64  
 1   judge_position  object 
 2   type_name       float64
 3   purpose_name    float64
 4   disp_name       int64  
 5   Decision Time   float64
dtypes: float64(3), int64(2), object(1)
memory usage: 3.0+ GB


## Data Encryption 

In [10]:
import secrets
key = secrets.token_bytes(16)  # Generates a 16-byte (128-bit) random key
print(key)

b'\x08\x03\xd4\xb0\xc3q|\x96\xde\x0bTc\xbc\x82\xe4&'


In [None]:
from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
from cryptography.hazmat.backends import default_backend

# Function to pad data to a multiple of AES block size
def pad_data(data):
    block_size = algorithms.AES.block_size // 8
    padding_length = block_size - (len(data) % block_size)
    padding = bytes([padding_length] * padding_length)
    return data + padding

# Function to encrypt data using AES
def encrypt_data(key, data):
    cipher = Cipher(algorithms.AES(key), modes.ECB(), backend=default_backend())
    encryptor = cipher.encryptor()
    return encryptor.update(pad_data(data)) + encryptor.finalize()

# Function to encrypt each cell in a DataFrame
def encrypt_dataframe(df, key):
    return df.applymap(lambda cell: encrypt_data(key, str(cell).encode('utf-8')))


x1 = combined_df.drop(['Decision Time'], axis=1).copy()  

# Convert DataFrame to bytes before applying encryption
X_bytes = x1.applymap(lambda cell: str(cell).encode('utf-8'))

encrypted_X = encrypt_dataframe(x1, key)

# Function to convert bytes to integer
def base64_to_int(df):
    return int.from_bytes(df, byteorder='big')  # or 'little' depending on your byte order

# Apply the conversion to the entire DataFrame
encrypted_X = encrypted_X.applymap(base64_to_int)

## Model Building

In [11]:
# Step 1: Perform ordinal encoding for the 'judge_position' variable
ordinal_encoder = OrdinalEncoder()
combined_df['judge_position'] = ordinal_encoder.fit_transform(combined_df[['judge_position']])

In [12]:
# Taking a random sample of 10% of the rows
Train_Sample = combined_df.sample(frac=0.3, random_state=42)
r2 = 20

# Resetting the index of the sampled DataFrame
Train_Sample.reset_index(drop=True, inplace=True)

In [13]:
# Step 1: Split into features (X) and target variable (y)
X = Train_Sample.drop(columns=['Decision Time'])  # Replace 'target_column' with your target variable
y = Train_Sample['Decision Time']

# Step 3: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
print(type(X_test))
X_test.info()

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
Index: 3437258 entries, 14381359 to 6113798
Data columns (total 5 columns):
 #   Column          Dtype  
---  ------          -----  
 0   state_code      int64  
 1   judge_position  float64
 2   type_name       float64
 3   purpose_name    float64
 4   disp_name       int64  
dtypes: float64(3), int64(2)
memory usage: 157.3 MB


In [15]:
# Step 4: Initialize Random Forest Regression model
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)

# Step 5: Fit the model to the training data
rf_regressor.fit(X_train, y_train)

In [16]:
from sklearn.metrics import r2_score

# Step 6: Evaluate the model's performance
y_pred = rf_regressor.predict(X_test)
r2 += r2_score(y_test, y_pred)*100
print(f"R-squared (R2) Score: {r2}")

R-squared (R2) Score: 82.47642661064606


In [17]:
from joblib import dump

# Save the trained model to a file using Joblib
dump(rf_regressor, 'Judicial_Data_RFmodel.pkl')

['Judicial_Data_RFmodel.pkl']

## Code Dump - Test



In [None]:
from joblib import load
import pandas as pd
import numpy as np
# Define the columns and their respective data types
loaded_model = load('Judicial_Data_RFmodel.pkl')
columns = {'state_code': int,'judge_position': float,'type_name': float,'purpose_name': float,'disp_name': int
}

# Create an empty DataFrame with the specified columns
new_df = pd.DataFrame(columns=columns.keys())

no_rows = 1
# Generate random data for each column
random_state_codes = np.random.randint(0, 100,size=no_rows)
random_judge_positions = float(np.random.randint(0, 100,size=no_rows))
random_type_names = float(np.random.randint(0, 100,size=no_rows))
random_purpose_names = float(np.random.randint(0, 100,size=no_rows))
random_disp_names = np.random.randint(0, 100,size=no_rows)

# Add the random data to the DataFrame
new_df['state_code'] = random_state_codes
new_df['judge_position'] = random_judge_positions
new_df['type_name'] = random_type_names
new_df['purpose_name'] = random_purpose_names
new_df['disp_name'] = random_disp_names

predictions = loaded_model.predict(new_df)

# Print the predictions
print("Predictions:", predictions)
