In [1]:
import pandas as pd
import numpy as np

In [4]:
path="C:/Users/manas/my-project/cleaned_dataset.csv"
df=pd.read_csv(path)
print(df.head())

         id                                               name  \
0  12532561  Jessica OLSEN, on behalf of herself and all ot...   
1  12532560  AMERICAN CENTER FOR LAW AND JUSTICE, Plaintiff...   
2  12532562  UNITED STATES of America, Plaintiff, v. Steven...   
3  12532563  Kaori STEARNEY, et al., Plaintiffs, v. UNITED ...   
4  12532564  IN RE: GERMAN AUTOMOTIVE MANUFACTURERS ANTITRU...   

                                   name_abbreviation decision_date  \
0                              Olsen v. Nelnet, Inc.    2019-05-21   
1  Am. Ctr. for Law & Justice v. U.S. Dep't of Ju...    2019-06-30   
2                             United States v. Emery    2019-06-21   
3                          Stearney v. United States    2019-05-16   
4          In re German Auto. Mfrs. Antitrust Litig.    2019-06-17   

                    docket_number  first_page  last_page citations/0/type  \
0                    4:18-CV-3081        1006       1023         official   
1  Civil Action No. 16-2188 

In [5]:
print(df.tail())

          id                                               name  \
84  12532643  WORLD WATER WORKS HOLDINGS, INC., Plaintiff, v...   
85  12532644  Caitlin BERNARD M.D., Plaintiff, v. INDIVIDUAL...   
86  12532645  REXING QUALITY EGGS, Plaintiff, v. REMBRANDT E...   
87  12532646  Osha JOSEPH, Plaintiff, v. Sgt. Bobby DONAHUE,...   
88  12532647  ZURICH AMERICAN INSURANCE COMPANY, Plaintiff, ...   

                                    name_abbreviation decision_date  \
84  World Water Works Holdings, Inc. v. Cont'l Cas...    2019-06-24   
85  Bernard v. Individual Members of the Ind. Med....    2019-06-28   
86                    Eggs v. Rembrandt Enters., Inc.    2019-05-29   
87                                  Joseph v. Donahue    2019-05-28   
88          Zurich Am. Ins. Co. v. Ins. Co. of N. Am.    2019-05-21   

                docket_number  first_page  last_page citations/0/type  \
84             No. 17 CV 5237         923        935         official   
85  No. 1:19-cv-01660-SE

In [7]:
print(df.columns)

Index(['id', 'name', 'name_abbreviation', 'decision_date', 'docket_number',
       'first_page', 'last_page', 'citations/0/type', 'citations/0/cite',
       'court/name_abbreviation',
       ...
       'cites_to/41/pin_cites/10/page', 'cites_to/41/pin_cites/11/page',
       'cites_to/41/pin_cites/12/page', 'cites_to/41/pin_cites/13/page',
       'cites_to/41/pin_cites/14/page',
       'cites_to/13/pin_cites/5/parenthetical', 'cites_to/13/pin_cites/8/page',
       'cites_to/13/pin_cites/9/page', 'cites_to/13/pin_cites/10/page',
       'cites_to/13/pin_cites/11/page'],
      dtype='object', length=2685)


In [8]:
# Step 1: Preprocessing
# We'll keep a subset of important columns for generating the legal draft
selected_columns = [
    "name",  # Use as proxy for target output (legal draft)
    "name_abbreviation",  # Short form of case name
    "decision_date",
    "docket_number"
]

# Include court name if present
court_columns = [col for col in df.columns if "court/name_abbreviation" in col]
selected_columns.extend(court_columns)

# Subset the DataFrame
df_subset = df[selected_columns].dropna()

# Rename columns for easier handling
df_subset = df_subset.rename(columns={
    "name": "target_text",
    "name_abbreviation": "case_title",
    "decision_date": "date",
    "docket_number": "docket",
    court_columns[0]: "court"
})

# Preview the cleaned data
df_subset.head()


Unnamed: 0,target_text,case_title,date,docket,court
0,"Jessica OLSEN, on behalf of herself and all ot...","Olsen v. Nelnet, Inc.",2019-05-21,4:18-CV-3081,D. Neb.
1,"AMERICAN CENTER FOR LAW AND JUSTICE, Plaintiff...",Am. Ctr. for Law & Justice v. U.S. Dep't of Ju...,2019-06-30,Civil Action No. 16-2188 (TJK),D.D.C.
2,"UNITED STATES of America, Plaintiff, v. Steven...",United States v. Emery,2019-06-21,3:18-CR-30122-RAL,D. S.D.
3,"Kaori STEARNEY, et al., Plaintiffs, v. UNITED ...",Stearney v. United States,2019-05-16,No. CV16-8060-PCT-DGC,D. Ariz.
4,IN RE: GERMAN AUTOMOTIVE MANUFACTURERS ANTITRU...,In re German Auto. Mfrs. Antitrust Litig.,2019-06-17,MDL No. 2796 CRB (JSC),N.D. Cal.


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

# Step 1: Combine metadata fields into one input string
df_subset['input_text'] = df_subset.apply(
    lambda row: f"{row['case_title']} | {row['date']} | {row['docket']} | {row['court']}", axis=1
)

# Step 2: Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    df_subset['input_text'], df_subset['target_text'], test_size=0.2, random_state=42
)

# Step 3: Vectorize inputs using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Step 4: Encode targets with TF-IDF as well (since target is text, this is a rough proxy)
target_vectorizer = TfidfVectorizer(max_features=5000)
y_train_tfidf = target_vectorizer.fit_transform(y_train)
y_test_tfidf = target_vectorizer.transform(y_test)

# Step 5: Train a baseline linear regression model to predict target vectors
model = LinearRegression()
model.fit(X_train_tfidf, y_train_tfidf.toarray())

# Step 6: Predict and evaluate
y_pred = model.predict(X_test_tfidf)
mse = mean_squared_error(y_test_tfidf.toarray(), y_pred)
print(f"Baseline Model Mean Squared Error: {mse:.4f}")


Baseline Model Mean Squared Error: 0.0018


In [14]:
# Re-import necessary packages after execution state reset
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences



# Select and rename relevant columns
court_columns = [col for col in df.columns if "court/name_abbreviation" in col]
df_subset = df[[
    "name", "name_abbreviation", "decision_date", "docket_number"
] + court_columns].dropna()

df_subset = df_subset.rename(columns={
    "name": "target_text",
    "name_abbreviation": "case_title",
    "decision_date": "date",
    "docket_number": "docket",
    court_columns[0]: "court"
})

# Combine input features
df_subset['input_text'] = df_subset.apply(
    lambda row: f"{row['case_title']} | {row['date']} | {row['docket']} | {row['court']}", axis=1
)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    df_subset['input_text'], df_subset['target_text'], test_size=0.2, random_state=42
)

# Tokenization and padding
MAX_VOCAB = 5000
MAX_LEN = 30

input_tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<OOV>")
target_tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<OOV>")

input_tokenizer.fit_on_texts(X_train)
target_tokenizer.fit_on_texts(y_train)

X_train_seq = input_tokenizer.texts_to_sequences(X_train)
X_test_seq = input_tokenizer.texts_to_sequences(X_test)
y_train_seq = target_tokenizer.texts_to_sequences(y_train)
y_test_seq = target_tokenizer.texts_to_sequences(y_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post')
y_train_pad = pad_sequences(y_train_seq, maxlen=MAX_LEN, padding='post')
y_test_pad = pad_sequences(y_test_seq, maxlen=MAX_LEN, padding='post')

X_train_pad.shape, y_train_pad.shape


((71, 30), (71, 30))

In [15]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from tensorflow.keras.optimizers import Adam

# Define model parameters
VOCAB_SIZE = 5000
EMBEDDING_DIM = 64
LSTM_UNITS = 128
MAX_LEN = 30

# Encoder
encoder_inputs = Input(shape=(MAX_LEN,))
encoder_embedding = Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM, mask_zero=True)(encoder_inputs)
encoder_outputs, state_h, state_c = LSTM(LSTM_UNITS, return_state=True)(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(MAX_LEN,))
decoder_embedding = Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM, mask_zero=True)(decoder_inputs)
decoder_lstm = LSTM(LSTM_UNITS, return_sequences=True, return_state=False)
decoder_outputs = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(VOCAB_SIZE, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define and compile model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer=Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Prepare decoder target data (shifted output)
import numpy as np
y_train_target = np.expand_dims(y_train_pad, -1)

# Train the model
model.fit(
    [X_train_pad, y_train_pad],
    y_train_target,
    batch_size=16,
    epochs=10,
    validation_split=0.2
)


Epoch 1/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 184ms/step - accuracy: 0.0310 - loss: 8.5157 - val_accuracy: 0.1022 - val_loss: 8.5098
Epoch 2/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step - accuracy: 0.1054 - loss: 8.5040 - val_accuracy: 0.0511 - val_loss: 8.4962
Epoch 3/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step - accuracy: 0.0429 - loss: 8.4776 - val_accuracy: 0.0289 - val_loss: 8.4338
Epoch 4/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step - accuracy: 0.0353 - loss: 8.3244 - val_accuracy: 0.0289 - val_loss: 8.0449
Epoch 5/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - accuracy: 0.0341 - loss: 7.6683 - val_accuracy: 0.0289 - val_loss: 7.5726
Epoch 6/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step - accuracy: 0.0348 - loss: 6.9189 - val_accuracy: 0.0289 - val_loss: 7.1808
Epoch 7/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x15b18da3890>