In [2]:
import os.path
import sqlite3
import cohere
import numpy as np
import os
from dotenv import load_dotenv
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from scipy import sparse

[nltk_data] Downloading package stopwords to /home/kojo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [50]:
version = '_old'

load_dotenv()

cohere_key = os.getenv("COHERE_API_KEY")
co = cohere.Client(cohere_key)
stop_words = list(stopwords.words('english'))

In [51]:
document_matrix = sparse.load_npz(f'./data/final_embeddings{version}.npz')
with open(f'vectorizer{version}.pk', 'rb') as f:
    vectorizer = pickle.load(f)

In [52]:
query = "Is there any precedent for calling a second-hand witness to the stand?"
query_emb = co.embed([query], input_type="search_query", model="embed-english-v3.0").embeddings
query_emb = np.asarray(query_emb)
query_emb.shape

(1, 1024)

In [53]:
query_tfidf = vectorizer.transform([query])
query_tfidf.shape

(1, 323009)

In [54]:
query_emb[0].shape

(1024,)

In [55]:
query_tfidf[0].toarray()[0].shape

(323009,)

In [56]:
query_matrix = np.concatenate((query_emb[0], query_tfidf[0].toarray()[0]))
query_matrix.shape

(324033,)

In [57]:
document_matrix.T.shape

(324033, 200)

In [58]:
def cosine_sim(vector, matrix):
    scores = []
    for mat in matrix:
        cos_sim = np.dot(vector, mat.toarray()[0])/(np.linalg.norm(vector)*np.linalg.norm(mat.toarray()[0]))
        scores.append(cos_sim)
    return np.asarray(scores)

scores = cosine_sim(query_matrix, document_matrix)
scores.shape

(200,)

In [59]:
max_idx = np.argsort(-scores)

max_idx[:5]

array([148, 165, 137, 169, 174])

In [60]:
def give_list(db, name=''):
	if len(name) != 0:
		res_table = db.execute(f'SELECT * FROM {name}')
		rows = [row[0] for row in res_table]
		return rows
	else:
		rows = [row[0] for row in db]
		return rows

con = sqlite3.connect(f'./data/courtCases{version}.db')
cur = con.cursor()

In [87]:
tuple(ind + 1 for ind in max_idx[:5])

(149, 166, 138, 170, 175)

In [90]:
def give_list(db, name, columns, rows):
    command = f"""
    SELECT {', '.join(columns)} FROM (
        SELECT ROW_NUMBER() OVER (ORDER BY decision_date) AS RowNum, *
        FROM {name} CA
    ) C
    WHERE C.RowNum IN {tuple(ind + 1 for ind in rows)}
    """
    res_table = db.execute(command)
    rows = [' '.join(row) for row in res_table]
    return rows

In [94]:
docs = give_list(cur, 'Cases', ['name', 'content'], max_idx[:20])

In [95]:
results = co.rerank(query=query, documents=docs, top_n=3, model="rerank-english-v2.0")
results

[RerankResult<document['text']: MALLETT v. STATE OF NORTH CAROLINA [181 U.S. 589, 590]   In September, 1898, John P. Mallett and Charles B. Mehegan were indicted and tried in the criminal court of the county of Edgecombe, North Carolina, for conspiracy to defraud. They were convicted and sentenced to two years' imprisonment in the common jail. They appealed to the superior court. The record was certified up by the clerk of the criminal court on April 1, 1899. The superior court reversed the verdict and judgment, and granted a new trial. From this judgment of the superior court the state appealed, on July 7, 1899, to the supreme court, which reversed the judgment of the superior court, and remanded the cause to the criminal court, with directions that the sentence imposed by that court should be carried into execution.

At the time of the commission of the offense, and at the time of the trial in the criminal court of Edgecombe county, the state of North Carolina was not entitled to app

In [98]:
for res in results:
    print(res.index)
    break;

2


In [None]:
[max_idx[res.index] for res in results]

In [None]:
print(f"Query: {query}")
for res in results:
  print(res.document['text'])
  print("--------")

## For Fixing date format issues

In [48]:
def give_list(db, name=''):
	if len(name) != 0:
		res_table = db.execute(f'SELECT * FROM {name}')
		rows = [row[0] for row in res_table]
		return rows
	else:
		rows = [row[0] for row in db]
		return rows

def get_table(db, name):
    table_list = []
    res_table = db.execute(f'SELECT * FROM {name}')
    rows = [[item[0] for item in res_table.description][:-1]] + [row for row in res_table]
    lens = []
    for col in zip(*rows):
        lens.append(max([len(str(v)) for v in col]))
    format = "  ".join(["{:." + str(l) + "}" for l in lens])
    for row in rows:
        row = [str(x) for x in row]
        table_list.append(format.format(*row))
    return table_list[1:]

con = sqlite3.connect(f'courtCases{version}.db')
cur = con.cursor()

In [44]:
command = """
SELECT decision_date
FROM Cases C
"""
res_table = cur.execute(command)
docs = give_list(res_table)
docs

[' January 05, 1903',
 ' January 12, 1970',
 ' April 25, 1977',
 ' October 28, 1957',
 ' January 06, 1964',
 ' January 08, 1962',
 ' December 16, 1974',
 ' November 14, 1892',
 ' April 19, 1897',
 ' February 03, 1947',
 ' October 20, 1980',
 ' June 02, 1975',
 ' October 13, 1924',
 ' February 01, 1886',
 ' January 06, 1902',
 ' June 06, 1966',
 ' April 17, 1893',
 ' January 03, 1938',
 ' October 29, 1894',
 ' May 14, 1900',
 ' February 24, 1931',
 ' April 29, 1986',
 ' June 19, 2008',
 ' April 18, 1887',
 ' May 13, 1991',
 ' June 09, 1904',
 ' January 04, 1943',
 ' April 11, 1955',
 ' March 02, 1964',
 ' June 07, 1912',
 ' December 02, 1907',
 ' January 10, 1955',
 ' May 02, 1988',
 ' February 19, 1923',
 ' April 28, 1987',
 ' May 26, 1885',
 ' April 30, 1962',
 ' April 24, 1990',
 ' January 25, 1999',
 ' May 13, 1907',
 ' November 22, 1943',
 ' December 05, 1977',
 ' March 09, 1987',
 ' May 20, 1991',
 ' December 01, 1865',
 ' May 02, 1938',
 ' June 10, 1963',
 ' February 15, 1932',
 

In [None]:
command = """
SELECT ROW_NUMBER(), content
FROM Cases C
"""
res_table = cur.execute(command)
docs = give_list(res_table)
tableList = get_table(cur, 'Cases')
docs_clean = [doc.replace("\n"," ") for doc in docs]
full_docs = [n + " " + m for n, m in zip(tableList, docs_clean)]

In [34]:
from datetime import datetime

datetime.strptime(' January 05, 1903', ' %B %d, %Y').strftime("%Y-%m-%d")

'1903-01-05'

In [45]:
command = "SELECT decision_date FROM Cases C"
name_list = give_list(cur.execute("SELECT name FROM Cases C"))
citation_list = give_list(cur.execute("SELECT citation FROM Cases C"))
docket_no_list = give_list(cur.execute("SELECT docket_no FROM Cases C"))
decision_date_list = give_list(cur.execute("SELECT decision_date FROM Cases C"))
court_list = give_list(cur.execute("SELECT court FROM Cases C"))
content_list = give_list(cur.execute("SELECT content FROM Cases C"))
docket_no_list

[' No. 273',
 ' No. 722',
 ' No. 76-1036',
 ' No. 399',
 ' No. 717',
 ' No. 482',
 ' No. 74-225',
 None,
 ' No. 214',
 ' No. 208',
 ' No. 79-1964',
 ' No. 73-1256',
 ' No. 115',
 ' No. 47',
 ' No. 251',
 ' No. 439',
 ' No. 200',
 ' No. 48',
 ' No. 641',
 ' No. 218',
 None,
 ' No. 85-5189',
 ' No. 061708',
 ' No. 15',
 ' No. 89-1817',
 ' No. 295',
 ' No. 172',
 ' No. 337',
 ' No. 267',
 ' No. 68',
 ' No. 284',
 ' No. 408',
 ' No. 87-6138',
 ' No. 25',
 ' No. 85-1259',
 ' No. 296',
 ' No. 384',
 ' No. 88-7146',
 ' No. 97-826',
 ' No. 231',
 ' No. 23',
 ' No. 76-1200',
 ' No. 86-5953',
 ' No. 90-6282',
 None,
 ' No. 72',
 ' No. 79',
 ' No. 349',
 None,
 ' No. 701',
 ' No. 281',
 ' No. 85-295',
 ' No. 89',
 ' No. 206',
 ' No. 33',
 ' No. 518',
 ' No. 13',
 ' No. 105',
 ' No. 101210',
 ' No. 112108',
 ' No. 93-1677',
 ' No. 22',
 ' No. 12',
 ' No. 402',
 ' No. 092309',
 ' No. 42',
 ' No. 934',
 ' No. 16',
 ' No. 241',
 ' No. 989',
 ' No. 011314zor',
 None,
 ' No. 339',
 None,
 ' No. 501',
 

In [46]:
con2 = sqlite3.connect('courtCases_new.db')
cur2 = con2.cursor()

### Musicians table

command = """
CREATE TABLE Cases (
    name text,
    citation text,
    docket_no text,
    decision_date date,
    court text,
    content text,
    PRIMARY KEY (citation)
);
"""
cur2.execute(command)
#"""
for name, citation, docket, date, court, content in zip(name_list, citation_list, docket_no_list, decision_date_list, court_list, content_list):
    cur2.execute("INSERT INTO Cases VALUES (?, ?, ?, ?, ?, ?)", 
                (name, citation, docket, datetime.strptime(date, ' %B %d, %Y').strftime("%Y-%m-%d"), court, content))
#"""
con2.commit()


## Large Db Test

In [4]:
version = ''

In [5]:
document_matrix = sparse.load_npz(f'./data/final_embeddings{version}.npz')
with open(f'./data/vectorizer{version}.pk', 'rb') as f:
    vectorizer = pickle.load(f)

In [16]:
document_matrix[8].mean()

0.0

In [None]:
query = "What are the possible charges of stealing food from a grocery store?"
print("Query:", query)
query_emb = co.embed([query], input_type="search_query", model="embed-english-v3.0").embeddings
query_emb = np.asarray(query_emb)
query_tfidf = vectorizer.transform([query])
query_matrix = np.concatenate((query_emb[0], query_tfidf[0].toarray()[0]))

scores = cosine_sim(query_matrix, document_matrix)
max_idx = np.argsort(-scores)
print("Semantic/Term Search top three document indices:", max_idx[:5].tolist())