In [108]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer


In [109]:
# Location of the 2022 federal cases CSV (absolute, machine-specific path).
cases_csv_path = 'C:\\Users\\mjjyo\\OneDrive\\Desktop\\precedent\\2022_federal_cases.csv'
file = pd.read_csv(cases_csv_path)

In [111]:
# Display the loaded DataFrame to inspect its columns and rows
file

Unnamed: 0.1,Unnamed: 0,title,text
0,0,"Pharmaceutical Research v. Stuart Williams, et al",United States Court of Appeals \nFor The Eigh...
1,1,"Mirabella v. Town of Lexington, MA, et al",United States Court of Appeals \nFor the Fir...
2,2,"Andrade-Prado, Jr. v. Garland",United States Court of Appeals \nFor the Fir...
3,3,USA v. Jose Hodge,NOT PRECEDENTIAL \n \nUNITED STATES COURT OF A...
4,4,W. R. R. v. Attorney General United States,\n NOT PRECEDENTIAL \n \nUNITED STATES COURT...
...,...,...,...
16593,16593,"Nicholas Stover v. Amazon.com, LLC, et al",\n \nNOT RECOMMENDED FOR PUBLICATION \nFile ...
16594,16594,"Randy Coley v. DIRECTV, Inc.",PUBLISHED \n \nUNITED STATES COURT OF APPEALS...
16595,16595,US v. Wincy Joseph,UNPUBLISHED \n \nUNITED STATES COURT OF APPEA...
16596,16596,US v. Antonio Davis,UNPUBLISHED \n \nUNITED STATES COURT OF APPEA...


In [115]:
# Remove the 'Unnamed: 0' column; drop() returns a new frame, so `file` is left untouched
complete_file = file.drop(columns=['Unnamed: 0'])

In [116]:
# Display the frame after dropping the 'Unnamed: 0' column
complete_file

Unnamed: 0,title,text
0,"Pharmaceutical Research v. Stuart Williams, et al",United States Court of Appeals \nFor The Eigh...
1,"Mirabella v. Town of Lexington, MA, et al",United States Court of Appeals \nFor the Fir...
2,"Andrade-Prado, Jr. v. Garland",United States Court of Appeals \nFor the Fir...
3,USA v. Jose Hodge,NOT PRECEDENTIAL \n \nUNITED STATES COURT OF A...
4,W. R. R. v. Attorney General United States,\n NOT PRECEDENTIAL \n \nUNITED STATES COURT...
...,...,...
16593,"Nicholas Stover v. Amazon.com, LLC, et al",\n \nNOT RECOMMENDED FOR PUBLICATION \nFile ...
16594,"Randy Coley v. DIRECTV, Inc.",PUBLISHED \n \nUNITED STATES COURT OF APPEALS...
16595,US v. Wincy Joseph,UNPUBLISHED \n \nUNITED STATES COURT OF APPEA...
16596,US v. Antonio Davis,UNPUBLISHED \n \nUNITED STATES COURT OF APPEA...


In [95]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import string

# Fetch the NLTK resources used below: tokenizer models and stop-word lists
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

# Shared helpers consumed by preprocess_text
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalize documents for TF-IDF: lowercase, tokenize, filter, and stem.

    Parameters
    ----------
    text : str or iterable of str
        Documents to clean. A single string is treated as one document
        (instead of being iterated character by character). Missing values
        (``None`` / ``NaN``) are treated as empty documents; other non-string
        values are converted with ``str()``.

    Returns
    -------
    list of str
        One space-joined string of stemmed tokens per input document, in the
        same order as the input.
    """
    # A bare string is itself iterable, so without this guard every character
    # would be processed as its own "document".
    if isinstance(text, str):
        text = [text]

    punctuation_chars = set(string.punctuation)
    final_text = []

    for item in text:
        # Empty CSV cells arrive as float NaN, which has no .lower()
        if not isinstance(item, str):
            item = "" if pd.isna(item) else str(item)

        # Tokenize the text
        tokens = word_tokenize(item.lower())

        # Remove stop words and punctuation. A token is punctuation when every
        # character is a punctuation mark; this also catches multi-character
        # tokens such as "``", "''" and "..." that a substring test against
        # string.punctuation would let through.
        tokens = [
            token for token in tokens
            if token not in stop_words
            and not all(char in punctuation_chars for char in token)
        ]

        # Stem the tokens
        tokens = [stemmer.stem(token) for token in tokens]

        # Join the tokens back to form the text
        final_text.append(" ".join(tokens))
    return final_text

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mjjyo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mjjyo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [96]:
# Free-text search query that is matched against the court case texts below
user_input = "music copyright"

In [97]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [99]:
# Clean every court case text (lowercasing, stop-word/punctuation removal, stemming)
corpus = preprocess_text(complete_file['text'])

# TF-IDF vectorizer; additionally filters scikit-learn's built-in English stop words
vectorizer = TfidfVectorizer(stop_words='english')

In [100]:
# Fit the TF-IDF vocabulary and weights on the corpus and transform it into a
# document-term matrix (one row per court case)
vectorized_corpus = vectorizer.fit_transform(corpus)

In [101]:
# Vectorize the user input.
# The query must go through the same preprocessing (lowercasing, stop-word
# removal, stemming) as the corpus; otherwise unstemmed query words such as
# "infringement" never match the stemmed vocabulary the vectorizer was fitted on.
user_input_vectorized = vectorizer.transform(preprocess_text([user_input]))

In [102]:
# Calculate the cosine similarity between the user input and the court case texts.
# The query is a single row, so the result has one row with one score per case.
similarity = cosine_similarity(user_input_vectorized, vectorized_corpus)

In [103]:
# Sort case positions by ascending similarity to the (single) query row ...
ascending_order = similarity.argsort(axis=1)[0]
# ... then flip to descending order and keep the 10 best-scoring positions
top_matches = ascending_order[::-1][:10]

In [104]:
# Look up the titles of the best-matching cases by their row positions
title_column = complete_file['title']
titles = title_column.iloc[top_matches]

print(titles)

2645             Willia Dean Parker, et al v. Sarah Hinton
14229         Marcus Gray, et al v. Katheryn Hudson, et al
86        SAS Institute, Inc. v. World Programming Limited
7015     Joe Hand Promotions, Inc. v. James Griffith, J...
6365                            ABKCO Music, Inc. v. Sagan
6366                            ABKCO Music, Inc. v. Sagan
6364                            ABKCO Music, Inc. v. Sagan
10043                                         US v. Gordon
6326                      Melendez v. Sirius XM Radio Inc.
5355           Unicolors, Inc. v. H&M Hennes & Mauritz, LP
Name: title, dtype: object


In [117]:
# complete_file.iloc[14229]['text']

'FOR PUBLICATION \n \nUNITED STATES COURT OF APPEALS  \nFOR THE NINTH CIRCUIT  \n \n \nMarcus Gray , PKA Flame; \nEMANUEL LAMBERT ; CHIKE \nOJUKWU , \nPlaintiffs -Appellants , \n \nv. \n K\nATHERYN ELIZABETH HUDSON , \nPKA Katy Perry; J ORDAN HOUSTON , \nPKA Juicy J; L UKASZ GOTTWALD , \nPKA Dr. Luke; S ARAH THERESA \nHUDSON ; KARL MARTIN SANDBERG , \nPKA Max Martin; H ENRY RUSSELL \nWALTER , PKA Cirkut; K ASZ \nMONEY , INC.; CAPITOL RECORDS , \nLLC;  WB  MUSIC CORP.; KOBALT \nMUSIC PUBLISHING AMERICA , INC., \nDefendants -Appellees.   No. 20-55401  \n \nD.C. No. \n2:15- cv-05642-\nCAS -JC \n  \nOPINION  \n \nAppeal from the United States District Court \nfor the Central District of California  \nChristina A. Snyder, District Judge, Presiding \n \nArgued and Submitted January 11, 2022  \nPasadena, California  \n \nFiled March 10, 2022  \n Case: 20-55401, 03/10/2022, ID: 12391170, DktEntry: 95-1, Page 1 of 26 2 GRAY V . HUDSON  \n \nBefore:  RICHARD R. CLIFTON, MILAN D. SMITH, \nJR., an