**Lawyer AI Assistant Project**

In [None]:
import nltk
import os
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Make sure you have NLTK data downloaded for tokenization and stopwords.
# Recent NLTK releases load the "punkt_tab" resource for sent_tokenize /
# word_tokenize, while older releases use the pickled "punkt" models, so both
# are requested to keep the notebook runnable on a fresh kernel either way.
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("stopwords")

The `load_legal_documents` function reads every `.txt` file in the specified directory and returns their contents as a list of strings, one per file.

In [None]:
# Define a function to load legal documents from a directory
def load_legal_documents(directory_path):
    """Read every ``.txt`` file located directly inside ``directory_path``.

    Args:
        directory_path: Path of the directory to scan. Sub-directories are
            not searched.

    Returns:
        list[str]: The UTF-8 decoded contents of each ``.txt`` file, ordered
        by filename. Empty if the directory holds no matching files.

    Raises:
        OSError: Propagated from ``os.listdir`` / ``open`` (for example when
            the directory does not exist).
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    documents = []
    # os.listdir returns entries in arbitrary order; sorting keeps document
    # indices reproducible across runs and platforms.
    for filename in sorted(os.listdir(directory_path)):
        file_path = os.path.join(directory_path, filename)
        # Skip non-.txt entries, and directories that merely end in ".txt"
        # (open() would fail on those).
        if not filename.endswith(".txt") or not os.path.isfile(file_path):
            continue
        with open(file_path, "r", encoding="utf-8") as file:
            documents.append(file.read())
    return documents

# Load legal documents from a directory (replace 'your_directory' with the actual path)
# NOTE: "your_directory" is a placeholder; the loader calls os.listdir on it,
# so this cell fails until it points at an existing directory.
legal_documents_directory = "your_directory"
legal_documents = load_legal_documents(legal_documents_directory)

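The `preprocess_documents` function splits each loaded document into sentences, extracts its lowercase alphanumeric non-stopword tokens, and joins both into a single processed string per document.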

In [None]:
# Preprocess and tokenize legal documents
def preprocess_documents(documents):
    """Build one processed text string per input document.

    For each document the result is the document's sentences followed by its
    lowercase, alphanumeric, non-stopword tokens, all joined by single spaces.

    Args:
        documents: Iterable of raw document strings.

    Returns:
        list[str]: One processed string per input document, in input order.
    """
    # Build the stopword set once per call rather than once per token; set
    # membership is also O(1) where list membership is linear.
    english_stopwords = set(nltk.corpus.stopwords.words("english"))

    processed_documents = []
    for document in documents:
        # Tokenize sentences
        sentences = nltk.sent_tokenize(document)

        # Tokenize words, keep purely alphanumeric tokens, and remove stopwords
        words = [
            word
            for word in nltk.word_tokenize(document.lower())
            if word.isalnum() and word not in english_stopwords
        ]

        # Combine sentences and words
        # NOTE(review): the unfiltered sentences are kept alongside the
        # filtered tokens, so stopwords still reach the vectorizer and content
        # words appear twice -- confirm this weighting is intended.
        processed_documents.append(" ".join(sentences + words))
    return processed_documents

# Preprocess the loaded corpus; the result is what the TF-IDF vectorizer is fitted on.
processed_legal_documents = preprocess_documents(legal_documents)

The remaining code fits a TF-IDF vectorizer on the processed documents and returns the document most similar to a query by cosine similarity; it is unchanged from the previous example.

In [None]:
# Create TF-IDF vectorizer (default settings)
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the documents
# The fitted tfidf_vectorizer is reused by search_legal_documents to vectorize
# queries, so this cell must be run before any search.
tfidf_matrix = tfidf_vectorizer.fit_transform(processed_legal_documents)

# Define a function to search for relevant legal documents
def search_legal_documents(query, tfidf_matrix, documents, vectorizer=None):
    """Return the document most similar to ``query`` by cosine similarity.

    Args:
        query: Free-text query string.
        tfidf_matrix: Document-term matrix to compare against; its rows are
            expected to line up with ``documents`` and to come from the same
            fitted vectorizer used for the query.
        documents: Indexable collection of documents; the best match is
            returned from here.
        vectorizer: Fitted object exposing ``transform``. Defaults to the
            module-level ``tfidf_vectorizer`` when omitted, which keeps
            existing three-argument calls working.

    Returns:
        The element of ``documents`` with the highest similarity score. On
        ties (including a query that matches nothing, where all scores are
        zero) the first such document is returned.
    """
    if vectorizer is None:
        # Fall back to the notebook-global vectorizer for backward compatibility.
        vectorizer = tfidf_vectorizer

    # Preprocess the query the same way the corpus was preprocessed
    processed_query = preprocess_documents([query])[0]

    # Transform the query using the TF-IDF vectorizer
    query_vector = vectorizer.transform([processed_query])

    # Calculate cosine similarity between the query and all documents;
    # the result has a single row because there is a single query.
    similarity_scores = cosine_similarity(query_vector, tfidf_matrix)

    # Get the index of the most similar document (arg-max over that one row)
    most_similar_index = int(np.argmax(similarity_scores[0]))

    # Return the most relevant legal document
    return documents[most_similar_index]

# Example query: look up the single best-matching document and print its full text
user_query = "I need information about intellectual property rights."
result = search_legal_documents(user_query, tfidf_matrix, legal_documents)
print(result)
