In [9]:

# Step 1: Install required libraries
# %pip install matplotlib seaborn

# Step 2: Import Required Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Step 3: Load Training Dataset from JSON
import json

with open('resources/training_data.json', 'r') as f:
    training_data = json.load(f)

# Step 4: Convert to DataFrame
df = pd.DataFrame(training_data['training_data'])

# Step 5: Split Dataset (optional for testing accuracy, skipping here)
X = df["text"]
y = df["type"]

# Step 6: Build the Model Pipeline
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=1000, ngram_range=(1,2))),
    ("clf", LogisticRegression(max_iter=1000, solver='lbfgs', multi_class='auto'))
])

# Step 7: Train the Model
pipeline.fit(X, y)

print("✅ Model training completed successfully using array-based dataset.")



✅ Model training completed successfully using array-based dataset.




In [None]:

# Step 6: Predict on Test Set using extracted dataset

# X_test → Extracted Texts from the new document
# y_test → Corresponding Labels from the new document
# test case:"id": "6c3ba11d-1d4c-4213-8560-ea1b6d4a035e",
X_test = [
    "The Governor acted within his discretionary executive authority under Article 154C of the Constitution when interdicting the Petitioner.",
    "The interdiction and initiation of a preliminary inquiry by the Governor were justified under the constitutional and statutory framework during the absence of a functioning Provincial Council.",
    "The Court does not have jurisdiction to interpret the Constitution under Article 125; such power lies solely with the Supreme Court.",
    "Challenging the charge sheet on the basis that it was issued by the Governor is erroneous, as it was issued under the authority of the Board of Management.",


    "The Southern Provincial Council was dissolved on 01.04.2019, and the Office of the Chief Minister became defunct.",
    "There was no functioning Board of Management or subject Minister to take disciplinary action regarding the SPDA.",
    "An internal audit report (R1) indicated the Petitioner had committed serious financial misconduct and violated financial regulations.",
    "The Governor, under Article 154C and 154F and Section 27A of the Provincial Councils Act, holds residual executive power to ensure uninterrupted administration when the Provincial Council ceases to function.",
    "Article 154B(11) allows the Governor to call for information relating to provincial administration, which supports supervisory authority.",
    "Section 24 of the SPDA Statute No. 01 of 1995 normally vests disciplinary power in the Board of Management, but that Board was non-existent due to the council’s dissolution.",
    "The marking scheme P10 received by the Petitioner was a brief version, whereas R4 was a detailed scheme.",
    "The Governor relied on the audit report findings to direct a preliminary inquiry and issue the letter of interdiction.",


    "Names and addresses of the Petitioner and 16 Respondents, including SPDA Board members, former and current Governors.",
    "Counsel: Sanjeewa Jayawardane PC for the Petitioner; Saman Galapaththi and Manohara Jayasinghe DSG for the Respondents.",
    "Case procedural info: Argued on 13.06.2023, submissions filed on 24.12.2023 and 26.09.2023, decided on 09.01.2024.",
    "Excerpts and reproductions of constitutional provisions (Articles 154B, 154C, 154F, and 125) and Provincial Councils Act Section 27A.",
    "Judges: M.T. Mohammed Laffar, J. and S.U.B. Karalliyadde, J.",
    "Procedural description of writs requested: Certiorari, Mandamus, Prohibition (total of 10 reliefs prayed for)."
]

y_test = [
    "Claim", "Claim", "Claim", "Claim", 
    "Premise", "Premise", "Premise", "Premise", "Premise", "Premise", "Premise", "Premise",
    "Non-Argumentative", "Non-Argumentative", "Non-Argumentative", "Non-Argumentative", "Non-Argumentative", "Non-Argumentative"
]

# Step 7: Predict
y_pred = pipeline.predict(X_test)

# Step 8: Evaluation
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

print("Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

# Step 9: Confusion Matrix
cm = confusion_matrix(y_test, y_pred, labels=pipeline.classes_)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap="Blues", xticklabels=pipeline.classes_, yticklabels=pipeline.classes_)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.tight_layout()
plt.show()

# Step 10: Optional Display Predictions
print("\n Individual Predictions:")
for sentence, pred in zip(X_test, y_pred):
    print(f"➡ \"{sentence[:80]}...\" ➝ Predicted: {pred}")


Classification Report:


ValueError: Found input variables with inconsistent numbers of samples: [23, 18]

In [None]:
#Multi-label Classification
# Step 1: Install dependencies
%pip install -q sentence-transformers scikit-learn pandas

# Step 2: Import Libraries
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sentence_transformers import SentenceTransformer

# Step 3: Sample Multi-label Data (You can replace this with your real dataset later)
data = {
    "sentence": [
        "According to Section 21, the rule applies.",
        "Because the defendant was not present at the scene.",
        "The court believes the plaintiff acted in good faith.",
        "However, the plaintiff's claim contradicts the evidence.",
        "The judge finally ordered compensation to the tenant."
    ],
    "labels": [
        ["Legal Principle"],
        ["Factual Evidence"],
        ["Factual Evidence", "Judgment"],
        ["Counter-Argument"],
        ["Judgment"]
    ]
}

df = pd.DataFrame(data)

# Step 4: Binarize the multi-labels
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['labels'])

# Step 5: Load Sentence-BERT or LegalBERT embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')  # You can replace with LegalBERT if needed
X = model.encode(df['sentence'].tolist())

# Step 6: Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 7: Train Multi-label Classifier (Logistic Regression with sigmoid)
classifier = MultiOutputClassifier(LogisticRegression())
classifier.fit(X_train, y_train)

# Step 8: Predict and Evaluate
y_pred = classifier.predict(X_test)
print("Classification Report (per label):\n")
print(classification_report(y_test, y_pred, target_names=mlb.classes_))

# Step 9: Predict new sentence
new_sentence = ["The defendant violated the law according to Section 5."]
new_embed = model.encode(new_sentence)
predicted_labels = mlb.inverse_transform(classifier.predict(new_embed))
print("\nPredicted Argument Type(s):", predicted_labels[0])
