In [1]:
import pandas as pd
import numpy as np
import re
import string

# Load the two Kaggle CSVs, tag each row with its class, and merge them into
# one shuffled corpus named `data`.
print("Loading Text Data...")
try:
    df_fake = pd.read_csv('/kaggle/input/fake-and-real-news-dataset/Fake.csv')
    df_true = pd.read_csv('/kaggle/input/fake-and-real-news-dataset/True.csv')
except FileNotFoundError as exc:
    # Stop with an actionable message: carrying on would only fail a few lines
    # later with a NameError on the frames that were never created.
    raise FileNotFoundError(
        "Dataset not found. Please add the 'Fake and Real News Dataset' to your Kaggle notebook."
    ) from exc

# Label convention used by the rest of the notebook: 1 = Fake.csv, 0 = True.csv
df_fake['label'] = 1
df_true['label'] = 0

data = pd.concat([df_fake, df_true], axis=0)

# Seeded shuffle so the row order (and everything derived from it, such as the
# train/test partition) is the same on every run.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

print(" Data Loaded.")
print(f"Total Documents: {len(data)}")
print(f"Suspicious Samples: {len(df_fake)}")
print(f"Safe Samples: {len(df_true)}")
print(data[['text', 'label']].head())

Loading Text Data...
 Data Loaded.
Total Documents: 44898
Suspicious Samples: 23481
Safe Samples: 21417
                                                text  label
0  NAIROBI (Reuters) - Unknown attackers torched ...      0
1  WASHINGTON (Reuters) - President Donald Trump ...      0
2  BAGHDAD (Reuters) - The Iraqi government has a...      0
3  Will this FINALLY be the straw that breaks the...      1
4  As Politico reports an almost neck-and-neck ra...      1


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

def clean_text(text):
    """Normalize a document before TF-IDF vectorization.

    The input is converted with ``str()`` and lowercased, then three things
    are removed in this order: ``[...]`` spans that start and end on the same
    line, every character in ``string.punctuation``, and every word that
    contains a digit. Whitespace is not collapsed, so removed tokens can leave
    double or trailing spaces behind.

    Args:
        text: Raw document. Any object is accepted and converted with ``str()``.

    Returns:
        The cleaned, lowercased string.
    """
    text = str(text).lower()
    # Raw string literals keep the regex escapes (\[, \w, \d) from being read
    # as invalid Python string escapes, which newer interpreters warn about.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

# Step 1: store a normalized copy of every article body next to the raw text.
print("Cleaning text (this may take 20 seconds)...")
data['cleaned_text'] = data['text'].map(clean_text)

# Step 2: learn a TF-IDF vocabulary (at most 5000 terms) from the cleaned
# corpus and encode every document with it.
print("Vectorizing text...")
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(data['cleaned_text'])

# Targets: the 0/1 label column, row-aligned with X.
y = data['label']

print(" Text converted to Vectors.")
print(f"Shape of Data: {X.shape}")

Cleaning text (this may take 20 seconds)...
Vectorizing text...
 Text converted to Vectors.
Shape of Data: (44898, 5000)


In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Hold out 20% of the rows for evaluation; the seed pins the partition for a
# given row order.
# NOTE(review): X comes from a TF-IDF vectorizer that was fitted on the whole
# corpus before this split, so the held-out rows contributed to the features
# and the accuracy printed below may be optimistic.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

print("Training Legal Scanner Model...")
# fit() returns the estimator itself, so nlp_model is the trained classifier.
nlp_model = LogisticRegression().fit(X_train, y_train)

# Score the held-out rows and summarize.
predictions = nlp_model.predict(X_test)
print(" Model Trained!")
print(f"Accuracy: {accuracy_score(y_test, predictions) * 100:.2f}%")
print("--- Classification Report ---")
print(classification_report(y_test, predictions))

Training Legal Scanner Model...
 Model Trained!
Accuracy: 98.73%
--- Classification Report ---
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      4289
           1       0.99      0.99      0.99      4691

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



In [4]:
def check_document_risk(text_input, high_risk_threshold=70, suspicious_threshold=40):
    """Score a raw text with the trained classifier and print a risk report.

    The text is cleaned with ``clean_text``, encoded with the fitted
    ``vectorizer`` and scored by ``nlp_model``. The risk score is the
    predicted probability of label 1, scaled to the range 0-100.

    Args:
        text_input: Raw document text. Non-string values are converted with
            ``str()`` so both the cleaning step and the excerpt work on them.
        high_risk_threshold: Scores strictly above this value are reported as
            HIGH RISK.
        suspicious_threshold: Scores strictly above this value (but not above
            ``high_risk_threshold``) are reported as SUSPICIOUS; everything
            else is reported as SAFE.

    Returns:
        None. The report is printed to stdout.
    """
    raw_text = str(text_input)
    cleaned = clean_text(raw_text)

    # The vectorizer expects an iterable of documents, hence the one-item list.
    vec_input = vectorizer.transform([cleaned])

    # predict_proba columns follow nlp_model.classes_ (labels 0 and 1 here),
    # so index 1 of the single row is the probability of label 1.
    risk_score = nlp_model.predict_proba(vec_input)[0][1] * 100

    # The emoji below replace mis-encoded byte sequences (UTF-8 read as cp1252).
    if risk_score > high_risk_threshold:
        status = "🔴 HIGH RISK (Likely Fabricated)"
    elif risk_score > suspicious_threshold:
        status = "🟡 SUSPICIOUS (Review Language)"
    else:
        status = "🟢 SAFE (Professional Tone)"

    print("-" * 50)
    print("📄 DOCUMENT ANALYSIS REPORT")
    print("-" * 50)
    print(f"Excerpt: {raw_text[:100]}...")
    print(f"RISK SCORE: {risk_score:.2f} / 100")
    print(f"STATUS:     {status}")
    print("-" * 50)


# Hand-written sample in a neutral, administrative register.
real_text = """
The department has officially released the budget allocation for the fiscal year 2024. 
All vendors are required to submit their invoices by the end of the quarter adhering to standard protocols.
"""

# Hand-written sample in a sensational, click-bait register.
fake_text = """
BREAKING: Secret government money is being stolen right now! You won't believe what they found. 
Click here to see the truth before they delete it. Urgent alert for all patriots!
"""

# Run both samples through the scanner, each under its own heading.
for heading, sample in (
    ("\n\n--- TESTING REAL TEXT ---", real_text),
    ("\n\n--- TESTING FAKE TEXT ---", fake_text),
):
    print(heading)
    check_document_risk(sample)



--- TESTING REAL TEXT ---
--------------------------------------------------
ðŸ“„ DOCUMENT ANALYSIS REPORT
--------------------------------------------------
Excerpt: 
The department has officially released the budget allocation for the fiscal year 2024. 
All vendors...
RISK SCORE: 85.97 / 100
STATUS:     ðŸ”´ HIGH RISK (Likely Fabricated)
--------------------------------------------------


--- TESTING FAKE TEXT ---
--------------------------------------------------
ðŸ“„ DOCUMENT ANALYSIS REPORT
--------------------------------------------------
Excerpt: 
BREAKING: Secret government money is being stolen right now! You won't believe what they found. 
Cl...
RISK SCORE: 99.65 / 100
STATUS:     ðŸ”´ HIGH RISK (Likely Fabricated)
--------------------------------------------------


In [5]:
# Sample document standing in for a file supplied by a user.
user_text_content = """
Subject: Urgent Payment Request
From: Director X
To: Finance Dept

Please transfer the remaining $50,000 immediately to the offshore account provided. 
Do not ask for further authorization as this is a black-ops project. 
If you delay, the consequences will be severe.
"""

# Simulate the upload by writing the sample to disk. The encoding is explicit
# so the bytes on disk do not depend on the platform's locale default.
with open('uploaded_contract.txt', 'w', encoding='utf-8') as f:
    f.write(user_text_content)

print(" User file 'uploaded_contract.txt' received.")

def generate_text_report(filename, encoding='utf-8'):
    """Read a text file and print its risk report.

    Args:
        filename: Path of the text file to analyze.
        encoding: Text encoding used to decode the file. Defaults to UTF-8,
            matching how the sample file above is written.

    Returns:
        None. The file content is passed to ``check_document_risk``, which
        prints the report.

    Raises:
        OSError: If the file cannot be opened (for example, it does not exist).
        UnicodeDecodeError: If the file content is not valid in ``encoding``.
    """
    print(f"Reading {filename}...")
    with open(filename, 'r', encoding=encoding) as f:
        content = f.read()

    check_document_risk(content)

print("\n---  FINAL TEXT ANALYSIS ---")
generate_text_report('uploaded_contract.txt')

 User file 'uploaded_contract.txt' received.

---  FINAL TEXT ANALYSIS ---
Reading uploaded_contract.txt...
--------------------------------------------------
ðŸ“„ DOCUMENT ANALYSIS REPORT
--------------------------------------------------
Excerpt: 
Subject: Urgent Payment Request
From: Director X
To: Finance Dept

Please transfer the remaining $5...
RISK SCORE: 93.99 / 100
STATUS:     ðŸ”´ HIGH RISK (Likely Fabricated)
--------------------------------------------------


In [6]:
import joblib

# Persist both fitted objects: scoring new text needs the classifier and the
# vectorizer that produced its input features.
for artifact, target_path in (
    (nlp_model, 'legal_nlp_model.pkl'),
    (vectorizer, 'text_vectorizer.pkl'),
):
    joblib.dump(artifact, target_path)

print(" Success!")
print("1. 'legal_nlp_model.pkl' saved.")
print("2. 'text_vectorizer.pkl' saved.")
print(" Download BOTH files from the Output folder.")

 Success!
1. 'legal_nlp_model.pkl' saved.
2. 'text_vectorizer.pkl' saved.
 Download BOTH files from the Output folder.
